In [None]:
# Load Data
import pandas as pd

# Load raw tweets
tweets_df = pd.read_csv("../data/raw/twitter_training.csv", names=['sentiment', 'text'])
tweets_df.head(200)


Unnamed: 0,Unnamed: 1,sentiment,text
2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2401,Borderlands,Positive,im coming on borderlands and i will murder you...
2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
...,...,...,...
2433,Borderlands,Neutral,i enter that gunner seat and actually fear for...
2433,Borderlands,Neutral,i then enter in that gunner seat and i fear fo...
2433,Borderlands,Neutral,i enter that gunner seat and i fear for a life
2434,Borderlands,Negative,fuck it . pic.twitter.com/Wav1bacr5j


In [None]:
# Data Cleaning
import re
import emoji
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

# Set of English stopwords
stop_words = set(stopwords.words('english'))

# Function to clean tweets
def clean_tweet(text):
    if pd.isnull(text):
        return ""
    
    # Remove emojis
    text = emoji.replace_emoji(text, replace='')

    # Remove URLs
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)

    # Remove user @ references and hashtags
    text = re.sub(r'\@\w+|\#', '', text)

    # Remove punctuation and numbers
    text = re.sub(r"[^A-Za-z\s]", '', text)

    # Tokenize
    tokens = word_tokenize(text.lower())

    # Remove stopwords and non-alphabetic words
    cleaned_tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    # Join tokens back into one string
    cleaned_text = " ".join(cleaned_tokens)

    return cleaned_text


# Clean the tweets
tweets_df['cleaned_text'] = tweets_df['text'].apply(clean_tweet)

# View the cleaned tweets
print(tweets_df[['sentiment', 'cleaned_text']].head(200))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\moshi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\moshi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\moshi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


                 sentiment                          cleaned_text
2401 Borderlands  Positive         im getting borderlands murder
     Borderlands  Positive                   coming borders kill
     Borderlands  Positive           im getting borderlands kill
     Borderlands  Positive          im coming borderlands murder
     Borderlands  Positive         im getting borderlands murder
...                    ...                                   ...
2433 Borderlands   Neutral  enter gunner seat actually fear life
     Borderlands   Neutral           enter gunner seat fear life
     Borderlands   Neutral           enter gunner seat fear life
2434 Borderlands  Negative            fuck pictwittercomwavbacrj
     Borderlands  Negative           fuck pictwittercom wavbacrj

[200 rows x 2 columns]


In [7]:
# Handling Missing Values
tweets_df.dropna(subset=['cleaned_text', 'sentiment'], inplace=True)

In [8]:
# Save Processed Data
tweets_df.to_csv("../data/processed/cleaned_tweets.csv", index=False)
