In [1]:
import pandas as pd
import re
import string
import demoji
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk import pos_tag, ne_chunk
from nltk.tree import Tree


In [2]:
# Read the raw tweet CSV into the working frame `df`.
raw_csv_path = "twitter_sentiment_data.csv"
df = pd.read_csv(raw_csv_path)

In [3]:
# Normalise the header: strip spaces out of every column name and lower-case it.
normalised_columns = df.columns.str.replace(' ', '').str.lower()
df.columns = normalised_columns
df.head()

Unnamed: 0,sentiment,message,tweetid
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153


In [4]:
# Step 1: Remove Duplicates
# Rows sharing the same 'message' text collapse to their first occurrence.
df = df.drop_duplicates(subset='message', keep='first')

In [5]:

# Step 2: Handling Retweets
# A retweet starts with the standalone token "RT". The word boundary keeps
# messages that merely begin with those two letters (e.g. "RTs are ...")
# from being flagged.
df['is_retweet'] = df['message'].apply(lambda x: re.match(r'RT\b', x) is not None)
# Record every "RT @user:" marker found in the message (list, possibly empty).
df['retweet_username'] = df['message'].apply(lambda x: re.findall(r'RT @\w+:', x))
# Remove only the standalone "RT" token. A bare r'RT' pattern would also eat
# the letters out of upper-case words ("SMART" -> "SMA") and out of handles
# such as "@RT_com", which would then corrupt the 'username' column below.
df['message'] = df['message'].apply(lambda x: re.sub(r'\bRT\b', '', x))
# Collect every @handle mentioned in the message into its own column ...
df['username'] = df['message'].apply(lambda x: re.findall(r'@\w+', x))
# ... and then strip the handles from the message text itself.
df['message'] = df['message'].apply(lambda x: re.sub(r'@\w+', '', x))


In [6]:
# Preview the frame with the new is_retweet / retweet_username / username columns.
df.head()

Unnamed: 0,sentiment,message,tweetid,is_retweet,retweet_username,username
0,-1,climate change is an interesting hustle as it...,792927353886371840,False,[],[@tiniebeany]
1,1,": Watch #BeforeTheFlood right here, as trave...",793124211518832641,True,[RT @NatGeoChannel:],"[@NatGeoChannel, @LeoDiCaprio]"
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256,False,[],[@youtube]
3,1,: Just watched this amazing documentary by le...,793124635873275904,True,[RT @Mick_Fanning:],[@Mick_Fanning]
4,2,": Pranita Biswasi, a Lutheran from Odisha, gi...",793125156185137153,True,[RT @cnalive:],[@cnalive]


In [7]:
# Step 3: Text Cleaning
def clean_text(text):
    """Normalise one tweet and pull its links out.

    Processing order:
      1. every ``http...`` token is collected and then removed from the text;
      2. the text is passed through ``demoji.replace_with_desc`` with a space
         separator (emoji are substituted, not simply dropped);
      3. each run of characters other than A-Z / a-z becomes a single space;
      4. the result is lower-cased and stripped of surrounding whitespace.

    Args:
        text: the raw message string.

    Returns:
        A ``(cleaned_text, links)`` tuple, where ``links`` is the list of
        ``http...`` tokens found in the original text (empty if none).
    """
    url_pattern = r'http\S+'
    links = re.findall(url_pattern, text)
    without_links = re.sub(url_pattern, '', text)
    with_emoji_described = demoji.replace_with_desc(without_links, sep=' ')
    letters_only = re.sub('[^A-Za-z]+', ' ', with_emoji_described)
    return letters_only.lower().strip(), links

# clean_text returns a (cleaned_text, links) pair for each message. Build both
# columns in one step from the list of pairs; the previous .apply(pd.Series)
# constructed a separate Series object for every row, which is needlessly slow.
cleaned_pairs = df['message'].apply(clean_text)
df[['cleaned_message', 'links']] = pd.DataFrame(
    cleaned_pairs.tolist(),
    index=df.index,  # keep row alignment: df's index has gaps after drop_duplicates
    columns=['cleaned_message', 'links'],
)
df.head()


Unnamed: 0,sentiment,message,tweetid,is_retweet,retweet_username,username,cleaned_message,links
0,-1,climate change is an interesting hustle as it...,792927353886371840,False,[],[@tiniebeany],climate change is an interesting hustle as it ...,[]
1,1,": Watch #BeforeTheFlood right here, as trave...",793124211518832641,True,[RT @NatGeoChannel:],"[@NatGeoChannel, @LeoDiCaprio]",watch beforetheflood right here as travels the...,[https://t.co/LkDehj3tNn]
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256,False,[],[@youtube],fabulous leonardo dicaprio s film on climate c...,[https://t.co/7rV6BrmxjW]
3,1,: Just watched this amazing documentary by le...,793124635873275904,True,[RT @Mick_Fanning:],[@Mick_Fanning],just watched this amazing documentary by leona...,[https://t.co/kNSTE8K8im]
4,2,": Pranita Biswasi, a Lutheran from Odisha, gi...",793125156185137153,True,[RT @cnalive:],[@cnalive],pranita biswasi a lutheran from odisha gives t...,[]


In [8]:
# Step 4: Expand Contractions
# Expansion must see the apostrophes. 'cleaned_message' has already had every
# non-letter replaced by a space ("DiCaprio's" -> "dicaprio s"), so running
# contractions.fix() on it finds nothing to expand. Expand the message text
# first, then apply the same clean_text() normalisation and keep only its
# text part (index 0 of the returned tuple).
df['expanded_message'] = df['message'].apply(lambda x: clean_text(contractions.fix(x))[0])


In [9]:

# Step 5: Advanced Tokenization
# Tweet-aware tokenizer: lower-cases tokens, drops @handles and shortens
# exaggerated character repetitions.
tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)
df['tokenized_message'] = df['expanded_message'].map(tokenizer.tokenize)


In [10]:
# Step 6: Remove Stopwords and Punctuation
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)


def _drop_stopwords_and_punctuation(tokens):
    """Return the tokens that are neither English stopwords nor punctuation marks."""
    return [token for token in tokens if not (token in stop_words or token in punctuation)]


# Overwrites 'tokenized_message' with the filtered token lists.
df['tokenized_message'] = df['tokenized_message'].apply(_drop_stopwords_and_punctuation)


In [11]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
# The WordNet corpus is required by WordNetLemmatizer below.
nltk.download('wordnet')

# Stemming: reduce every token to its Porter stem.
stemmer = PorterStemmer()
df['stemmed_message'] = df['tokenized_message'].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)

# Lemmatization: map every token to its WordNet lemma (default part of speech).
lemmatizer = WordNetLemmatizer()
df['lemmatized_message'] = df['tokenized_message'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nirad\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# Step 7: Part-of-Speech Tagging
# NOTE: no named-entity chunker is run in this step; preprocess_pos_ner below
# only relabels a POS tag as 'NE'.
import nltk
# Download the perceptron tagger resource (presumably the model pos_tag loads).
nltk.download('averaged_perceptron_tagger')

def preprocess_pos_ner(tokens):
    """Tag tokens with parts of speech and mark proper nouns as 'NE'.

    This is a tag relabel, not real named-entity recognition: the tokens are
    run through ``pos_tag`` and any proper-noun tag is replaced by the common
    label ``'NE'``. Both the singular (``NNP``) and the plural (``NNPS``)
    proper-noun tags are covered; previously only ``NNP`` was relabelled.

    Args:
        tokens: list of token strings for one message.

    Returns:
        A list of ``(token, tag)`` tuples in the original token order.
    """
    proper_noun_tags = {'NNP', 'NNPS'}
    return [
        (word, 'NE' if tag in proper_noun_tags else tag)
        for word, tag in pos_tag(tokens)
    ]

# Tag the stopword-filtered tokens of every message.
df['pos_tags'] = df['tokenized_message'].map(preprocess_pos_ner)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nirad\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [13]:
# Display the first rows of the preprocessed dataframe, including the
# tokenized, stemmed, lemmatized and POS-tagged columns added above.
df.head()

Unnamed: 0,sentiment,message,tweetid,is_retweet,retweet_username,username,cleaned_message,links,expanded_message,tokenized_message,stemmed_message,lemmatized_message,pos_tags
0,-1,climate change is an interesting hustle as it...,792927353886371840,False,[],[@tiniebeany],climate change is an interesting hustle as it ...,[],climate change is an interesting hustle as it ...,"[climate, change, interesting, hustle, global,...","[climat, chang, interest, hustl, global, warm,...","[climate, change, interesting, hustle, global,...","[(climate, NN), (change, NN), (interesting, VB..."
1,1,": Watch #BeforeTheFlood right here, as trave...",793124211518832641,True,[RT @NatGeoChannel:],"[@NatGeoChannel, @LeoDiCaprio]",watch beforetheflood right here as travels the...,[https://t.co/LkDehj3tNn],watch beforetheflood right here as travels the...,"[watch, beforetheflood, right, travels, world,...","[watch, beforetheflood, right, travel, world, ...","[watch, beforetheflood, right, travel, world, ...","[(watch, NN), (beforetheflood, NN), (right, RB..."
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256,False,[],[@youtube],fabulous leonardo dicaprio s film on climate c...,[https://t.co/7rV6BrmxjW],fabulous leonardo dicaprio s film on climate c...,"[fabulous, leonardo, dicaprio, film, climate, ...","[fabul, leonardo, dicaprio, film, climat, chan...","[fabulous, leonardo, dicaprio, film, climate, ...","[(fabulous, JJ), (leonardo, NN), (dicaprio, NN..."
3,1,: Just watched this amazing documentary by le...,793124635873275904,True,[RT @Mick_Fanning:],[@Mick_Fanning],just watched this amazing documentary by leona...,[https://t.co/kNSTE8K8im],just watched this amazing documentary by leona...,"[watched, amazing, documentary, leonardodicapr...","[watch, amaz, documentari, leonardodicaprio, c...","[watched, amazing, documentary, leonardodicapr...","[(watched, VBN), (amazing, JJ), (documentary, ..."
4,2,": Pranita Biswasi, a Lutheran from Odisha, gi...",793125156185137153,True,[RT @cnalive:],[@cnalive],pranita biswasi a lutheran from odisha gives t...,[],pranita biswasi a lutheran from odisha gives t...,"[pranita, biswasi, lutheran, odisha, gives, te...","[pranita, biswasi, lutheran, odisha, give, tes...","[pranita, biswasi, lutheran, odisha, give, tes...","[(pranita, JJ), (biswasi, NN), (lutheran, NN),..."


In [14]:
# Persist the preprocessed frame without the index column.
output_csv_path = 'preprocessed_twitter_sentiment_data.csv'
df.to_csv(output_csv_path, index=False)