In [1]:
import pandas as pd
import re
import string
import demoji
import contractions
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk import pos_tag, ne_chunk
from nltk.tree import Tree


In [2]:
# Load the tweet dataset from the CSV file that sits next to the notebook
DATA_PATH = "twitter_sentiment_data.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Step 1: Remove Duplicates
# Rows sharing the same 'message' text are collapsed to their first occurrence.
dedupe_columns = ['message']
df = df.drop_duplicates(subset=dedupe_columns, keep='first')

In [4]:

# Step 2: Handling Retweets
# Flag tweets whose text begins with 'RT'. The vectorised string accessor is
# used with na=False so that a missing message is flagged False instead of
# raising AttributeError (a float NaN has no .startswith method).
df['is_retweet'] = df['message'].str.startswith('RT', na=False)


In [5]:
# Step 3: Text Cleaning
def clean_text(text):
    """Normalise one raw tweet and extract the URLs it contains.

    Args:
        text: Raw tweet text. A non-string value (e.g. the float NaN pandas
            produces for an empty CSV cell) is treated as an empty tweet.

    Returns:
        tuple: ``(cleaned, links)``. ``links`` is the list of ``http...``
        URLs found in ``text``. ``cleaned`` is the lower-cased text in which
        each URL is replaced by the single token ``link_url``, emojis are
        replaced by their descriptions, and every other run of non-letter
        characters is collapsed to one space.
    """
    if not isinstance(text, str):
        return '', []

    url_pattern = r'http\S+'
    # Handle links separately
    links = re.findall(url_pattern, text)

    # Clean the text *between* URLs and re-insert the placeholder afterwards.
    # Substituting 'link_url' first would let the letters-only filter below
    # split it into the two words 'link' and 'url'.
    cleaned_segments = []
    for segment in re.split(url_pattern, text):
        # Replace emojis with their textual description
        segment = demoji.replace_with_desc(segment, sep=' ')
        # Remove special characters and numbers
        segment = re.sub('[^A-Za-z]+', ' ', segment)
        # Convert to lowercase
        cleaned_segments.append(segment.lower().strip())

    text = ' link_url '.join(cleaned_segments)
    # Collapse the padding left around placeholders next to empty segments.
    return ' '.join(text.split()), links

# Build both new columns in one DataFrame construction. `.apply(pd.Series)`
# would create a separate Series object for every row, which is far slower.
# Passing index=df.index keeps the rows aligned after drop_duplicates.
cleaned_pairs = df['message'].apply(clean_text)
df[['cleaned_message', 'links']] = pd.DataFrame(
    cleaned_pairs.tolist(),
    index=df.index,
    columns=['cleaned_message', 'links'],
)


In [6]:
# Step 4: Expand Contractions
# clean_text replaces every non-letter character (apostrophes included) with a
# space, so contractions have to be expanded on the raw tweet *before* it is
# cleaned. Running contractions.fix on 'cleaned_message' leaves "don t" and
# similar fragments untouched because the apostrophe is already gone.
def _expand_and_clean(raw_tweet):
    """Expand contractions in a raw tweet, then return its clean_text form."""
    if not isinstance(raw_tweet, str):
        # Missing message: nothing to expand.
        return ''
    return clean_text(contractions.fix(raw_tweet))[0]


df['expanded_message'] = df['message'].apply(_expand_and_clean)


In [7]:

# Step 5: Advanced Tokenization
# NOTE(review): the text tokenized here has already had '@' replaced by a space
# in clean_text, so strip_handles presumably has no handles left to match -
# confirm whether user names are meant to survive as ordinary tokens.
tokenizer = TweetTokenizer(
    preserve_case=False,
    strip_handles=True,
    reduce_len=True,
)
expanded_tweets = df['expanded_message']
df['tokenized_message'] = expanded_tweets.apply(lambda tweet: tokenizer.tokenize(tweet))


In [8]:
# Step 6: Remove Stopwords and Punctuation
stop_words = set(stopwords.words('english'))
punctuation = set(string.punctuation)


def _keep_token(token):
    """Return True when the token is neither an English stopword nor a punctuation character."""
    return not (token in stop_words or token in punctuation)


df['tokenized_message'] = df['tokenized_message'].apply(
    lambda tokens: [token for token in tokens if _keep_token(token)]
)


In [10]:
# Step 7: Part-of-Speech Tagging (NNP tags are relabelled 'NE' as a stand-in
# for Named Entity Recognition)
import nltk

# Model data required by nltk.pos_tag
TAGGER_RESOURCE = 'averaged_perceptron_tagger'
nltk.download(TAGGER_RESOURCE)

def preprocess_pos_ner(tokens):
    """POS-tag a token list and relabel proper nouns with the common tag 'NE'.

    This is a POS-based proxy for named entities, not real NER: no entity
    chunker is run. Both proper-noun tags are relabelled - 'NNP' (singular)
    and 'NNPS' (plural) - so plural proper nouns are no longer missed.

    NOTE(review): the tokens passed in by this notebook were lower-cased
    during cleaning; the tagger appears to assign NNP/NNPS rarely on such
    input, so few tokens may end up as 'NE' - confirm whether tagging should
    run on cased text instead.

    Args:
        tokens: List of token strings for one tweet.

    Returns:
        list[tuple[str, str]]: ``(token, tag)`` pairs in input order, with
        'NNP' and 'NNPS' replaced by 'NE' and every other tag unchanged.
    """
    proper_noun_tags = {'NNP', 'NNPS'}
    return [
        (token, 'NE' if tag in proper_noun_tags else tag)
        for token, tag in pos_tag(tokens)
    ]

# Tag every tweet's token list; each cell of 'pos_tags' holds (token, tag) pairs.
token_lists = df['tokenized_message']
df['pos_tags'] = token_lists.apply(lambda tokens: preprocess_pos_ner(tokens))

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nirad\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [12]:
# Preview the first five rows of the preprocessed dataframe
df.head(n=5)

Unnamed: 0,sentiment,message,tweetid,is_retweet,cleaned_message,links,expanded_message,tokenized_message,pos_tags
0,-1,@tiniebeany climate change is an interesting h...,792927353886371840,False,tiniebeany climate change is an interesting hu...,[],tiniebeany climate change is an interesting hu...,"[tiniebeany, climate, change, interesting, hus...","[(tiniebeany, JJ), (climate, NN), (change, NN)..."
1,1,RT @NatGeoChannel: Watch #BeforeTheFlood right...,793124211518832641,True,rt natgeochannel watch beforetheflood right he...,[https://t.co/LkDehj3tNn],rt natgeochannel watch beforetheflood right he...,"[rt, natgeochannel, watch, beforetheflood, rig...","[(rt, JJ), (natgeochannel, NNS), (watch, VBP),..."
2,1,Fabulous! Leonardo #DiCaprio's film on #climat...,793124402388832256,False,fabulous leonardo dicaprio s film on climate c...,[https://t.co/7rV6BrmxjW],fabulous leonardo dicaprio s film on climate c...,"[fabulous, leonardo, dicaprio, film, climate, ...","[(fabulous, JJ), (leonardo, NN), (dicaprio, NN..."
3,1,RT @Mick_Fanning: Just watched this amazing do...,793124635873275904,True,rt mick fanning just watched this amazing docu...,[https://t.co/kNSTE8K8im],rt mick fanning just watched this amazing docu...,"[rt, mick, fanning, watched, amazing, document...","[(rt, NN), (mick, NN), (fanning, VBG), (watche..."
4,2,"RT @cnalive: Pranita Biswasi, a Lutheran from ...",793125156185137153,True,rt cnalive pranita biswasi a lutheran from odi...,[],rt cnalive pranita biswasi a lutheran from odi...,"[rt, cnalive, pranita, biswasi, lutheran, odis...","[(rt, NN), (cnalive, JJ), (pranita, NN), (bisw..."
