<a href="https://colab.research.google.com/github/pikaduck/twt-sentiment-analysis/blob/main/twt_sentiment_analysis_eda.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import string
import pandas as pd
import nltk

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

import dask.dataframe as ddf
import emoji

# Fetch the NLTK resources this notebook relies on: 'punkt' for word_tokenize,
# 'stopwords' for the English stop-word list, and 'wordnet' for WordNetLemmatizer
# ('omw-1.4' is presumably a companion corpus for wordnet -- TODO confirm it is still needed).
nltk.download('punkt')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
! pip install emoji
! pip install dask

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# read train & test data and name the columns
# both files are read without a header row; later cells access the text as 'tweet' on both
# frames, so the same explicit column names are applied to train and test
COLUMN_NAMES = ['sentiment', 'idx', 'timestamp', 'device', 'username', 'tweet']

df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/training.1600000.processed.noemoticon.csv', encoding_errors = 'replace', header = None, names = COLUMN_NAMES)

df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/testdata.manual.2009.06.14.csv', encoding_errors = 'replace', header = None, names = COLUMN_NAMES)

In [None]:
# count tweets per username and rank the accounts by volume; used to look for any
# pattern/trend/cleaning cues

grouped_df = df_train.groupby('username')['tweet'].count()
grouped_df.sort_values(ascending = False)

username
lost_dog           549
webwoke            345
tweetpet           310
SallytheShizzle    281
VioletsCRUK        279
                  ... 
artgantuan           1
artfulife            1
artful_Roger         1
artfromscotland      1
zzzzeus111           1
Name: tweet, Length: 659775, dtype: int64

In [None]:
# the most active account posts the same tweet text repeatedly, varying only the
# twitter handle that is mentioned

df_lost_dog = df_train[df_train['username'].eq('lost_dog')]
df_lost_dog

Unnamed: 0,sentiment,idx,timestamp,device,username,tweet
43935,0,1676704158,Fri May 01 22:54:02 PDT 2009,NO_QUERY,lost_dog,@NyleW I am lost. Please help me find a good h...
45574,0,1677189389,Sat May 02 00:51:35 PDT 2009,NO_QUERY,lost_dog,@SallyD I am lost. Please help me find a good ...
46919,0,1677519173,Sat May 02 02:30:50 PDT 2009,NO_QUERY,lost_dog,@zuppaholic I am lost. Please help me find a g...
47949,0,1677752995,Sat May 02 03:47:51 PDT 2009,NO_QUERY,lost_dog,@LOSTPETUSA I am lost. Please help me find a g...
50572,0,1678544903,Sat May 02 07:02:28 PDT 2009,NO_QUERY,lost_dog,@JeanLevertHood I am lost. Please help me find...
...,...,...,...,...,...,...
792409,0,2326272045,Thu Jun 25 06:48:18 PDT 2009,NO_QUERY,lost_dog,@trooppetrie I am lost. Please help me find a ...
793314,0,2326588770,Thu Jun 25 07:14:42 PDT 2009,NO_QUERY,lost_dog,@Carly_FTS I am lost. Please help me find a go...
793610,0,2326689658,Thu Jun 25 07:22:51 PDT 2009,NO_QUERY,lost_dog,@inathlone I am lost. Please help me find a go...
798608,0,2328636087,Thu Jun 25 09:49:04 PDT 2009,NO_QUERY,lost_dog,@Kram I am lost. Please help me find a good ho...


In [None]:
# sanity check: (rows, columns) of the raw training frame
df_train.shape

(1600000, 6)

In [None]:
# peek at the first rows to confirm the column names line up with the data
df_train.head(5)

Unnamed: 0,sentiment,idx,timestamp,device,username,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
def clean_text(text):
  """Normalize a raw tweet ahead of punctuation/number/stop-word removal.

  Steps, in order: collapse runs of '.', '!' and '?'; collapse whitespace and
  lowercase; drop @mentions; squash elongated "no"/"nope" to "no"; expand
  common English contractions; strip URLs and bare domains; convert emoji to
  their textual aliases via `emoji.demojize`.

  Args:
    text: the raw tweet text (str).

  Returns:
    The cleaned, lowercased tweet text.
  """
  # collapse repeated sentence punctuation, keeping each mark as itself
  text = re.sub(r'\.+', '.', text)
  text = re.sub(r'!+', '!', text)
  text = re.sub(r'\?+', '?', text)

  # single spaces only, no leading/trailing whitespace, lowercase
  text = re.sub(r'\s+', ' ', text).strip().lower()

  # drop @mentions
  text = re.sub(r'@\w+', '', text).strip()

  # map elongated "no" (e.g. "nooo") to "no"; the word boundaries keep words that merely
  # start with "no" (e.g. "noon", "noodle") intact
  text = re.sub(r'\bn+o+\b', 'no', text)

  # expand contractions; the negation rule keeps the stem's trailing "n"
  # (e.g. "can't" -> "can not", "haven't" -> "haven not")
  text = re.sub(r"n't", 'n not', text)
  text = re.sub(r"'nt", 'n not', text)
  text = re.sub(r"'re", ' are', text)
  text = re.sub(r"'s", ' is', text)
  text = re.sub(r"'d", ' would', text)
  text = re.sub(r"'ll", ' will', text)
  text = re.sub(r"'ve", ' have', text)
  text = re.sub(r"'m", ' am', text)

  # map variations of nope to no
  text = re.sub(r'\bn+o+p+e+\b', 'no', text)

  # clean websites mentioned in text: scheme-prefixed URLs, then www.* hosts, then bare *.com
  # domains; the dots are escaped so ordinary words ("welcome", "awwww") are left alone
  text = re.sub(r'(?:https?)?://[\w./?=&%~]*\b', '', text).strip()
  text = re.sub(r'\bwww\.[\w./?=&%]*\b', '', text).strip()
  text = re.sub(r'\b\w+\.com\b', '', text).strip()

  # replace emoji characters with their textual aliases
  return emoji.demojize(text)

# row count before cleaning, for comparison with the count printed after the dedup below
print(df_train.shape)
df_train['cleaned_tweet'] = df_train['tweet'].apply(clean_text)
# dedup considering username & cleaned tweet + sentiment: drops repeats of the same
# cleaned text posted by the same user with the same label
df_train = df_train.drop_duplicates(subset = ['username', 'cleaned_tweet', 'sentiment'])
print(df_train.shape)

# the test tweets get the same cleaning but are not deduplicated
df_test['cleaned_tweet'] = df_test['tweet'].apply(clean_text)

(1523316, 10)
(1523316, 10)


In [None]:
# inspect raw vs. cleaned tweets side by side
df_train

Unnamed: 0,sentiment,idx,timestamp,device,username,tweet,cleaned_tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that is a bummer. you shoulda got davi..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can not update his facebook b...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to sa...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","no, it is not behaving at all. i am mad. why a..."
...,...,...,...,...,...,...,...
1599995,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...,just woke up. having no school is the best fee...
1599996,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...,- very cool to hear old walt interviews! ♫
1599997,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...,are you ready for your mojo makeover! ask me f...
1599998,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...,happy 38th birthday to my boo of alll time! tu...


In [None]:
# dedup on cleaned tweet + sentiment only: the username is ignored as a feature for now and
# the sentiment is determined from the text alone
# note: the username could still be a useful feature, since it can capture the general tone
# of the tweets posted by a given account

df_train = df_train.drop_duplicates(subset=['cleaned_tweet', 'sentiment'])

df_train.shape

(1541139, 7)

In [None]:
def remove_punctuation(text):
  """Replace every ASCII punctuation character with a space, then squeeze whitespace.

  Runs of whitespace collapse to a single space and leading/trailing whitespace is removed.
  """
  spaced = text.translate({ord(mark): ' ' for mark in string.punctuation})
  collapsed = re.sub(r'\s+', ' ', spaced)
  return collapsed.strip()

def remove_numbers(text):
  """Delete every ASCII digit from `text`; all other characters are left untouched."""
  pieces = re.split(r'[0-9]+', text)
  return ''.join(pieces)

# shape before adding the derived columns
print(df_train.shape)
# digits are stripped first ...
df_train['numbers_cleaned_tweet'] = df_train['cleaned_tweet'].apply(remove_numbers)
print(df_train.shape)

# ... then punctuation; tweets that become identical after both steps (with the same
# sentiment) are dropped as duplicates
df_train['punc_cleaned_tweet'] = df_train['numbers_cleaned_tweet'].apply(remove_punctuation)
df_train = df_train.drop_duplicates(subset = ['punc_cleaned_tweet', 'sentiment'])
print(df_train.shape)

# the test tweets get the same two cleaning steps, without deduplication
df_test['numbers_cleaned_tweet'] = df_test['cleaned_tweet'].apply(remove_numbers)
df_test['punc_cleaned_tweet'] = df_test['numbers_cleaned_tweet'].apply(remove_punctuation)

(1523316, 10)
(1523316, 10)
(1523316, 10)


In [None]:
# tweets per username again, now on the cleaned & deduplicated frame
grouped_df = df_train.groupby('username')['punc_cleaned_tweet'].count()
grouped_df.sort_values(ascending = False)

username
SallytheShizzle    275
VioletsCRUK        274
mcraddictal        256
tsarnick           246
keza34             217
                  ... 
MadisonKeesler       1
frantescarolli       1
frantattack          1
MadisonLindley       1
zzzzeus111           1
Name: punc_cleaned_tweet, Length: 641490, dtype: int64

In [None]:
# build the stop-word set used to clean tweets, while retaining negative words

# negatives that are included in the nltk stop-word list and must NOT be treated as stop-words
negatives = ['no','nor','not','ain','aren',"aren't",'couldn',"couldn't",'didn',"didn't",'doesn',"doesn't",'hadn',"hadn't",'hasn',
  "hasn't",'haven',"haven't",'isn',"isn't",'mightn',"mightn't",'mustn',"mustn't",'needn',"needn't",'shan',"shan't",'shouldn',"shouldn't",
  'wasn',"wasn't",'weren',"weren't","won't",'wouldn',"wouldn't",'don',"don't"]

# a set difference removes the negatives; keeping the result as a set gives O(1) look-ups
# as opposed to O(N) for a list
stops = set(stopwords.words('english')) - set(negatives)

In [None]:
# shared WordNet lemmatizer instance, reused by the stop-word removal step further down
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Return `lemmatizer.lemmatize(text)`, i.e. the whole input string is passed as one token.

    NOTE(review): no call to this helper is visible in the notebook -- confirm it is still needed.
    """
    return lemmatizer.lemmatize(text)

In [None]:
# load the pandas df into a dask df so that stop-word removal can run fast and in parallel
# over the huge dataframe

import dask.dataframe as ddf

# typically partitions = no. of cpu cores, but they are fixed at 10 here
ddf_train = ddf.from_pandas(df_train, npartitions = 10)

In [None]:
# remove stopwords

def remove_stopwords(text):
  """Tokenize `text`, drop tokens found in `stops`, lemmatize the rest and re-join with spaces."""
  kept = []
  for raw_token in word_tokenize(text):
    token = raw_token.strip()
    if token in stops:
      continue
    kept.append(lemmatizer.lemmatize(token))
  return ' '.join(kept)

def clean(df):
  """Return a copy of `df` with a 'lemmatized_stop_removed_tweet' column added.

  The new column is `remove_stopwords` applied to every value of 'punc_cleaned_tweet'.
  The incoming frame (a dask partition when used with `map_partitions`) is left
  unmodified, which avoids pandas' SettingWithCopyWarning.

  Args:
    df: pandas DataFrame holding a 'punc_cleaned_tweet' column.

  Returns:
    A new DataFrame with the extra 'lemmatized_stop_removed_tweet' column.
  """
  return df.assign(lemmatized_stop_removed_tweet = df['punc_cleaned_tweet'].map(remove_stopwords))

# re-wrap the stop-word collection as a set (a no-op if it already is one)
stops = set(stops)
# add an empty placeholder column so that df_train, passed as `meta` below, already carries
# the column that clean() produces -- presumably so dask knows the output schema; TODO confirm
df_train['lemmatized_stop_removed_tweet'] = [''] * len(df_train)
result = ddf_train.map_partitions(clean, meta = df_train)
# compute() runs the partitions and rebinds df_train to the computed result
df_train = result.compute()

# the test tweets are processed with a plain pandas apply
df_test['lemmatized_stop_removed_tweet'] = df_test['punc_cleaned_tweet'].apply(remove_stopwords)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lemmatized_stop_removed_tweet'] = df['punc_cleaned_tweet'].map(remove_stopwords)


In [None]:
# confirm that the computed result is a plain pandas DataFrame
type(df_train)

pandas.core.frame.DataFrame

In [None]:
# inspect every cleaning stage of the training tweets side by side
df_train

Unnamed: 0,sentiment,idx,timestamp,device,username,tweet,cleaned_tweet,numbers_cleaned_tweet,punc_cleaned_tweet,lemmatized_stop_removed_tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that is a bummer. you shoulda got davi...","- awww, that is a bummer. you shoulda got davi...",awww that is a bummer you shoulda got david ca...,awww bummer shoulda got david carr third day
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can not update his facebook b...,is upset that he can not update his facebook b...,is upset that he can not update his facebook b...,upset not update facebook texting might cry re...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to sa...,i dived many times for the ball. managed to sa...,i dived many times for the ball managed to sav...,dived many time ball managed save rest go bound
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,whole body feel itchy like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....","no, it is not behaving at all. i am mad. why a...","no, it is not behaving at all. i am mad. why a...",no it is not behaving at all i am mad why am i...,no not behaving mad not see
...,...,...,...,...,...,...,...,...,...,...
1523311,4,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...,just woke up. having no school is the best fee...,just woke up. having no school is the best fee...,just woke up having no school is the best feel...,woke no school best feeling ever
1523312,4,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...,- very cool to hear old walt interviews! ♫,- very cool to hear old walt interviews! ♫,very cool to hear old walt interviews ♫,cool hear old walt interview ♫
1523313,4,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...,are you ready for your mojo makeover! ask me f...,are you ready for your mojo makeover! ask me f...,are you ready for your mojo makeover ask me fo...,ready mojo makeover ask detail
1523314,4,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...,happy 38th birthday to my boo of alll time! tu...,happy th birthday to my boo of alll time! tupa...,happy th birthday to my boo of alll time tupac...,happy th birthday boo alll time tupac amaru sh...


In [None]:
# same inspection for the test tweets
df_test

Unnamed: 0,sentiment,idx,timestamp,device,username,tweet,cleaned_tweet,numbers_cleaned_tweet,punc_cleaned_tweet,lemmatized_stop_removed_tweet
0,4,3,Mon May 11 03:17:40 UTC 2009,kindle2,tpryan,@stellargirl I loooooooovvvvvveee my Kindle2. ...,i loooooooovvvvvveee my kindle2. not that the ...,i loooooooovvvvvveee my kindle. not that the d...,i loooooooovvvvvveee my kindle not that the dx...,loooooooovvvvvveee kindle not dx cool fantasti...
1,4,4,Mon May 11 03:18:03 UTC 2009,kindle2,vcu451,Reading my kindle2... Love it... Lee childs i...,reading my kindle2. love it. lee childs is goo...,reading my kindle. love it. lee childs is good...,reading my kindle love it lee childs is good read,reading kindle love lee child good read
2,4,5,Mon May 11 03:18:54 UTC 2009,kindle2,chadfu,"Ok, first assesment of the #kindle2 ...it fuck...","ok, first assesment of the #kindle2 .it fuckin...","ok, first assesment of the #kindle .it fucking...",ok first assesment of the kindle it fucking rocks,ok first assesment kindle fucking rock
3,4,6,Mon May 11 03:19:04 UTC 2009,kindle2,SIX15,@kenburbary You'll love your Kindle2. I've had...,you will love your kindle2. i have had mine fo...,you will love your kindle. i have had mine for...,you will love your kindle i have had mine for ...,love kindle mine month never looked back new b...
4,4,7,Mon May 11 03:21:41 UTC 2009,kindle2,yamarama,@mikefish Fair enough. But i have the Kindle2...,fair enough. but i have the kindle2 and i thin...,fair enough. but i have the kindle and i think...,fair enough but i have the kindle and i think ...,fair enough kindle think perfect
...,...,...,...,...,...,...,...,...,...,...
493,2,14072,Sun Jun 14 04:31:43 UTC 2009,latex,proggit,Ask Programming: LaTeX or InDesign?: submitted...,ask programming: latex or indesign!: submitted...,ask programming: latex or indesign!: submitted...,ask programming latex or indesign submitted by...,ask programming latex indesign submitted calci...
494,0,14073,Sun Jun 14 04:32:17 UTC 2009,latex,sam33r,"On that note, I hate Word. I hate Pages. I hat...","on that note, i hate word. i hate pages. i hat...","on that note, i hate word. i hate pages. i hat...",on that note i hate word i hate pages i hate l...,note hate word hate page hate latex said hate ...
495,4,14074,Sun Jun 14 04:36:34 UTC 2009,latex,iamtheonlyjosie,Ahhh... back in a *real* text editing environm...,ahhh. back in a *real* text editing environmen...,ahhh. back in a *real* text editing environmen...,ahhh back in a real text editing environment i...,ahhh back real text editing environment lt latex
496,0,14075,Sun Jun 14 21:36:07 UTC 2009,iran,plutopup7,"Trouble in Iran, I see. Hmm. Iran. Iran so far...","trouble in iran, i see. hmm. iran. iran so far...","trouble in iran, i see. hmm. iran. iran so far...",trouble in iran i see hmm iran iran so far awa...,trouble iran see hmm iran iran far away flocko...


In [None]:
# save the processed frames (train is the pandas df returned after computing the dask result)
# NOTE: a later cell treats tweets that were cleaned down to empty strings as missing values
# once these CSVs are read back, and drops them with dropna
df_train.to_csv('/content/drive/MyDrive/Colab Notebooks/train.csv', index = False)
df_test.to_csv('/content/drive/MyDrive/Colab Notebooks/test.csv', index = False)

In [None]:
# re-entry point: reload the processed frames saved above, so the cells below can run
# without repeating the cleaning steps
import pandas as pd

df_train = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/train.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/test.csv')

In [None]:
# get the character length of each fully cleaned tweet, to make sure tweets of suitable
# length are selected for training & validation

df_train['tweet_len'] = df_train['lemmatized_stop_removed_tweet'].str.len()

In [None]:
# count the unique twitter users in the dataset

df_train['username'].nunique()

641490

In [None]:
# get the mean, median and mode of the cleaned tweet lengths in the training data
# (mode() returns a Series because several values can tie)

df_train['tweet_len'].mean(), df_train['tweet_len'].median(), df_train['tweet_len'].mode()

(43.37706460538497, 40.0, 0    28.0
 dtype: float64)

In [None]:
# the test set contains label 2, a class that is absent from the training data, so those
# rows are moved into a separate df

df_test_2 = df_test[df_test['sentiment'].eq(2)]
df_test = df_test[df_test['sentiment'].ne(2)]

In [None]:
# one dataframe per label, so that training samples can be chosen in accordance with each
# label's own mean/median/mode tweet length

df_train0 = df_train[df_train['sentiment'].eq(0)]
df_train4 = df_train[df_train['sentiment'].eq(4)]

len0 = df_train0['tweet_len']
print(len0.mean(), len0.median(), len0.mode())
len4 = df_train4['tweet_len']
print(len4.mean(), len4.median(), len4.mode())

44.40802021046035 41.0 0    29.0
dtype: float64
42.332499957008366 39.0 0    24.0
dtype: float64


In [None]:
# per label, split the tweets into strictly-shorter-than and strictly-longer-than the
# label's median length, so that equal samples can be picked from both distributions

df_train0_below_median = df_train0[df_train0['tweet_len'].lt(df_train0['tweet_len'].median())]
df_train0_greater_median = df_train0[df_train0['tweet_len'].gt(df_train0['tweet_len'].median())]

df_train4_below_median = df_train4[df_train4['tweet_len'].lt(df_train4['tweet_len'].median())]
df_train4_greater_median = df_train4[df_train4['tweet_len'].gt(df_train4['tweet_len'].median())]

tuple(bucket.shape for bucket in (df_train0_below_median, df_train0_greater_median, df_train4_below_median, df_train4_greater_median))

((375812, 11), (378915, 11), (375423, 11), (369510, 11))

In [None]:
# dedup with username to make sure that no single user's tweeting style gets more
# representation in the subsetted dataset
# each bucket is deduplicated from itself, so the below-median frames stay restricted to
# below-median tweets

df_train0_below_median = df_train0_below_median.drop_duplicates(subset = ['username'])
df_train0_greater_median = df_train0_greater_median.drop_duplicates(subset = ['username'])

df_train4_below_median = df_train4_below_median.drop_duplicates(subset = ['username'])
df_train4_greater_median = df_train4_greater_median.drop_duplicates(subset = ['username'])

df_train0_below_median.shape, df_train0_greater_median.shape, df_train4_below_median.shape, df_train4_greater_median.shape

((403952, 11), (233376, 11), (365120, 11), (214327, 11))

In [None]:
# select 500 samples from each bucket & concat them into a single df
# the shuffle is seeded so that the selected subset is the same on every run
RANDOM_STATE = 42
SAMPLES_PER_BUCKET = 500

df_train0_below_median = df_train0_below_median.sample(frac = 1, random_state = RANDOM_STATE).iloc[:SAMPLES_PER_BUCKET]
df_train0_greater_median = df_train0_greater_median.sample(frac = 1, random_state = RANDOM_STATE).iloc[:SAMPLES_PER_BUCKET]

df_train4_below_median = df_train4_below_median.sample(frac = 1, random_state = RANDOM_STATE).iloc[:SAMPLES_PER_BUCKET]
df_train4_greater_median = df_train4_greater_median.sample(frac = 1, random_state = RANDOM_STATE).iloc[:SAMPLES_PER_BUCKET]

df_train = pd.concat([df_train0_below_median, df_train0_greater_median, df_train4_below_median, df_train4_greater_median])
df_train

Unnamed: 0,sentiment,idx,timestamp,device,username,tweet,cleaned_tweet,numbers_cleaned_tweet,punc_cleaned_tweet,lemmatized_stop_removed_tweet,tweet_len
706972,0,2265223671,Sun Jun 21 06:05:13 PDT 2009,NO_QUERY,candiceiona,I haven't been this sick in a long time,i haven not been this sick in a long time,i haven not been this sick in a long time,i haven not been this sick in a long time,haven not sick long time,24.0
707345,0,2265307947,Sun Jun 21 06:17:09 PDT 2009,NO_QUERY,amyndowdall,really miss acting !!,really miss acting !,really miss acting !,really miss acting,really miss acting,18.0
555389,0,2212891198,Wed Jun 17 14:40:05 PDT 2009,NO_QUERY,emflowers,@bigmcc the spare set of keys did look guilty ...,the spare set of keys did look guilty when i t...,the spare set of keys did look guilty when i t...,the spare set of keys did look guilty when i t...,spare set key look guilty took box knew helping,47.0
592122,0,2226741895,Thu Jun 18 12:19:43 PDT 2009,NO_QUERY,bamarkey,my meeting just got bumped back to 6... that m...,my meeting just got bumped back to 6. that may...,my meeting just got bumped back to . that may ...,my meeting just got bumped back to that may pu...,meeting got bumped back may put glitch plan to...,62.0
428351,0,2068040137,Sun Jun 07 13:23:42 PDT 2009,NO_QUERY,btb_ndp3,So bored and sleepy. On my way to houston gonn...,so bored and sleepy. on my way to houston gonn...,so bored and sleepy. on my way to houston gonn...,so bored and sleepy on my way to houston gonna...,bored sleepy way houston gon na miss,36.0
...,...,...,...,...,...,...,...,...,...,...,...
1169169,4,1990349885,Mon Jun 01 04:37:30 PDT 2009,NO_QUERY,jawsee,"At some point, today will be over &amp; I won'...","at some point, today will be over &amp; i won ...","at some point, today will be over &amp; i won ...",at some point today will be over amp i won not...,point today amp not stress algebra anymore,42.0
1516267,4,2191753493,Tue Jun 16 05:54:29 PDT 2009,NO_QUERY,Stephaniecasa,"@jonrobert haha, , I think I have already tho ...","haha, , i think i have already tho . i swaer i...","haha, , i think i have already tho . i swaer i...",haha i think i have already tho i swaer i have...,haha think already tho swaer herd song radio,44.0
916850,4,1825262740,Sun May 17 05:01:45 PDT 2009,NO_QUERY,breza_juanita,"@gryffindorap Orals are crappy, but my modus o...","orals are crappy, but my modus operandi is 'va...","orals are crappy, but my modus operandi is 'va...",orals are crappy but my modus operandi is vagu...,oral crappy modus operandi vaguely know subjec...,69.0
1050194,4,1970403733,Sat May 30 03:43:56 PDT 2009,NO_QUERY,sarahmaree,@motherfuckinwar well if I ever get mine toget...,well if i ever get mine together you would be ...,well if i ever get mine together you would be ...,well if i ever get mine together you would be ...,well ever get mine together would e manage,42.0


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay

In [None]:
# some tweets apparently held only stop-words and were reduced to strings of len=0 while
# cleaning; they show up as missing values here and need to be removed
df_train = df_train.dropna(subset = ['lemmatized_stop_removed_tweet'])
df_test = df_test.dropna(subset = ['lemmatized_stop_removed_tweet'])

count_vectorizer = CountVectorizer()
tfidf = TfidfTransformer(use_idf = True, norm = 'l2', smooth_idf = True)

# fit the vocabulary and the idf weights on the training tweets
X_train = tfidf.fit_transform(
  count_vectorizer.fit_transform(df_train['lemmatized_stop_removed_tweet']).toarray()
).toarray()

# reuse the fitted vocabulary and idf weights for the test tweets
X_test = tfidf.transform(
  count_vectorizer.transform(df_test['lemmatized_stop_removed_tweet']).toarray()
).toarray()

# it is a binary classification problem since 2 classes exist, so label 4 is mapped to 1
y_train = df_train['sentiment'].replace({4: 1}).to_numpy()
y_test = df_test['sentiment'].replace({4: 1}).to_numpy()

In [None]:
# sanity check: train and test must share the same number of feature columns
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((1999, 5238), (1999,), (359, 5238), (359,))

In [None]:
import os
import json

def train(model, X_train, y_train):
  """Fit `model` on the training features/labels and return whatever `model.fit` returns."""
  fitted = model.fit(X_train, y_train)
  return fitted

def save_model(model, dirpath, model_name):
  """Pickle `model` to `<dirpath>/<model_name>.sav`.

  Args:
    model: the object to serialize (a fitted estimator in this notebook).
    dirpath: directory to write into; created if it does not exist yet.
    model_name: file name without extension; '.sav' is appended.
  """
  # imported locally: no visible cell of the notebook imports pickle at module level
  import pickle

  # create the target directory on first use instead of failing with FileNotFoundError
  os.makedirs(dirpath, exist_ok = True)
  # the with-block closes the file, so no explicit close() is needed
  with open(os.path.join(dirpath, model_name + '.sav'), 'wb') as f:
    pickle.dump(model, f)

def make_predictions(model, X_test, y_test, dirpath, model_name):
  """Evaluate `model` on the test data, print the metrics and save the report as JSON.

  Prints the confusion matrix, the classification report (as a dict) and the accuracy,
  then writes the classification report to `<dirpath>/<model_name>.report`.

  Args:
    model: fitted estimator exposing `predict`.
    X_test: test features passed to `model.predict`.
    y_test: true test labels.
    dirpath: directory for the report file; created if it does not exist yet.
    model_name: report file name without extension; '.report' is appended.
  """
  predictions = model.predict(X_test)

  cm = confusion_matrix(y_test, predictions)
  clf_report = classification_report(y_test, predictions, output_dict = True)
  acc = accuracy_score(y_test, predictions)

  print(cm)
  print(clf_report)
  print(acc)

  # create the target directory on first use instead of failing with FileNotFoundError
  os.makedirs(dirpath, exist_ok = True)
  # the with-block closes the file, so no explicit close() is needed
  with open(os.path.join(dirpath, model_name + '.report'), 'w') as f:
    json.dump(clf_report, f, indent = 4)

In [None]:
# directory that receives the pickled models (*.sav) and their classification reports (*.report)
dirpath = '/content/drive/MyDrive/Colab Notebooks/models'

In [None]:
# train logistic regression model, save it, then print & store its evaluation report

from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model = train(model, X_train, y_train)
save_model(model, dirpath, 'logistic_regression')
make_predictions(model, X_test, y_test, dirpath, 'logistic_regression')

[[ 99  78]
 [ 37 145]]
{'0': {'precision': 0.7279411764705882, 'recall': 0.559322033898305, 'f1-score': 0.6325878594249201, 'support': 177}, '1': {'precision': 0.6502242152466368, 'recall': 0.7967032967032966, 'f1-score': 0.7160493827160495, 'support': 182}, 'accuracy': 0.6796657381615598, 'macro avg': {'precision': 0.6890826958586125, 'recall': 0.6780126653008008, 'f1-score': 0.6743186210704848, 'support': 359}, 'weighted avg': {'precision': 0.6885414913932646, 'recall': 0.6796657381615598, 'f1-score': 0.6748998294499494, 'support': 359}}
0.6796657381615598


In [None]:
# list the test tweets that the most recently trained model gets wrong
# the predictions are computed here because make_predictions keeps them local
predictions = model.predict(X_test)

wrongs = []
for row_pos, (predicted, actual) in enumerate(zip(predictions, y_test)):
  if predicted != actual:
    tweet = df_test.iloc[row_pos]['tweet']
    # keep (tweet, predicted label, true label) for later inspection
    wrongs.append((tweet, predicted, actual))
    print(tweet, predicted, actual)

In [None]:
# train decision tree
from sklearn.tree import DecisionTreeClassifier

# fixed seed so that the fitted tree and its saved report are reproducible across runs
model = DecisionTreeClassifier(random_state = 42)
model = train(model, X_train, y_train)
save_model(model, dirpath, 'decision_tree')
make_predictions(model, X_test, y_test, dirpath, 'decision_tree')

[[100  77]
 [ 58 124]]
{'0': {'precision': 0.6329113924050633, 'recall': 0.5649717514124294, 'f1-score': 0.5970149253731344, 'support': 177}, '1': {'precision': 0.6169154228855721, 'recall': 0.6813186813186813, 'f1-score': 0.6475195822454307, 'support': 182}, 'accuracy': 0.6239554317548747, 'macro avg': {'precision': 0.6249134076453178, 'recall': 0.6231452163655553, 'f1-score': 0.6222672538092826, 'support': 359}, 'weighted avg': {'precision': 0.6248020150999173, 'recall': 0.6239554317548747, 'f1-score': 0.6226189575479476, 'support': 359}}
0.6239554317548747


In [None]:
# train random forest

from sklearn.ensemble import RandomForestClassifier

# fixed seed so that the fitted forest and its saved report are reproducible across runs
model = RandomForestClassifier(random_state = 42)

model = train(model, X_train, y_train)
save_model(model, dirpath, 'random_forest')
make_predictions(model, X_test, y_test, dirpath, 'random_forest')

[[ 90  87]
 [ 33 149]]
{'0': {'precision': 0.7317073170731707, 'recall': 0.5084745762711864, 'f1-score': 0.6, 'support': 177}, '1': {'precision': 0.6313559322033898, 'recall': 0.8186813186813187, 'f1-score': 0.7129186602870812, 'support': 182}, 'accuracy': 0.6657381615598886, 'macro avg': {'precision': 0.6815316246382803, 'recall': 0.6635779474762525, 'f1-score': 0.6564593301435406, 'support': 359}, 'weighted avg': {'precision': 0.6808327988383513, 'recall': 0.6657381615598886, 'f1-score': 0.6572456717889938, 'support': 359}}
0.6657381615598886


In [None]:
# train bernoulli nb
# note: the inputs are tf-idf weights, not discrete values; BernoulliNB binarizes each feature
# (threshold `binarize`, 0.0 by default), so it effectively models term presence/absence

from sklearn.naive_bayes import BernoulliNB

model = BernoulliNB()

model = train(model, X_train, y_train)
save_model(model, dirpath, 'b_nb')
make_predictions(model, X_test, y_test, dirpath, 'b_nb')

[[108  69]
 [ 32 150]]
{'0': {'precision': 0.7714285714285715, 'recall': 0.6101694915254238, 'f1-score': 0.6813880126182965, 'support': 177}, '1': {'precision': 0.684931506849315, 'recall': 0.8241758241758241, 'f1-score': 0.7481296758104738, 'support': 182}, 'accuracy': 0.7186629526462396, 'macro avg': {'precision': 0.7281800391389432, 'recall': 0.7171726578506239, 'f1-score': 0.7147588442143852, 'support': 359}, 'weighted avg': {'precision': 0.7275776918925696, 'recall': 0.7186629526462396, 'f1-score': 0.7152236190277012, 'support': 359}}
0.7186629526462396


In [None]:
# try multinomial naive bayes on the same tf-idf features

from sklearn.naive_bayes import MultinomialNB

model = train(MultinomialNB(), X_train, y_train)
save_model(model, dirpath, 'm_nb')
make_predictions(model, X_test, y_test, dirpath, 'm_nb')

[[138  39]
 [ 56 126]]
{'0': {'precision': 0.711340206185567, 'recall': 0.7796610169491526, 'f1-score': 0.7439353099730459, 'support': 177}, '1': {'precision': 0.7636363636363637, 'recall': 0.6923076923076923, 'f1-score': 0.7262247838616716, 'support': 182}, 'accuracy': 0.7353760445682451, 'macro avg': {'precision': 0.7374882849109654, 'recall': 0.7359843546284224, 'f1-score': 0.7350800469173587, 'support': 359}, 'weighted avg': {'precision': 0.7378524642804, 'recall': 0.7353760445682451, 'f1-score': 0.7349567145628226, 'support': 359}}
0.7353760445682451


In [None]:
# train a gradient-boosted tree classifier (xgboost) with a fixed random seed
import xgboost as xgb

model = xgb.XGBClassifier(objective = 'binary:logistic', random_state = 42)

model = train(model, X_train, y_train)
save_model(model, dirpath, 'xgb_clf')
make_predictions(model, X_test, y_test, dirpath, 'xgb_clf')

[[ 67 110]
 [ 27 155]]
{'0': {'precision': 0.7127659574468085, 'recall': 0.3785310734463277, 'f1-score': 0.49446494464944646, 'support': 177}, '1': {'precision': 0.5849056603773585, 'recall': 0.8516483516483516, 'f1-score': 0.6935123042505593, 'support': 182}, 'accuracy': 0.6183844011142061, 'macro avg': {'precision': 0.6488358089120835, 'recall': 0.6150897125473397, 'f1-score': 0.5939886244500029, 'support': 359}, 'weighted avg': {'precision': 0.6479454168712099, 'recall': 0.6183844011142061, 'f1-score': 0.5953747481241053, 'support': 359}}
0.6183844011142061


In [None]:
# train adaboost
from sklearn.ensemble import AdaBoostClassifier

# fixed seed so that the fitted ensemble and its saved report are reproducible across runs
model = AdaBoostClassifier(random_state = 42)

model = train(model, X_train, y_train)
save_model(model, dirpath, 'adaboost_clf')
make_predictions(model, X_test, y_test, dirpath, 'adaboost_clf')

[[ 59 118]
 [ 30 152]]
{'0': {'precision': 0.6629213483146067, 'recall': 0.3333333333333333, 'f1-score': 0.44360902255639095, 'support': 177}, '1': {'precision': 0.562962962962963, 'recall': 0.8351648351648352, 'f1-score': 0.672566371681416, 'support': 182}, 'accuracy': 0.5877437325905293, 'macro avg': {'precision': 0.6129421556387848, 'recall': 0.5842490842490843, 'f1-score': 0.5580876971189035, 'support': 359}, 'weighted avg': {'precision': 0.6122460666043027, 'recall': 0.5877437325905293, 'f1-score': 0.5596821076281306, 'support': 359}}
0.5877437325905293
