In [153]:
import pandas as pd
import numpy as np
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string
import re
from sklearn.preprocessing import MinMaxScaler

In [154]:
stop_words = set(stopwords.words('english'))

porter = PorterStemmer()


def stopword_remover(lst):
    return " ".join([word for word in lst.split() if word not in stop_words])


def stemmer(lst):
    return " ".join([porter.stem(word) for word in lst.split()])


def remove_punc(sent):
    sent_without_punc = ""
    for char in sent:
        if char not in string.punctuation:
            sent_without_punc += char
        else:
            sent_without_punc += " "
    return sent_without_punc


def text_preprocess(original_text):
    processed_text = original_text.lower()
    #processed_text = stemmer(processed_text)
    processed_text = stopword_remover(processed_text)
    processed_text = remove_punc(processed_text)
    return processed_text

In [155]:
df = pd.read_csv('tweets.csv', encoding='cp1252', low_memory=False)
# https://www.kaggle.com/datasets/kazanova/sentiment140?resource=download
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [156]:
df.columns = ["sentiment", "id", "date", "query_flag", "user", "text"]
df["sentiment"] = df["sentiment"].apply(
    lambda val: "negative" if val == 0 else "positive")  # 0 = negative, 4 = positive
df.drop(["query_flag", "date"], axis=1, inplace=True)
df[:100:10]

Unnamed: 0,sentiment,id,user,text
0,negative,1467810672,scotthamilton,is upset that he can't update his Facebook by ...
10,negative,1467812579,pardonlauren,I just re-pierced my ears
20,negative,1467813992,swinspeedx,"one of my friend called me, and asked to meet ..."
30,negative,1467815924,EmCDL,@alielayus I want to go to promote GEAR AND GR...
40,negative,1467818481,lionslamb,He's the reason for the teardrops on my guitar...
50,negative,1467820906,voyage2k,"@localtweeps Wow, tons of replies from you, ma..."
60,negative,1467822918,krbleyle,just leaving the parking lot of work!
70,negative,1467825642,timmelko,@ninjen I'm sure you're right... I need to ...
80,negative,1467834227,driveaway2008,"@statravelAU just got ur newsletter, those far..."
90,negative,1467835577,viviana09,wednesday my b-day! don't know what 2 do!!


In [157]:
up_to_equal_sign_regex = re.compile(
    "^[^=]*.")  # matches line up to and including equals sign
subj_lex_rows = []

for line in open("subjclueslen1-HLTEMNLP05.tff").readlines():
    line_vals = line.split()

    for index, val in enumerate(line_vals):
        if "=" not in val:
            line_vals.pop(index)
        if val.startswith("mpqapolarity") or val.startswith("polarity"):
            line_vals.pop(index)

    for index, val in enumerate(line_vals):
        line_vals[index] = re.sub(up_to_equal_sign_regex, "", val)

    word_type, word_len, word, pos, stemmed, polarity = line_vals
    subj_lex_rows.append([word_type, word, pos, stemmed, polarity])

subjectivity_lexicon_df = pd.DataFrame(
    subj_lex_rows, columns=["type", "word", "pos", "stemmed", "polarity"])
subjectivity_lexicon_df.head(10)

Unnamed: 0,type,word,pos,stemmed,polarity
0,weaksubj,abandoned,adj,n,negative
1,weaksubj,abandonment,noun,n,negative
2,weaksubj,abandon,verb,y,negative
3,strongsubj,abase,verb,y,negative
4,strongsubj,abasement,anypos,y,negative
5,strongsubj,abash,verb,y,negative
6,weaksubj,abate,verb,y,negative
7,weaksubj,abdicate,verb,y,negative
8,strongsubj,aberration,adj,n,negative
9,strongsubj,aberration,noun,n,negative


In [158]:
subjectivity_lexicon_df.set_index("word", inplace=True)

In [159]:
subjectivity_lexicon_df.loc["happy"]

type        strongsubj
pos                adj
stemmed              n
polarity      positive
Name: happy, dtype: object

In [160]:
tweets = pd.DataFrame(df.sample(100000))
tweets

Unnamed: 0,sentiment,id,user,text
1461668,positive,2063954103,payunirbabae,@debianese It's nice to meet you &quot; face t...
891828,positive,1690978302,ColleenWild,@bobbyu714 :-\ Well there's next time. Weekend...
341895,negative,2015143970,Jantunstill,"@julesyog Oh, so you are hogging the sun today..."
632225,negative,2232599263,HouseofNoirLace,"@elegantslummer That's right, its nasty in NY ..."
558965,negative,2204875305,thejester100,@charmbracelet85 What reason have you to doubt...
...,...,...,...,...
1487300,positive,2068256571,CarlynJessica,hasn`t been keeping up with her tweets. How is...
1334566,positive,2016735582,violettepond,Why are they doing a Karate Kid remake? Should...
1509976,positive,2174891693,Courtneex3,@erliou56 oh yes it will
537569,negative,2198804046,cdub400H,bored as hell here in Statesboro


In [161]:
def calc_sentiment_rating(row):
    text = row["text"]
    text = text_preprocess(text)
    sentiment_rating = 0

    def find_sentiment(val):
        word = subjectivity_lexicon_df.loc[val]
        if isinstance(word, pd.DataFrame):
            word = word.iloc[0]
        polarity = word["polarity"]
        strength = word["type"]
        strength_multiplier = 1 if strength == "strongsubj" else 0.5
        if polarity == "positive":
            return strength_multiplier
        else:
            return -strength_multiplier

    for val in text.split():
        try:
            sentiment_rating += find_sentiment(val)
        except KeyError:
            try:
                # if word not found, try stem
                sentiment_rating += find_sentiment(stemmer(val))
            except KeyError:
                pass

    return sentiment_rating


def predict_sentiment(row):
    calculated_rating = row["sent_rating"]
    if abs(calculated_rating) < 1:
        return "neutral"
    if calculated_rating < 0:
        return "negative"
    return "positive"


tweets["sent_rating"] = tweets.apply(calc_sentiment_rating, axis=1)

scaler = MinMaxScaler(feature_range=(-3, 3))
scaled = scaler.fit_transform([[x] for x in tweets["sent_rating"]
                              ])  # normalize sentiment value
scaled = [int(val) for val in list(scaled)]  # round to nearest int

tweets["sent_rating"] = scaled

tweets["sentiment_predicted"] = tweets.apply(predict_sentiment, axis=1)
tweets.head(10)

Unnamed: 0,sentiment,id,user,text,sent_rating,sentiment_predicted
1461668,positive,2063954103,payunirbabae,@debianese It's nice to meet you &quot; face t...,0,neutral
891828,positive,1690978302,ColleenWild,@bobbyu714 :-\ Well there's next time. Weekend...,0,neutral
341895,negative,2015143970,Jantunstill,"@julesyog Oh, so you are hogging the sun today...",-1,negative
632225,negative,2232599263,HouseofNoirLace,"@elegantslummer That's right, its nasty in NY ...",0,neutral
558965,negative,2204875305,thejester100,@charmbracelet85 What reason have you to doubt...,0,neutral
452258,negative,2069974161,Reds72,Several people in my life need to get their ph...,0,neutral
394885,negative,2055738740,Jem7RB,"Been playing my old LP faded all afternoon, Da...",-1,negative
874807,positive,1680202163,missycaulk,@zigziggityzoo The HAPPY homeowner.,0,neutral
1058016,positive,1962821154,abpared,@ashleyasterisk I forgot too...and it just pop...,0,neutral
1558604,positive,2185871942,blueangel56,Anyone used or know about Zac Browser designed...,0,neutral


In [162]:
total = 0
correct = 0

for index, row in tweets.iterrows():
  if row["sent_rating"] != 0: # prediction made
    total += 1
    if row["sentiment"] == row["sentiment_predicted"]:
      correct += 1

print(f"Total predictions made: {total}")
print(f"Correct predictions: {correct}")
print(f"Percent correct: {correct/total*100}%")

Total predictions made: 14933
Correct predictions: 10704
Percent correct: 71.68017143239804%


In [163]:
total = 0
correct = 0

for index, row in tweets.iterrows():
  if abs(row["sent_rating"]) > 1: # strong prediction made
    total += 1
    if row["sentiment"] == row["sentiment_predicted"]:
      correct += 1

print(f"Strong predictions made: {total}")
print(f"Correct predictions: {correct}")
print(f"Percent correct: {correct/total*100}%")

Strong predictions made: 303
Correct predictions: 252
Percent correct: 83.16831683168317%
