In [1]:
import pandas as pd
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import string
import re

In [2]:
stop_words = set(stopwords.words('english'))

porter = PorterStemmer()


def stopword_remover(lst):
    return " ".join([word for word in lst.split() if word not in stop_words])


def stemmer(lst):
    return " ".join([porter.stem(word) for word in lst.split()])


def remove_punc(sent):
    sent_without_punc = ""
    for char in sent:
        if char not in string.punctuation:
            sent_without_punc += char
        else:
            sent_without_punc += " "
    return sent_without_punc


def text_preprocess(original_text):
    processed_text = original_text.lower()
    #processed_text = stemmer(processed_text)
    processed_text = stopword_remover(processed_text)
    processed_text = remove_punc(processed_text)
    return processed_text

In [3]:
df = pd.read_csv('tweets.csv', encoding='cp1252', low_memory=False)
# https://www.kaggle.com/datasets/kazanova/sentiment140?resource=download
df.head()

Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew


In [4]:
df.columns = ["sentiment", "id", "date", "query_flag", "user", "text"]
df["sentiment"] = df["sentiment"].apply(
    lambda val: "neg" if val == 0 else "pos")  # 0 = negative, 4 = positive
df.drop(["query_flag", "date"], axis=1, inplace=True)
df.head(10)

Unnamed: 0,sentiment,id,user,text
0,neg,1467810672,scotthamilton,is upset that he can't update his Facebook by ...
1,neg,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...
2,neg,1467811184,ElleCTF,my whole body feels itchy and like its on fire
3,neg,1467811193,Karoli,"@nationwideclass no, it's not behaving at all...."
4,neg,1467811372,joy_wolf,@Kwesidei not the whole crew
5,neg,1467811592,mybirch,Need a hug
6,neg,1467811594,coZZ,@LOLTrish hey long time no see! Yes.. Rains a...
7,neg,1467811795,2Hood4Hollywood,@Tatiana_K nope they didn't have it
8,neg,1467812025,mimismo,@twittera que me muera ?
9,neg,1467812416,erinx3leannexo,spring break in plain city... it's snowing


In [5]:
up_to_equal_sign_regex = re.compile(
    "^[^=]*.")  # matches line up to and including equals sign
subj_lex_rows = []

for line in open("subjclueslen1-HLTEMNLP05.tff").readlines():
    line_vals = line.split()

    for index, val in enumerate(line_vals):
        if "=" not in val:
            line_vals.pop(index)
        if val.startswith("mpqapolarity") or val.startswith("polarity"):
            line_vals.pop(index)

    for index, val in enumerate(line_vals):
        line_vals[index] = re.sub(up_to_equal_sign_regex, "", val)

    word_type, word_len, word, pos, stemmed, polarity = line_vals
    subj_lex_rows.append([word_type, word, pos, stemmed, polarity])

subjectivity_lexicon_df = pd.DataFrame(
    subj_lex_rows, columns=["type", "word", "pos", "stemmed", "polarity"])
subjectivity_lexicon_df.head()

Unnamed: 0,type,word,pos,stemmed,polarity
0,weaksubj,abandoned,adj,n,negative
1,weaksubj,abandonment,noun,n,negative
2,weaksubj,abandon,verb,y,negative
3,strongsubj,abase,verb,y,negative
4,strongsubj,abasement,anypos,y,negative


In [6]:
subjectivity_lexicon_df.set_index("word", inplace=True)

In [7]:
subjectivity_lexicon_df.loc["happy"]

type        strongsubj
pos                adj
stemmed              n
polarity      positive
Name: happy, dtype: object

In [8]:
tweets = pd.DataFrame(df.sample(20))
tweets

Unnamed: 0,sentiment,id,user,text
1144597,pos,1977642607,lovechrissy,@mitchelmusso hello las vegas!! Come hereeee
1263267,pos,1999002323,paccadoodle,@poodlez do you like them BBQ'd? That changes ...
531364,neg,2196297020,ChristianMarsh,Back from a lung-buster of a ride over Cannock...
1282835,pos,2001877503,phusionade,@hueyyei i read that book too. thumbs up! and ...
765916,neg,2299650006,CiakyAAR,i'm not so good tonight..i feel sick bad thin...
10205,neg,1550815767,kcatack,nooooo only one more day of holiday left ... ...
1193520,pos,1984279400,reganily,"ESTK's new record is on myspace, sooo good."
279708,neg,1991863171,rebeccairaheta,saying goodbye to the beach
197356,neg,1970980849,Crissie_Casiano,My allergies are kiiiilling me today. I can't ...
547695,neg,2202148651,tiffawahfoo,@pandabooBoo i NEEEEED A JOBBBBBB FOOOOOOL!!!


In [9]:
def calc_sentiment_rating(row):
    text = row["text"]
    text = text_preprocess(text)
    sentiment_rating = 0

    def find_sentiment(val):
        word = subjectivity_lexicon_df.loc[val]
        if isinstance(word, pd.DataFrame):
            word = word.iloc[0]
        polarity = word["polarity"]
        strength = word["type"]
        strength_multiplier = 1 if strength == "strongsubj" else 0.5
        if polarity == "positive":
            return strength_multiplier
        else:
            return -strength_multiplier

    for val in text.split():
        try:
            sentiment_rating += find_sentiment(val)
        except KeyError:
            try:
                # if word not found, try stem
                sentiment_rating += find_sentiment(stemmer(val))
            except KeyError:
                pass

    return sentiment_rating

def predict_sentiment(row):
    calculated_rating = row["sent_rating"]
    if abs(calculated_rating) < 1:
        return "neutral"
    if calculated_rating < 0:
        return "neg"
    return "pos"


tweets["sent_rating"] = tweets.apply(calc_sentiment_rating, axis=1)
tweets["sentiment_predicted"] = tweets.apply(predict_sentiment, axis=1)
tweets[:20]


Unnamed: 0,sentiment,id,user,text,sent_rating,sentiment_predicted
1144597,pos,1977642607,lovechrissy,@mitchelmusso hello las vegas!! Come hereeee,0.0,neutral
1263267,pos,1999002323,paccadoodle,@poodlez do you like them BBQ'd? That changes ...,1.0,pos
531364,neg,2196297020,ChristianMarsh,Back from a lung-buster of a ride over Cannock...,-1.5,neg
1282835,pos,2001877503,phusionade,@hueyyei i read that book too. thumbs up! and ...,-0.5,neutral
765916,neg,2299650006,CiakyAAR,i'm not so good tonight..i feel sick bad thin...,0.0,neutral
10205,neg,1550815767,kcatack,nooooo only one more day of holiday left ... ...,-1.0,neg
1193520,pos,1984279400,reganily,"ESTK's new record is on myspace, sooo good.",0.5,neutral
279708,neg,1991863171,rebeccairaheta,saying goodbye to the beach,0.0,neutral
197356,neg,1970980849,Crissie_Casiano,My allergies are kiiiilling me today. I can't ...,-0.5,neutral
547695,neg,2202148651,tiffawahfoo,@pandabooBoo i NEEEEED A JOBBBBBB FOOOOOOL!!!,0.0,neutral
