In [1]:
#import tweepy
import pandas as pd
import numpy as np
import re
from datetime import datetime as dt

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
columns = ["target","id","date","flag","user","text"]
data = pd.read_csv("https://www.kaggle.com/kazanova/sentiment140/data", names=columns, index_col=False)
# Download the dataset linked here ^ and

data.drop(["flag","date"], axis=1, inplace=True)

In [4]:
delete = re.compile("[.;:!\'?,\"()\[\]*%$&^{}\\<>+=\-_]")

In [5]:
data["text"].replace(delete, "", regex=True)
data["text"] = data["text"].map(lambda x: "".join(["" if "@" in l else l+" " for l in x.split()]))

In [6]:
df_test  = data.iloc[0::2].reset_index(drop=True)
df_train = data.iloc[1::2].reset_index(drop=True)

In [7]:
tweets = list(df_train["text"])
tests = list(df_test["text"])

In [8]:
vectorizer=CountVectorizer()

In [30]:
vectorizer.fit(tweets)
X = vectorizer.transform(tweets)
X_test = vectorizer.transform(tests)

In [31]:
target=list(df_train["target"])
target_test=list(df_test["target"])

In [34]:
for c in [0.01, 0.05, 0.25, 0.5, 1]:
    lr = LogisticRegression(C=c)
    lr.fit(X, target)
    print ("C=%s: %s" % (c, accuracy_score(target_test, lr.predict(X_test))))

C=0.01: 0.78603625
C=0.05: 0.79417
C=0.25: 0.79679625
C=0.5: 0.7964025
C=1: 0.79519375


In [35]:
model = LogisticRegression(C=0.25)
model.fit(X, target)
print ("Final Accuracy: %s" % accuracy_score(target_test, model.predict(X_test)))

Final Accuracy: 0.79679625


In [36]:
feature_to_coef = {word: coef for word, coef in zip(vectorizer.get_feature_names(), model.coef_[0])}

for best_positive in sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:5]:
    print (best_positive)

for best_negative in sorted(feature_to_coef.items(), key=lambda x: x[1])[:5]:
    print (best_negative)

('worries', 2.195456811966638)
('smiling', 2.0360304485790928)
('pleasure', 1.9308718835996483)
('congratulations', 1.8983993713855165)
('welcome', 1.8640109852922537)
('sad', -3.0353807287748684)
('rip', -2.611159114085779)
('sadly', -2.5755503992475752)
('disappointing', -2.533280269023295)
('fathers', -2.5214688997324273)


In [48]:
model2 = LogisticRegression(C=0.01)
model2.fit(X, target)
print ("Final Accuracy: %s" % accuracy_score(target_test, model2.predict(X_test)))

Final Accuracy: 0.78603625


In [49]:
feature_to_coef = {word: coef for word, coef in zip(vectorizer.get_feature_names(), model2.coef_[0])}

for best_positive in sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:5]:
    print (best_positive)

for best_negative in sorted(feature_to_coef.items(), key=lambda x: x[1])[:5]:
    print (best_negative)

('thanks', 1.4589126544621156)
('welcome', 1.3926073711360265)
('thank', 1.2536384530280948)
('excited', 1.1545051093182543)
('glad', 1.1453016719181912)
('sad', -2.5664253227601814)
('miss', -2.072448674501172)
('sucks', -1.8509703880678698)
('poor', -1.81921839810167)
('missing', -1.7559944884977308)


In [50]:
model2 = LogisticRegression(C=0.001)
model2.fit(X, target)
print ("Final Accuracy: %s" % accuracy_score(target_test, model2.predict(X_test)))

Final Accuracy: 0.76096375


In [51]:
feature_to_coef = {word: coef for word, coef in zip(vectorizer.get_feature_names(), model2.coef_[0])}

for best_positive in sorted(feature_to_coef.items(), key=lambda x: x[1], reverse=True)[:5]:
    print (best_positive)

for best_negative in sorted(feature_to_coef.items(), key=lambda x: x[1])[:5]:
    print (best_negative)

('thanks', 1.1210364239110544)
('happy', 0.7739457733382591)
('thank', 0.7390255941767985)
('great', 0.7082352886986772)
('love', 0.6929043812828807)
('sad', -1.5149240090928795)
('miss', -1.40320095914648)
('sorry', -0.991527692992244)
('hate', -0.9518025883213777)
('wish', -0.94933310020284)


# This is actually pretty interesting

It turns out, as the regularization and accuracy increase, the words become more and more "overfit", in that they make less sense. E.g., "fathers" probably doesn't belong in the negative sentiment words list.

One possible explanation is that accuracy is a bad measure for this, and that it would be better to use another measure to figure out which regularization coefficient to use. Also, working with other hyperparameters might have helped the model be more accurate. I wrote this all at 11 pm though, so I'm probably going to try that stuff out some other time when I'm not trying to get sleep :/

In conclusion I should probably get some more knowlege about logistic regression

Borrowed the top/bottom words calculator and some other bits+pieces from https://towardsdatascience.com/sentiment-analysis-with-python-part-1-5ce197074184.