In [105]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from gensim.models import Word2Vec
import spacy
import numpy as np
import pickle
from tqdm import tqdm
from catboost import CatBoostClassifier
import shap

In [106]:
# Load the train/test sentiment CSVs and pull out the text and label columns.
sentiment_train_data = pd.read_csv("sentiment-train.csv")
sentiment_test_data = pd.read_csv("sentiment-test.csv")

train_x = sentiment_train_data["text"]
train_y = sentiment_train_data["sentiment"]
test_x = sentiment_test_data["text"]
test_y = sentiment_test_data["sentiment"]

# Echo the training inputs and labels as the cell output.
(train_x, train_y)

(0             I LOVE @Health4UandPets u guys r the best!! 
 1        im meeting up with one of my besties tonight! ...
 2        @DaRealSunisaKim Thanks for the Twitter add, S...
 3        Being sick can be really cheap when it hurts t...
 4          @LovesBrooklyn2 he has that effect on everyone 
                                ...                        
 59995    best weekend ever; Caty Costigans house, Marle...
 59996    Oh FFS! I've been here all fucking day. Why de...
 59997    Leaving britney-just found out jon from new ki...
 59998    @Neomic I havta' go pee, but Im scared to walk...
 59999    Nooooooooooooooo!!!!!! School today. But the w...
 Name: text, Length: 60000, dtype: object,
 0        1
 1        1
 2        1
 3        1
 4        1
         ..
 59995    0
 59996    0
 59997    0
 59998    0
 59999    0
 Name: sentiment, Length: 60000, dtype: int64)

In [107]:
# Raw term counts over a vocabulary capped at 1000 features. The vocabulary is
# fit on the training text and reused unchanged to encode the test text.
firstCv = CountVectorizer(
    max_features=1000,
)
firstVectorizedTrain = firstCv.fit_transform(train_x)
firstVectorizedTest = firstCv.transform(test_x)

# Show the sparse-matrix summary for the encoded training set.
firstVectorizedTrain

<60000x1000 sparse matrix of type '<class 'numpy.int64'>'
	with 529271 stored elements in Compressed Sparse Row format>

In [108]:
# Multinomial Naive Bayes baseline on the raw-count features.
# fit() returns the estimator itself, so it can be bound directly.
mnb = MultinomialNB().fit(firstVectorizedTrain, train_y)
mnb

MultinomialNB()

In [109]:
# Score the count-based Naive Bayes model on the held-out test set.
preds = mnb.predict(firstVectorizedTest)
print(classification_report(y_true=test_y, y_pred=preds))

              precision    recall  f1-score   support

           0       0.75      0.68      0.71       177
           1       0.71      0.79      0.75       182

    accuracy                           0.73       359
   macro avg       0.73      0.73      0.73       359
weighted avg       0.73      0.73      0.73       359



In [110]:
# Same 1000-feature vocabulary cap, but with binary=True passed to the vectorizer.
binary_cv = CountVectorizer(binary=True, max_features=1000)
binaryVectorizedTrain = binary_cv.fit_transform(train_x)
binaryVectorizedTest = binary_cv.transform(test_x)

# fit() returns the estimator itself, so it can be bound directly.
binary_mnb = MultinomialNB().fit(binaryVectorizedTrain, train_y)
binary_mnb

MultinomialNB()

In [111]:
# Score the binary-feature Naive Bayes model on the held-out test set.
binaryPreds = binary_mnb.predict(binaryVectorizedTest)
print(classification_report(y_true=test_y, y_pred=binaryPreds))

              precision    recall  f1-score   support

           0       0.76      0.67      0.71       177
           1       0.71      0.80      0.75       182

    accuracy                           0.74       359
   macro avg       0.74      0.73      0.73       359
weighted avg       0.74      0.74      0.73       359



In [112]:
# Logistic regression on the raw-count features. max_iter is set very high
# (10**6) so the iteration cap is effectively never the stopping criterion.
log_reg = LogisticRegression(max_iter=10**6).fit(firstVectorizedTrain, train_y)
logPreds = log_reg.predict(firstVectorizedTest)
print(classification_report(y_true=test_y, y_pred=logPreds))

              precision    recall  f1-score   support

           0       0.82      0.68      0.75       177
           1       0.74      0.86      0.79       182

    accuracy                           0.77       359
   macro avg       0.78      0.77      0.77       359
weighted avg       0.78      0.77      0.77       359



In [113]:
# Logistic regression on the binary (binary=True) features, same settings as
# the count-based model so the two reports are directly comparable.
bin_log_reg = LogisticRegression(max_iter=10**6).fit(binaryVectorizedTrain, train_y)
binLogPreds = bin_log_reg.predict(binaryVectorizedTest)
print(classification_report(y_true=test_y, y_pred=binLogPreds))

              precision    recall  f1-score   support

           0       0.81      0.67      0.73       177
           1       0.72      0.85      0.78       182

    accuracy                           0.76       359
   macro avg       0.77      0.76      0.76       359
weighted avg       0.77      0.76      0.76       359



In [None]:
# Stratified k-fold cross-validation on the training set: compare raw-count
# vs. binary bag-of-words features for MultinomialNB at several vocabulary sizes.
skf = StratifiedKFold(n_splits = 10)
n_splits = skf.get_n_splits()  # divisor for the mean; stays in sync with skf
for n_features in [1000, 2000, 3000, 4000]:
    # Summed fold accuracies, keyed by the CountVectorizer `binary` flag.
    accuracy_sums = {False: 0.0, True: 0.0}
    for train_index, test_index in skf.split(train_x, train_y):
        # skf.split yields positional indices, so rows are selected with .iloc.
        # Plain [] is label-based and only lines up on a default RangeIndex.
        fold_train_x, fold_train_y = train_x.iloc[train_index], train_y.iloc[train_index]
        fold_test_x, fold_test_y = train_x.iloc[test_index], train_y.iloc[test_index]

        for binary in (False, True):
            # The vocabulary is fit on the training fold only.
            vectorizer = CountVectorizer(max_features = n_features, binary = binary)
            vectorizedTrain = vectorizer.fit_transform(fold_train_x)
            vectorizedTest = vectorizer.transform(fold_test_x)

            model = MultinomialNB()
            model.fit(vectorizedTrain, fold_train_y)
            preds = model.predict(vectorizedTest)
            accuracy_sums[binary] += accuracy_score(fold_test_y, preds)

    print("With %d features and not binary: %.3f" % (n_features, accuracy_sums[False] / n_splits))
    print("With %d features and binary: %.3f" % (n_features, accuracy_sums[True] / n_splits))

In [None]:
# Refit Naive Bayes on the whole training set with a 4000-feature binary
# vocabulary, then report on the held-out test set.
vectorizer = CountVectorizer(binary=True, max_features=4000)
vectorizedTrain = vectorizer.fit_transform(train_x)
vectorizedTest = vectorizer.transform(test_x)

model = MultinomialNB().fit(vectorizedTrain, train_y)
preds = model.predict(vectorizedTest)
print(classification_report(y_true=test_y, y_pred=preds))

In [None]:
# spaCy pipeline used by tokenize_tweet.
nlp = spacy.load("en_core_web_sm")

def tokenize_tweet(x):
    """Return the text of every token in `x` as a flat list of str.

    The text is run through the module-level `nlp` pipeline and tokens are
    collected sentence by sentence, in document order.
    """
    doc = nlp(x)
    return [str(token) for sent in doc.sents for token in sent]


In [None]:
def avgVec(model, sentence):
    """Average the word vectors of the in-vocabulary tokens of `sentence`.

    Parameters
    ----------
    model : object
        Anything exposing `model.wv[word]` lookups (raising KeyError for
        unknown words) and `model.wv.vector_size`, e.g. a gensim Word2Vec model.
    sentence : iterable of str
        Tokens to look up.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors that were found. When no token is in
        the vocabulary (or `sentence` is empty), a zero vector of length
        `model.wv.vector_size`.
    """
    keyed_vectors = model.wv
    found = []
    for word in sentence:
        try:
            found.append(keyed_vectors[word])
        except KeyError:
            # Out-of-vocabulary token: skip it. Only KeyError is swallowed, so
            # genuine failures (and KeyboardInterrupt) still surface.
            continue
    if not found:
        # Size the fallback from the model rather than assuming 300 dimensions.
        return np.zeros(keyed_vectors.vector_size)
    # Built-in sum starts from 0 and adds left to right, creating new arrays,
    # so the vectors stored in the model are never modified in place.
    return sum(found) / len(found)

In [None]:
# Peek at a naive whitespace split of the first training tweet only; there is
# no need to split every tweet just to display one.
train_x.iloc[0].split(" ")

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it does not run.
# It tokenizes every training tweet with tokenize_tweet and pickles the resulting
# list to "w2v.txt", the file the following cell reads back.
'''
tokenized_train = []
for i in tqdm(range(len(train_x))):
    tokenized_train.append(tokenize_tweet(train_x[i]))
    
with open("w2v.txt", "wb") as file:
    pickle.dump(tokenized_train, file)
'''

In [None]:
# Load the cached spaCy tokenization of the training tweets. If the cache file
# is missing (e.g. fresh checkout), rebuild and save it so the notebook still
# runs top to bottom.
# NOTE: pickle.load can execute arbitrary code; only load a w2v.txt that this
# notebook produced.
try:
    with open("w2v.txt", "rb") as file:
        tokenized_train = pickle.load(file)
except FileNotFoundError:
    tokenized_train = [tokenize_tweet(tweet) for tweet in tqdm(train_x)]
    with open("w2v.txt", "wb") as file:
        pickle.dump(tokenized_train, file)

In [None]:
# Train 300-dimensional Word2Vec embeddings on the tokenized training tweets.
model = Word2Vec(
    vector_size=300,
    sentences=tokenized_train,
)

In [None]:
# One averaged embedding per training tweet, in the same order as tokenized_train.
vectorized_train = [
    avgVec(model, tweet_tokens)
    for tweet_tokens in tokenized_train
]

In [None]:
# Logistic regression on the averaged Word2Vec features.
# fit() returns the estimator itself, so it can be bound directly.
log = LogisticRegression(max_iter=10**6).fit(vectorized_train, train_y)
log

In [None]:
# Test tweets are tokenized on the fly (no cache) and embedded with the same model.
vectorized_test = [
    avgVec(model, tokenize_tweet(tweet))
    for tweet in test_x
]

In [None]:
# Score the Word2Vec + logistic regression model on the held-out test set.
preds = log.predict(vectorized_test)
print(classification_report(y_true=test_y, y_pred=preds))

In [None]:
# Read only columns 0 and 5 of the headerless 1.6M-tweet CSV. Downstream cells
# use column 0 as the label and column 5 as the text.
all_train_tweets = pd.read_csv(
    "training.1600000.processed.noemoticon.csv",
    header=None,
    usecols=[0, 5],
    encoding='latin-1',
)

# Drop rows labelled 2, then recode label 4 as 1 so labels are 0/1
# (presumably 2 = neutral and 4 = positive; confirm against the dataset docs).
label_is_two = all_train_tweets[0] == 2
all_train_tweets = all_train_tweets[~label_is_two]
label_is_four = all_train_tweets[0] == 4
all_train_tweets.loc[label_is_four, 0] = 1

all_train_tweets


In [None]:
# Disabled cell: the code below is wrapped in a string literal, so it does not run.
# It repeats the 10-fold count-vs-binary MultinomialNB comparison on
# all_train_tweets and writes the printed summary lines to "bonusResults.txt",
# the file the following cell prints.
'''
train_x = all_train_tweets[5]
train_y = all_train_tweets[0]
results = ""

skf = StratifiedKFold(n_splits = 10)
for n_features in [1000, 2000, 3000, 4000]:
    without_binary = 0
    with_binary = 0
    for train_index, test_index in skf.split(train_x, train_y):
        vectorizer = CountVectorizer(max_features = n_features)
        vectorizedTrain = vectorizer.fit_transform(train_x[train_index])
        vectorizedTest = vectorizer.transform(train_x[test_index])
        binVectorizer = CountVectorizer(max_features = n_features, binary = True)
        binVectorizedTrain = binVectorizer.fit_transform(train_x[train_index])
        binVectorizedTest = binVectorizer.transform(train_x[test_index])
        
        model = MultinomialNB()
        model.fit(vectorizedTrain, train_y[train_index])
        preds = model.predict(vectorizedTest)
        without_binary += accuracy_score(train_y[test_index], preds)
        
        binModel = MultinomialNB()
        binModel.fit(binVectorizedTrain, train_y[train_index])
        preds = binModel.predict(binVectorizedTest)
        with_binary += accuracy_score(train_y[test_index], preds)
    result1 = "With %d features and not binary: %.3f" % (n_features, without_binary / 10)
    result2 = "With %d features and binary: %.3f" % (n_features, with_binary / 10)
    results += result1 + "\n" + result2 + "\n"
    print(result1)
    print(result2)

with open("bonusResults.txt", "w") as file:
    file.write(results)
'''

In [None]:
# Show the saved cross-validation summary for the full dataset.
with open("bonusResults.txt", "r") as results_file:
    saved_results = results_file.read()
print(saved_results)

In [None]:
# Training on the entire data set with 4000 features and binary=True.
# NOTE: this rebinds train_x / train_y to the full tweet corpus
# (column 5 = text, column 0 = label); test_x / test_y are unchanged.
train_x, train_y = all_train_tweets[5], all_train_tweets[0]

vectorizer = CountVectorizer(binary=True, max_features=4000)
train = vectorizer.fit_transform(train_x)
test = vectorizer.transform(test_x)

model = MultinomialNB()
model.fit(train, train_y)
preds = model.predict(test)
print(classification_report(y_true=test_y, y_pred=preds))
# Slight increase to .79 accuracy


In [None]:
# Vocabulary of firstCv, the vectorizer whose features log_reg was trained on,
# so that positions in `features` line up with log_reg.coef_[0].
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on older versions.
feature_names_fn = getattr(firstCv, "get_feature_names_out", None) or firstCv.get_feature_names
# Keep a plain list: later cells call features.index(...).
features = list(feature_names_fn())
features[5:15]

In [None]:
# Look up the logistic-regression weights attached to the words 'good' and 'bad'.
coefs = log_reg.coef_[0]
goodIndex, badIndex = features.index('good'), features.index('bad')
goodCoef, badCoef = coefs[goodIndex], coefs[badIndex]

(goodIndex, goodCoef, badIndex, badCoef)

In [None]:
#Most positive and most negative word
features[np.argmax(log_reg.coef_[0])], max(log_reg.coef_[0]), features[np.argmin(log_reg.coef_[0])], min(log_reg.coef_[0])

In [None]:
# Point train/test variables back at the sentiment-train / sentiment-test frames.
train_x = sentiment_train_data["text"]
train_y = sentiment_train_data["sentiment"]
test_x = sentiment_test_data["text"]
test_y = sentiment_test_data["sentiment"]

# CatBoost with default settings on the 1000-feature count matrix.
model = CatBoostClassifier()
model.fit(firstVectorizedTrain, train_y, verbose=False)

# Report on the training data first, then on the held-out test data.
train_preds = model.predict(firstVectorizedTrain)
print(classification_report(y_true=train_y, y_pred=train_preds))
preds = model.predict(firstVectorizedTest)
print(classification_report(y_true=test_y, y_pred=preds))

In [None]:
# Beeswarm summary of SHAP values for `model` over the training matrix,
# labelled with firstCv's vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on older versions.
feature_names_fn = getattr(firstCv, "get_feature_names_out", None) or firstCv.get_feature_names
explainer = shap.TreeExplainer(model, feature_names=list(feature_names_fn()))
shap_values = explainer(firstVectorizedTrain)
shap.plots.beeswarm(shap_values)

In [None]:
# Force plot explaining the model's output for a single training tweet (row 3).
print(train_x[3])
shap.initjs()
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(firstVectorizedTrain)
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on older versions.
feature_names_fn = getattr(firstCv, "get_feature_names_out", None) or firstCv.get_feature_names
# Densify only the row being plotted instead of the whole sparse matrix.
row_features = firstVectorizedTrain[3].toarray()[0]
shap.plots.force(explainer.expected_value, shap_values[3], row_features, list(feature_names_fn()))