In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import preprocessor as p
import string
import re
import pickle as pkl

In [None]:
# Labeled tweet corpus (presumably Sentiment140 - confirm): only file columns
# 0 and 5 are read, and they are named 'label' and 'tweet' respectively.
sent140tweets = pd.read_csv(
    'Desktop/full_tweets.csv',
    encoding='latin-1',
    names=['label', 'tweet'],
    usecols=[0, 5],
)

In [None]:
# Load the labeling-form export and pivot it so that every row is one form
# column (presumably one tweet/question - confirm against the form) and every
# remaining column is one submitted response.
raw_form_export = pd.read_csv('Downloads/MADS Tweet Sentiment Labeling (Responses) - Form Responses 1.csv')
responses = raw_form_export.T
responses = responses.iloc[1:, :]     # drop the first transposed row (presumably the timestamp column)
responses = responses.reset_index()   # the former column headers become an 'index' column

In [None]:
# Collapse the three text sentiment classes into the binary 0 / 4 encoding:
# 'negative' -> 0, 'neutral' and 'positive' -> 4.
SENTIMENT_TO_CODE = {'negative':0,'neutral':4,'positive':4}
vader_labeled_tweets = pd.read_csv('Desktop/sampled_tweets.csv').assign(
    sentiment=lambda frame: frame['sentiment'].map(SENTIMENT_TO_CODE)
)

In [None]:
def mapper(x):
    """Encode one survey answer as a negative-vote flag.

    :param x: a single cell value from the survey responses
    :return: 1 when the value equals 'Negative' exactly, otherwise 0
    """
    return 1 if x == 'Negative' else 0

In [None]:
# Encode every answer cell (all columns after the leading 'index' column) as
# 1 for 'Negative' and 0 for anything else.  A column-wise Series.map is used
# instead of DataFrame.applymap, which is deprecated in recent pandas, and
# `mapper` is passed directly rather than through a redundant lambda wrapper.
# NOTE: this cell is not idempotent - re-running it on already-encoded data
# turns every vote into 0, so reload `responses` before running it again.
responses.iloc[:,1:] = responses.iloc[:,1:].apply(lambda answers: answers.map(mapper))

In [None]:
# Divisor used to turn the per-tweet count of negative votes into a share
# (assumed to be the number of survey respondents - confirm against the form).
N_RATERS = 14

# Snapshot the vote columns BEFORE any derived column is added.  The previous
# version re-sliced `responses.iloc[:, 1:]` for the label after '%neg' and
# '%pos-neut' had been appended, so those two columns (which always sum to 1)
# were counted as an extra negative vote and tied tweets were labeled negative.
votes = responses.iloc[:, 1:]
neg_share = votes.sum(axis=1).astype(float) / N_RATERS

responses['%neg'] = neg_share
responses['%pos-neut'] = 1 - neg_share
# Majority label: 1 when more than half of the votes are negative, else 0.
# Series.round rounds half to even, so an exact 50/50 split yields 0.
responses['label'] = neg_share.round(0).astype(int)
responses = responses[['index', 'label','%neg','%pos-neut']]

In [None]:
# Per-column non-null counts for the tweets on which every vote agreed.
is_unanimous_neg = responses['%neg'] == 1
is_unanimous_posneut = responses['%pos-neut'] == 1
all_neg = responses.loc[is_unanimous_neg].count()
all_posneut = responses.loc[is_unanimous_posneut].count()
all_neg, all_posneut

In [None]:
# Drop the first three characters of each former column header (presumably a
# fixed-width question prefix - confirm) and expose the result as 'tweet'.
responses = (
    responses
    .assign(index=responses['index'].str[3:])
    .rename(columns={'index':'tweet'})
)

In [None]:
# Re-encode the majority-vote flag into the 0 / 4 scheme used by the other
# label columns: 1 (majority negative) -> 0, 0 (otherwise) -> 4.
MAJORITY_FLAG_TO_CODE = {1:0, 0:4}
responses['label'] = responses['label'].map(MAJORITY_FLAG_TO_CODE)
# Aligned on the row index; assumes both frames list the tweets in the same
# order - TODO confirm.
responses['vader_label'] = vader_labeled_tweets['sentiment']

In [None]:
from sklearn.metrics import accuracy_score
# Share of tweets where the VADER-derived label matches the human majority label.
vader_acc = accuracy_score(
    y_true=responses['label'],
    y_pred=responses['vader_label'],
)

In [None]:
# Non-null count of every column, split by human majority label.
label_counts = responses.groupby(by='label').count()
label_counts

In [None]:
# NOTE(review): unpickling executes arbitrary code - only load model files
# that come from a trusted source.
PHRASE_MODEL_PATH = '/Users/ryanmaloney/Downloads/phrasemodel_SVC.sav'
with open(PHRASE_MODEL_PATH, mode='rb') as phrase_model_file:
    phrase_model = pkl.load(phrase_model_file)

In [None]:
#LSI preprocessing 
from gensim.parsing.preprocessing import preprocess_string, STOPWORDS
# URL / retweet fragments filtered out in addition to gensim's built-in stop words.
CUSTOM_STOP_WORDS = [
    'www', 'twitpic', 'tinyurl', 'com',
    'https', 'http', '&amp', 'rt',
    'bit', 'ly', 'bitly',
]
FULL_STOP = STOPWORDS | frozenset(CUSTOM_STOP_WORDS)

# Phrase models already unpickled, keyed by file path, so the model is read
# from disk once instead of once per tweet.
_PHRASE_MODEL_CACHE = {}


def _load_phrase_model(path):
    """Return the unpickled phrase model stored at `path`, loading it at most once."""
    if path not in _PHRASE_MODEL_CACHE:
        # NOTE(review): unpickling executes arbitrary code - only point this
        # at model files from a trusted source.
        with open(path, 'rb') as model_file:
            _PHRASE_MODEL_CACHE[path] = pkl.load(model_file)
    return _PHRASE_MODEL_CACHE[path]


def preprocess_tweet_body(path, tweet_body):
    """Tokenize a single Tweet and run the tokens through the phrase model.

    The text is tokenized with gensim's ``preprocess_string``, tokens found in
    ``FULL_STOP`` are dropped, and the remaining tokens are used to index the
    phrase model.

    The model is cached per path after the first call; the previous version
    unpickled it on every call and never closed the file handle.

    :param path: the path to the pickled phrase model
    :param tweet_body: the text content of a single Tweet
    :return: the result of indexing the phrase model with the token list
        (presumably a list of tokens with detected bigrams joined by
        underscores - confirm against the phrase model's type)
    """
    phrase_model = _load_phrase_model(path)
    tweet_tokens = [word for word in preprocess_string(tweet_body) if word not in FULL_STOP]
    return phrase_model[tweet_tokens]

def _tokenize_with_phrase_model(text):
    """Tokenize one tweet using the pickled phrase model on disk."""
    return preprocess_tweet_body('/Users/ryanmaloney/Downloads/phrasemodel_SVC.sav', text)


# Token lists for the human-labeled tweets and for the full labeled corpus.
tweets = responses['tweet'].apply(_tokenize_with_phrase_model)
labeled_tweets = sent140tweets['tweet'].apply(_tokenize_with_phrase_model)

In [None]:
# NOTE(review): unpickling executes arbitrary code - only load model files
# that come from a trusted source.
SVC_MODEL_PATH = '/Users/ryanmaloney/Desktop/LinearSVCModel.sav'
with open(SVC_MODEL_PATH, mode='rb') as svc_model_file:
    SVC_model = pkl.load(svc_model_file)

In [None]:
# Score all human-labeled tweets in one batch instead of calling the model
# twice per tweet: each token list is re-joined into a space-separated string,
# which is the input form the model was already being given one at a time.
human_docs = [" ".join(tokens) for tokens in tweets]
human_probas = SVC_model.predict_proba(human_docs)

SVC_preds = list(SVC_model.predict(human_docs))
# Column 0 is used as the negative probability and column 1 as the
# positive/neutral probability, exactly as before - assumes the model's class
# order is [0, 4]; TODO confirm via SVC_model.classes_.
SVC_pred_neg = list(human_probas[:, 0])
SVC_pred_posneut = list(human_probas[:, 1])

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

In [None]:
# Attach the model outputs to the human-labeled frame, one column per list.
svc_output_columns = {
    'SVC_preds': SVC_preds,
    'SVC_pred_neg': SVC_pred_neg,
    'SVC_pred_posneut': SVC_pred_posneut,
}
for column_name, column_values in svc_output_columns.items():
    responses[column_name] = column_values

In [None]:
# Reproducible 100k-row sample of the labeled corpus.
# The previous version referenced `sent140_tweets`, a name this notebook never
# defines (the loaded frame is `sent140tweets`), and the 'tokenized_tweets'
# column read by the prediction loop was never created.  The token lists in
# `labeled_tweets` share the corpus index, so they are attached before sampling.
sampled_sent140 = (
    sent140tweets
    .assign(tokenized_tweets=labeled_tweets)
    .sample(n=100000, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Score the sampled training tweets in one batch instead of two model calls
# per tweet: each token list is re-joined into a space-separated string, the
# same input form that was previously passed one tweet at a time.
training_docs = [" ".join(tokens) for tokens in sampled_sent140['tokenized_tweets'].values]
training_probas = SVC_model.predict_proba(training_docs)

pred_training = list(SVC_model.predict(training_docs))
# Column 0 is used as the negative probability and column 1 as the
# positive/neutral probability, exactly as before - assumes the model's class
# order is [0, 4]; TODO confirm via SVC_model.classes_.
pred_proba_training_neg = list(training_probas[:, 0])
pred_proba_training_posneut = list(training_probas[:, 1])

In [None]:
# One row per sampled training tweet: model probabilities, hard prediction,
# and the corpus label (aligned on the 0..n-1 index of `sampled_sent140`).
training_preds_df = pd.DataFrame({
    'Negative_Probability': pred_proba_training_neg,
    'PositiveNeutral_Probability': pred_proba_training_posneut,
    'Prediction': pred_training,
    'True_Label': sampled_sent140['label'],
})

In [None]:
# Classification report for the SVC against the human majority labels,
# formatted as a percentage table for display.
hu_labeled_report = classification_report(responses['label'],responses['SVC_preds'], output_dict=True)

score_columns = ['precision','recall','f1-score']
hu_labeled_report_df = pd.DataFrame(hu_labeled_report).T
# Row label typo fixed ('Postive' -> 'Positive').
hu_labeled_report_df = hu_labeled_report_df.rename(index={'0':'Negative (0)', '4':'Positive/Neutral (4)'})
hu_labeled_report_df[score_columns] = hu_labeled_report_df[score_columns]*100
# Cast to object so blank strings can be written without a dtype-mismatch warning.
hu_labeled_report_df = hu_labeled_report_df.round(1).astype(object)

# The 'accuracy' row repeats one scalar in every column; keep it only under
# 'f1-score'.  The row is addressed by label rather than by position so the
# blanking cannot hit the wrong row if the set of classes changes.
if 'accuracy' in hu_labeled_report_df.index:
    hu_labeled_report_df.loc['accuracy', ['precision', 'recall', 'support']] = ''

hu_labeled_report_df

In [None]:
import matplotlib.pyplot as plt

In [None]:
import seaborn as sns

In [None]:
# Confusion matrix, human-labeled data.
# The class order is pinned with `labels=` so the tick names always match the
# matrix rows/columns, and both axes are labeled so the figure states which
# axis holds the human labels and which the model predictions.
class_codes = [0, 4]
class_names = ['Negative','Positive']
confusion_mat = confusion_matrix(responses['label'],responses['SVC_preds'], labels=class_codes)

fig, ax = plt.subplots(figsize=(4,4))
sns.heatmap(confusion_mat,annot=True,fmt='g',
            ax=ax,xticklabels=class_names,yticklabels=class_names)
ax.set_xlabel('Predicted Label')   # confusion_matrix columns are predictions
ax.set_ylabel('Human Label')       # confusion_matrix rows are true labels
ax.set_title('Confusion Matrix, LinearSVC on Human-Labeled Tweets')
# bbox_inches='tight' keeps the added axis labels inside the saved image.
fig.savefig('Desktop/Capstone_Figs/human_labeled_confusionmatrix', bbox_inches='tight')

In [None]:
# ROC of the positive/neutral probability against the human labels, treating
# label 4 as the positive class.
fpr, tpr, thresh = roc_curve(
    responses['label'],
    responses['SVC_pred_posneut'],
    pos_label=4,
)
roc_auc_hu_tweets = auc(fpr, tpr)
roc_auc_hu_tweets

In [None]:
# ROC curve for the human-labeled tweets.  The AUC in the title is formatted
# from `roc_auc_hu_tweets` rather than hard-coded, so it cannot drift from the
# value that was actually computed.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='firebrick')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title(f'ROC Curve - AUC {roc_auc_hu_tweets:.1%} (Linear SVC on Human-Labeled Tweets)');

In [None]:
# ROC of the positive/neutral probability against the corpus labels of the
# sampled training tweets, treating label 4 as the positive class.
fpr_training, tpr_training, thresh = roc_curve(
    training_preds_df['True_Label'],
    training_preds_df['PositiveNeutral_Probability'],
    pos_label=4,
)
roc_auc_training_tweets = auc(fpr_training, tpr_training)
roc_auc_training_tweets

In [None]:
# ROC curve for the sampled training tweets.  The AUC in the title is
# formatted from `roc_auc_training_tweets` rather than hard-coded, so it
# cannot drift from the value that was actually computed.
fig, ax = plt.subplots()
ax.plot(fpr_training, tpr_training, color='navy')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title(f'ROC Curve - AUC {roc_auc_training_tweets:.1%} (Linear SVC on Training Tweets)');

In [None]:
# Human positive/neutral vote share minus the model's positive/neutral
# probability: values above zero mean the humans were more positive than the model.
responses['Positive/Neutral Prediction Difference'] = responses['%pos-neut'].sub(
    responses['SVC_pred_posneut']
)

In [None]:
neg_only = responses[responses['label']==0]
pos_only = responses[responses['label']==4]

# Theme and figure size must be configured BEFORE the figure is created;
# previously they were set after plt.subplots(), so they only took effect on
# later figures (or on a second run of this cell).
sns.set(rc={"figure.figsize":(10, 10)})
sns.set_theme(style="white", palette=None)
fig, ax = plt.subplots()

# One color per bar: navy where the human share exceeds the model probability,
# firebrick otherwise.
neg_palette = neg_only['Positive/Neutral Prediction Difference'].apply(lambda x : 'navy' if x > 0 else 'firebrick').values
sns.barplot(x=neg_only.index.values, y='Positive/Neutral Prediction Difference', data=neg_only, palette = neg_palette,
            ax=ax)
ax.set_ylabel('<-Human Labeled More Negative             Human Labeled Less Negative->', size=14)
ax.get_xaxis().set_ticks([])   # one bar per tweet; the row-index ticks carry no meaning
ax.set_title('Positive Probability Difference Between Human Labeling and SVC, Negative Tweets', size=14)
fig.savefig('Desktop/Capstone_Figs/barplot_neg.png')


In [None]:
# Theme and figure size must be configured BEFORE the figure is created so
# that they apply to this figure and not only to later ones.
sns.set(rc={"figure.figsize":(10, 10)})
sns.set_theme(style="white", palette=None)
fig, ax = plt.subplots()

# One color per bar: navy where the human share exceeds the model probability,
# firebrick otherwise.
pos_palette = pos_only['Positive/Neutral Prediction Difference'].apply(lambda x : 'navy' if x > 0 else 'firebrick').values

# ax=ax is passed explicitly (as in the negative-tweet plot) instead of relying
# on the implicit "current axes".
sns.barplot(x=pos_only.index.values, y='Positive/Neutral Prediction Difference', data=pos_only, palette = pos_palette,
            ax=ax)
ax.set_title('Positive Probability Difference Between Human Labeling and SVC, Positive Tweets', size=14)
# Axis label typo fixed ('Positivee' -> 'Positive').
ax.set_ylabel('<-Human Labeled Less Positive             Human Labeled More Positive->', size =14)
ax.get_xaxis().set_ticks([])   # one bar per tweet; the row-index ticks carry no meaning
fig.savefig('Desktop/Capstone_Figs/barplot_pos.png')


In [None]:
# Theme and figure size must be configured BEFORE the figure is created;
# previously this figure inherited whatever size an earlier cell had set and
# the (9, 7) size only applied to the figures that followed.
sns.set(rc={"figure.figsize":(9, 7)})
sns.set_theme(style="white", palette=None)
fig, ax = plt.subplots()

probability_ticks = [0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1.0]
sns.kdeplot(data=training_preds_df, x='PositiveNeutral_Probability',
            fill=True, alpha=.2, bw_adjust=.5, color='navy', ax=ax)
ax.set(title='Positive/Neutral Prediction Distribution, Training Tweets')
ax.set_xticks(probability_ticks)
ax.set_xticklabels(probability_ticks)
ax.set_xlabel('Positive or Neutral Probability')
fig.savefig('Desktop/Capstone_Figs/pos_prob_trainingtweets')

In [None]:
# Training tweets whose corpus label is negative (0).
is_training_negative = training_preds_df['True_Label'] == 0
training_preds_neg = training_preds_df.loc[is_training_negative]

In [None]:
# Density of the model's positive/neutral probability over the training
# tweets labeled negative.
probability_ticks = [0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1.0]

fig, ax = plt.subplots()
sns.kdeplot(
    data=training_preds_neg,
    x='PositiveNeutral_Probability',
    fill=True,
    alpha=.2,
    bw_adjust=.5,
    color='navy',
    ax=ax,
)
ax.set(title='Positive/Neutral Prediction Distribution, Training Negatives')
ax.set_xticks(probability_ticks)
ax.set_xticklabels(probability_ticks)
ax.set_xlabel('Positive or Neutral Probability')
fig.savefig('Desktop/Capstone_Figs/Pos_predicted_training_negs.png')

In [None]:
# Human-labeled tweets whose majority label is negative (0).
is_human_negative = responses['label'] == 0
responses_neg = responses.loc[is_human_negative]

In [None]:
# Density of the model's positive/neutral probability over all human-labeled tweets.
probability_ticks = [0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1.0]

fig, ax = plt.subplots()
sns.kdeplot(
    data=responses,
    x='SVC_pred_posneut',
    fill=True,
    alpha=.2,
    bw_adjust=.5,
    color='firebrick',
    ax=ax,
)
ax.set(title='Positive Prediction Distribution, Human-Labeled Tweets')
ax.set_xticks(probability_ticks)
ax.set_xticklabels(probability_ticks)
ax.set_xlabel('Positive or Neutral Probability')
fig.savefig('Desktop/Capstone_Figs/Pos_predicted_prob_human_labeled.png')

In [None]:
# Density of the model's positive/neutral probability over the human-labeled
# tweets whose majority label is negative.
probability_ticks = [0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1.0]

fig, ax = plt.subplots()
sns.kdeplot(
    data=responses_neg,
    x='SVC_pred_posneut',
    fill=True,
    alpha=.2,
    bw_adjust=.5,
    color='firebrick',
    ax=ax,
)
ax.set(title='Positive/Neutral Prediction Distribution, Human-Labeled Negatives')
ax.set_xticks(probability_ticks)
ax.set_xticklabels(probability_ticks)
ax.set_xlabel('Positive or Neutral Probability')
fig.savefig('Desktop/Capstone_Figs/Pos_predicted_prob_human_labeled_negs.png')