In [1]:
import os
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score, confusion_matrix

import nltk
from nltk.classify import SklearnClassifier

import matplotlib.pyplot as plt
'exec(%matplotlib inline)'
from subprocess import check_output

In [2]:
data = pd.read_csv(os.getcwd()+"/input/Sentiment_GOP.csv", encoding = "ISO-8859-1")
data = data[['text','sentiment']]
data = data[data.sentiment!="Neutral"]

X_data = data['text']
Y_data = data['sentiment']

In [3]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

stopwords_set = set(stopwords.words("english")) 
tknzr = TweetTokenizer()
stemmer = SnowballStemmer("english")


In [4]:
X_clean = []
for txt in X_data:
    words = [word.lower() for word in tknzr.tokenize(txt) if len(word)>=3] 
    words_cleaned = [word for word in words
        if 'http' not in word
        and not word.startswith('@')
        and not word.startswith('#')
        and word != 'rt']  
    words_wo_stop = [word for word in words_cleaned if not word in stopwords_set]  
#     tags = []
#     for word in words_wo_stop:
#         tags.extend(nltk.pos_tag([word]))
#     tags_root = [(stemmer.stem(tag[0]), tag[1]) for tag in tags]
    words_root = [stemmer.stem(word) for word in words]
    txt_clean = ""
    for word in words_root:
        txt_clean += (word + " ")
    X_clean.append(txt_clean)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer 

X_train, X_test, y_train, y_test = train_test_split(X_clean, Y_data, test_size=0.1)
 
vectorizer=TfidfVectorizer(use_idf=True)
vectorizer.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [6]:
example=vectorizer.fit_transform(X_train)[0]
 
# place tf-idf values in a pandas data frame
df = pd.DataFrame(example.T.todense(), index=vectorizer.get_feature_names(), columns=["tfidf"])
df.sort_values(by=["tfidf"],ascending=False)

Unnamed: 0,tfidf
join,0.473308
isi,0.438328
citizen,0.256360
warrant,0.256360
pamelagel,0.253492
sign,0.253492
death,0.238345
lose,0.223289
you,0.201769
ted,0.176781


In [7]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(C=2.0, class_weight=None, dual=True, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)
classifier.fit(vectorizer.transform(X_train), y_train)

LogisticRegression(C=2.0, class_weight=None, dual=True, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [8]:
y_pred = classifier.predict(vectorizer.transform(X_test))
cm = confusion_matrix(y_test, y_pred, labels=["Positive", "Negative"])
tp, fn, fp, tn = cm.ravel()

In [9]:
print(cm)
print(tn, fp, fn, tp)

[[128 111]
 [ 24 810]]
810 24 111 128


In [10]:
print(accuracy_score(y_test, y_pred))
print(recall_score(y_test, y_pred, labels=["Positive", "Negative"], pos_label = "Positive"))
print(precision_score(y_test, y_pred, labels=["Positive", "Negative"], pos_label = "Positive"))
print(f1_score(y_test, y_pred, labels=["Positive", "Negative"], pos_label = "Positive"))

0.8741845293569431
0.5355648535564853
0.8421052631578947
0.6547314578005115


In [11]:
def accuracyScore(correct, total):
    return correct/total      #How likely that u answer right

def sensitivityScore(truePos, totalPos):
    return truePos/totalPos    #How likely detect positive when is positive
    
def specificityScore(trueNeg, totalNeg):
    return trueNeg/totalNeg   #How likely detect negative when is negative
    
def precisionScore(truePos, guessedPos):
    return truePos/guessedPos #How likely is positive when detect positive

def f1Score(sensitivity, precision):
    return 2*sensitivity*precision/(sensitivity+precision)  
    #weighted avg of recall and precision, useful if harm of falsePos and falseNeg differs


In [12]:
accuracy = accuracyScore(tp+tn, tp+tn+fp+fn)
sensitivity = sensitivityScore(tp,tp+fn)
specificity = specificityScore(tn, tn+fp)
precision = precisionScore(tp, tp+fp)
f1 = f1Score(sensitivity, precision)

print(accuracy)
print(sensitivity)
print(specificity)
print(precision)
print(f1)

0.8741845293569431
0.5355648535564853
0.9712230215827338
0.8421052631578947
0.6547314578005115
