In [1]:
import pandas as pd

### Import data

In [2]:
data = pd.read_csv('preprocessed_data.csv')
data = data[['text_lemmatized','sentiment']]
data = data.dropna()
data

Unnamed: 0,text_lemmatized,sentiment
0,switchfoot httptwitpiccom2y1zl awww bummer sho...,0
1,upset cannot update facebook texting might cry...,0
2,kenichan dived many time ball managed save 50 ...,0
3,whole body feel itchy like fire,0
4,nationwideclass behaving mad cannot see,0
...,...,...
1599995,woke school best feeling ever,1
1599996,thewdbcom cool hear old walt interview ♫ httpb...,1
1599997,ready mojo makeover ask detail,1
1599998,happy 38th birthday boo alll time tupac amaru ...,1


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import nltk

### TF-IDF

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data["text_lemmatized"],data["sentiment"],test_size=0.2,shuffle=True)

In [5]:
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

### Logistic Regression Classifier

In [6]:
lr = LogisticRegression()
lr.fit(X_train_vectors_tfidf,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
y_predict = lr.predict(X_test_vectors_tfidf)
y_prob = lr.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

              precision    recall  f1-score   support

           0       0.79      0.77      0.78    160141
           1       0.77      0.80      0.79    159785

    accuracy                           0.78    319926
   macro avg       0.78      0.78      0.78    319926
weighted avg       0.78      0.78      0.78    319926

Confusion Matrix: [[122858  37283]
 [ 31852 127933]]
AUC: 0.8632340906474503


### Test Case

In [8]:
test_cases = X_test.to_frame()
test_cases

Unnamed: 0,text_lemmatized
1396042,mikachu84 absolutely bad weekend thats lol
66973,obnoxiousacorns afraid twittered everything
1515888,well done braving 14 degree sognsvann nordmark...
364648,anyone like 1985 movie phenomenonback future a...
398961,homework
...,...
1560425,hello woke accompanying mum go hospital check ...
110326,halfacanyon two week tomorrow must write devol...
855147,ashsimpsonwentz gnight
1505661,lakers ot went exactly


In [9]:
test_tfidf = tfidf_vectorizer.transform(test_cases['text_lemmatized'])
prediction = lr.predict(test_tfidf)
prediction_prob = lr._predict_proba_lr(test_tfidf)[:,1]

test_cases['predict_prob'] = prediction_prob
test_cases['predicted sentiment'] = prediction

test_cases

Unnamed: 0,text_lemmatized,predict_prob,predicted sentiment
1396042,mikachu84 absolutely bad weekend thats lol,0.493538,0
66973,obnoxiousacorns afraid twittered everything,0.253162,0
1515888,well done braving 14 degree sognsvann nordmark...,0.684442,1
364648,anyone like 1985 movie phenomenonback future a...,0.879123,1
398961,homework,0.097314,0
...,...,...,...
1560425,hello woke accompanying mum go hospital check ...,0.568867,1
110326,halfacanyon two week tomorrow must write devol...,0.344660,0
855147,ashsimpsonwentz gnight,0.931916,1
1505661,lakers ot went exactly,0.676502,1


### Test on single text

In [152]:
text = "love the people"
text_array = []
text_array.append(text)
text_tfidf = tfidf_vectorizer.transform(text_array)

In [153]:
test = pd.DataFrame(text_array, columns=['text'])

In [154]:
y = lr.predict(text_tfidf)
y_prob = lr.predict_proba(text_tfidf)[:,1]
test['pred sentiment'] = y
test['predict prob'] = y_prob

In [155]:
test

Unnamed: 0,text,pred sentiment,predict prob
0,love the people,1,0.889488
