# Logistic Regression Classifier

In [1]:
import pandas as pd

### Import data

In [2]:
data = pd.read_csv('../data_cleaning/preprocessed_data.csv')
data = data[['text_lemmatized','sentiment']]
data = data.dropna()
data

Unnamed: 0,text_lemmatized,sentiment
0,httptwitpiccom 2y1zl awww bummer shoulda got d...,-1
1,upset cannot update facebook texting might cry...,-1
2,dived many time ball managed save 50 rest go b...,-1
3,whole body feel itchy like fire,-1
4,behaving mad cannot see,-1
...,...,...
1599995,woke school best feeling ever,1
1599996,thewdbcom cool hear old walt interview ♫ httpb...,1
1599997,ready mojo makeover ask detail,1
1599998,happy 38th birthday boo alll time tupac amaru ...,1


In [3]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import nltk

### TF-IDF

In [4]:
X_train, X_test, y_train, y_test = train_test_split(data["text_lemmatized"],data["sentiment"],test_size=0.2,shuffle=True)

In [5]:
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

### Logistic Regression Classifier

In [6]:
lr = LogisticRegression()
lr.fit(X_train_vectors_tfidf,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [7]:
y_predict = lr.predict(X_test_vectors_tfidf)
y_prob = lr.predict_proba(X_test_vectors_tfidf)[:,1]
print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))
 
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)

              precision    recall  f1-score   support

          -1       0.79      0.76      0.78    158972
           1       0.77      0.80      0.78    159638

    accuracy                           0.78    318610
   macro avg       0.78      0.78      0.78    318610
weighted avg       0.78      0.78      0.78    318610

Confusion Matrix: [[121205  37767]
 [ 32437 127201]]
AUC: 0.8595719343570171


### Test Case

In [8]:
test_cases = X_test.to_frame()
test_cases

Unnamed: 0,text_lemmatized
1539625,image cultura website almost done almost offishul
649885,either crowd chanted horrible thing choreograp...
147016,could send dm got coworker always way need sty...
1216859,oo go holiday
1516576,look like nice one thanks letting u know work ...
...,...
1559519,someone made happy today going try keep happy
1043907,watcha gonna teach
242967,enormous headache
1161467,checking multiply site httpnick 0lemultiplycom


In [9]:
test_tfidf = tfidf_vectorizer.transform(test_cases['text_lemmatized'])
prediction = lr.predict(test_tfidf)
prediction_prob = lr._predict_proba_lr(test_tfidf)[:,1]

test_cases['predict_prob'] = prediction_prob
test_cases['predicted sentiment'] = prediction
test_cases

Unnamed: 0,text_lemmatized,predict_prob,predicted sentiment
1539625,image cultura website almost done almost offishul,0.620718,1
649885,either crowd chanted horrible thing choreograp...,0.046117,-1
147016,could send dm got coworker always way need sty...,0.544140,1
1216859,oo go holiday,0.534693,1
1516576,look like nice one thanks letting u know work ...,0.889024,1
...,...,...,...
1559519,someone made happy today going try keep happy,0.933578,1
1043907,watcha gonna teach,0.779306,1
242967,enormous headache,0.059894,-1
1161467,checking multiply site httpnick 0lemultiplycom,0.893399,1


In [10]:
final_prediction = []
for i in prediction_prob:
    if i > 0.65:
        final_prediction.append(1)
    elif i < 0.45:
        final_prediction.append(-1)
    else:
        final_prediction.append(0)

In [11]:
final_prediction_arr = np.array(final_prediction)
test_cases['predicted sentiment'] = final_prediction_arr
test_cases

Unnamed: 0,text_lemmatized,predict_prob,predicted sentiment
1539625,image cultura website almost done almost offishul,0.620718,0
649885,either crowd chanted horrible thing choreograp...,0.046117,-1
147016,could send dm got coworker always way need sty...,0.544140,0
1216859,oo go holiday,0.534693,0
1516576,look like nice one thanks letting u know work ...,0.889024,1
...,...,...,...
1559519,someone made happy today going try keep happy,0.933578,1
1043907,watcha gonna teach,0.779306,1
242967,enormous headache,0.059894,-1
1161467,checking multiply site httpnick 0lemultiplycom,0.893399,1


### Test on single text

In [12]:
text = "things to bring when moving to another country"
text_array = []
text_array.append(text)
text_tfidf = tfidf_vectorizer.transform(text_array)

In [13]:
test = pd.DataFrame(text_array, columns=['text'])

In [14]:
y = lr.predict(text_tfidf)
y_prob = lr.predict_proba(text_tfidf)[:,1]
for i in y_prob:
    if i > 0.65:
        test['pred sentiment'] = 1
    elif i < 0.45:
        test['pred sentiment'] = -1
    else:
        test['pred sentiment'] = 0

test['predict prob'] = y_prob

In [15]:
test

Unnamed: 0,text,pred sentiment,predict prob
0,things to bring when moving to another country,0,0.509199
