In [11]:
import pandas as pd

### Import data

In [12]:
# Read the preprocessed tweets, keep only the two columns the model uses
# (lemmatised text and its sentiment label), and discard rows missing either.
data = (
    pd.read_csv('preprocessed_data.csv')
    .loc[:, ['text_lemmatized', 'sentiment']]
    .dropna()
)
data

Unnamed: 0,text_lemmatized,sentiment
0,switchfoot httptwitpiccom2y1zl awww bummer sho...,0
1,upset cannot update facebook texting might cry...,0
2,kenichan dived many time ball managed save 50 ...,0
3,whole body feel itchy like fire,0
4,nationwideclass behaving mad cannot see,0
...,...,...
1599995,woke school best feeling ever,1
1599996,thewdbcom cool hear old walt interview ♫ httpb...,1
1599997,ready mojo makeover ask detail,1
1599998,happy 38th birthday boo alll time tupac amaru ...,1


In [13]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np
import nltk

### TF-IDF

In [14]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the split, and every metric computed from it,
# is identical on each Restart & Run All. The value 0 matches the seed already
# passed to LinearSVC in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    data["text_lemmatized"],
    data["sentiment"],
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [15]:
# Build the TF-IDF vectoriser (use_idf=True is spelled out explicitly).
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
# Fit on the training text only and vectorise it in the same pass, so the
# test split does not contribute to what the vectoriser learns.
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
# Vectorise the held-out text with the already-fitted vectoriser (no re-fitting).
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

### Support Vector Machines (SVM) Classifier

#### Training

In [16]:
# Linear support-vector classifier; random_state is pinned to 0 so repeated
# runs construct the estimator with the same seed.
clf = LinearSVC(random_state=0)

In [17]:
# Train the classifier on the TF-IDF training matrix and its labels.
# As the cell's last expression, the fitted estimator is what the cell displays.
clf.fit(X=X_train_vectors_tfidf, y=y_train)

#### Results

In [18]:
# Evaluate on the held-out split: hard labels feed the classification report
# and confusion matrix, a continuous score feeds the ROC curve.
y_predict = clf.predict(X_test_vectors_tfidf)

# LinearSVC exposes no predict_proba. Use the public decision_function (signed
# margin) instead of the private `_predict_proba_lr` helper, which is not part
# of scikit-learn's documented API and may change without notice.
y_score = clf.decision_function(X_test_vectors_tfidf)
# Logistic squash of the margin into (0, 1), written in an overflow-safe form:
# exp(-log(1 + exp(-s))) == 1 / (1 + exp(-s)).
# This is a monotone rescaling, NOT a calibrated probability; it is intended to
# reproduce what `_predict_proba_lr(...)[:, 1]` returned for this two-class model.
y_prob = np.exp(-np.logaddexp(0, -y_score))

print(classification_report(y_test,y_predict))
print('Confusion Matrix:',confusion_matrix(y_test, y_predict))

# ROC/AUC depend only on how the scores rank the samples, so the squash above
# leaves the curve and the area unchanged relative to the raw margins.
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
roc_auc = auc(fpr, tpr)
print('AUC:', roc_auc)


              precision    recall  f1-score   support

           0       0.78      0.77      0.77    160053
           1       0.77      0.78      0.78    159873

    accuracy                           0.78    319926
   macro avg       0.78      0.78      0.78    319926
weighted avg       0.78      0.78      0.78    319926

Confusion Matrix: [[122836  37217]
 [ 34719 125154]]
AUC: 0.8554525310944979


#### Test case

In [19]:
# Turn the held-out text Series into a single-column DataFrame (original row
# index retained) so per-row predictions can be attached alongside the text.
test_cases = X_test.to_frame()
test_cases

Unnamed: 0,text_lemmatized
450663,riding next 4 day bummerrr camp httptweetsg
937593,sazchik try give people like ritalin strong st...
630808,missing corbear
734278,primevals cancelled apparently
580218,problogger paypal
...,...
231960,pleia2 really another os monitor product capab...
170757,dougiemcfly shit
1568653,beachballz awww thank nice
125193,interesting follow moment miserable


In [20]:
# Vectorise the test-case text with the vectoriser fitted on the training split.
test_tfidf = tfidf_vectorizer.transform(test_cases['text_lemmatized'])
prediction = clf.predict(test_tfidf)
# Use the public decision_function rather than the private `_predict_proba_lr`
# helper, which is not part of scikit-learn's documented API. The margin is
# squashed into (0, 1) with an overflow-safe logistic,
# exp(-log(1 + exp(-s))) == 1 / (1 + exp(-s)); values above 0.5 correspond to a
# positive margin. This is a confidence-like score, not a calibrated probability.
prediction_prob = np.exp(-np.logaddexp(0, -clf.decision_function(test_tfidf)))

# Attach the score and the hard label next to each tweet for inspection.
test_cases['predict_prob'] = prediction_prob
test_cases['predicted sentiment'] = prediction

test_cases


Unnamed: 0,text_lemmatized,predict_prob,predicted sentiment
450663,riding next 4 day bummerrr camp httptweetsg,0.375005,0
937593,sazchik try give people like ritalin strong st...,0.627238,1
630808,missing corbear,0.070768,0
734278,primevals cancelled apparently,0.097156,0
580218,problogger paypal,0.647017,1
...,...,...,...
231960,pleia2 really another os monitor product capab...,0.237853,0
170757,dougiemcfly shit,0.459269,0
1568653,beachballz awww thank nice,0.787808,1
125193,interesting follow moment miserable,0.509785,1
