# Support Vector Machines Classifier (SVM)

In [33]:
import pandas as pd

### Import data

In [34]:
# Load the preprocessed training tweets and discard every row that contains a
# missing value in any column, then display the resulting frame.
data = (
    pd.read_csv('../data_cleaning/preprocessed_train_data.csv')
    .dropna()
)
data

Unnamed: 0.1,Unnamed: 0,sentiment,text
0,0,-1,awww bummer shoulda got david carr third day
1,1,-1,upset update facebook texting might cry result...
2,2,-1,dived many times ball managed save 50 rest go ...
3,3,-1,whole body feels itchy like fire
4,4,-1,behaving mad see
...,...,...,...
1599995,1599995,1,woke school best feeling ever
1599996,1599996,1,thewdb com cool hear old walt interviews
1599997,1599997,1,ready mojo makeover ask details
1599998,1599998,1,happy 38th birthday boo alll time tupac amaru ...


In [35]:
import nltk
import numpy as np
from scipy.special import expit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

### TF-IDF

In [36]:
# Hold out 20% of the tweets for testing. The split is shuffled, so the seed is
# pinned (same value as the classifier's random_state) to make the split — and
# every metric computed from it — reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    data["text"],
    data["sentiment"],
    test_size=0.2,
    shuffle=True,
    random_state=0,
)

In [37]:
# Build TF-IDF features. The vectorizer is fitted on the training split only;
# the test split is transformed with that already-fitted vectorizer so no
# test-set statistics feed into the features.
tfidf_vectorizer = TfidfVectorizer(use_idf=True)
X_train_vectors_tfidf = tfidf_vectorizer.fit_transform(X_train) 
X_test_vectors_tfidf = tfidf_vectorizer.transform(X_test)

### Support Vector Machines (SVM) Classifier

#### Training

In [38]:
# Linear SVM classifier; random_state is fixed so repeated runs use the same seed.
clf = LinearSVC(random_state=0)

In [39]:
# Train on the TF-IDF features of the training split and its sentiment labels.
clf.fit(X_train_vectors_tfidf,y_train)

#### Results on the Evaluation Dataset (1,270 samples)

In [67]:
# Load the hand-labelled evaluation tweets. Rows with a missing text or label
# are dropped, mirroring the NaN guard applied to the training data: a NaN
# document makes TfidfVectorizer.transform raise, and a NaN label cannot be
# scored. Only the columns that are actually used downstream are checked.
test_cases = (
    pd.read_csv('../data_cleaning/preprocessed_eval_data.csv')
    .dropna(subset=['text', 'sentiment'])
)
test_cases

Unnamed: 0.1,Unnamed: 0,text,sentiment
0,0,move completely different location costs hundr...,0
1,1,started sharing information move another count...,1
2,2,parents made tough decision move family anothe...,-1
3,3,aye tryna head wild googles get visa move anot...,0
4,4,val going move another country cooking opinion...,-1
...,...,...,...
1265,1265,separation anxiety gavin bradley tracks experi...,1
1266,1266,crippling asian elder daughter feelings chokes...,0
1267,1267,phone dad last night asking job going amp dati...,-1
1268,1268,may qualify take advantage two possible tax ex...,0


In [68]:
# Vectorise the evaluation tweets with the vectorizer fitted on the training split.
test_tfidf = tfidf_vectorizer.transform(test_cases['text'])

# Pseudo-probability of the positive class: the signed SVM margin squashed
# through a logistic sigmoid. For a binary model this is the value the private
# helper `clf._predict_proba_lr(...)[:, 1]` produced, obtained here through the
# public `decision_function` API instead. The scores are NOT calibrated
# probabilities.
# NOTE(review): assumes a binary classifier (1-D decision_function output).
prediction_prob = expit(clf.decision_function(test_tfidf))

# Hard class labels predicted by the classifier.
prediction = clf.predict(test_tfidf)

In [69]:
# Map each positive-class score to a three-way label:
#   score > POSITIVE_THRESHOLD  ->  1  (positive)
#   score < NEGATIVE_THRESHOLD  -> -1  (negative)
#   otherwise                   ->  0  (neutral band around 0.5)
# Label 0 is produced only by this thresholding step.
POSITIVE_THRESHOLD = 0.55
NEGATIVE_THRESHOLD = 0.45

_scores = np.asarray(prediction_prob)
# np.select applies the first matching condition per element, which matches
# the if / elif / else ordering of a per-element loop.
final_prediction_arr = np.select(
    [_scores > POSITIVE_THRESHOLD, _scores < NEGATIVE_THRESHOLD],
    [1, -1],
    default=0,
)

# Plain-Python list view of the same labels.
final_prediction = final_prediction_arr.tolist()

In [70]:
# Ground-truth sentiment labels of the evaluation set as a NumPy array, in the
# same row order as the texts that were vectorised for prediction.
eval_ground_truth = test_cases['sentiment'].to_numpy()

In [71]:
# Per-class precision / recall / F1 for the thresholded three-way predictions
# on the evaluation set.
eval_report = classification_report(eval_ground_truth, final_prediction_arr)
print(eval_report)

# Confusion matrix for the same ground truth and predictions.
eval_confusion = confusion_matrix(eval_ground_truth, final_prediction_arr)
print('Confusion Matrix:\n', eval_confusion)

              precision    recall  f1-score   support

          -1       0.45      0.74      0.56       451
           0       0.62      0.39      0.48       608
           1       0.38      0.26      0.30       211

    accuracy                           0.49      1270
   macro avg       0.48      0.46      0.45      1270
weighted avg       0.52      0.49      0.48      1270

Confusion Matrix:
 [[333  90  28]
 [309 237  62]
 [103  54  54]]


### Results on Split Test Data

In [72]:
# The held-out split was already vectorised right after the vectorizer was
# fitted (X_test_vectors_tfidf); reuse it instead of transforming X_test again.
test_tfidf = X_test_vectors_tfidf

# Pseudo-probability of the positive class: the signed SVM margin squashed
# through a logistic sigmoid. For a binary model this is the value the private
# helper `clf._predict_proba_lr(...)[:, 1]` produced, obtained here through the
# public `decision_function` API instead. The scores are NOT calibrated
# probabilities.
# NOTE(review): assumes a binary classifier (1-D decision_function output).
prediction_prob = expit(clf.decision_function(test_tfidf))

# Hard class labels predicted by the classifier; these are what the report
# for the split test data is computed from.
prediction = clf.predict(test_tfidf)

In [73]:
# Per-class precision / recall / F1 of the classifier's hard predictions on the
# held-out test split.
split_report = classification_report(y_test, prediction)
print(split_report)

# Confusion matrix for the same ground truth and predictions.
split_confusion = confusion_matrix(y_test, prediction)
print('Confusion Matrix:\n', split_confusion)

              precision    recall  f1-score   support

          -1       0.78      0.76      0.77    159730
           1       0.76      0.79      0.78    158736

    accuracy                           0.77    318466
   macro avg       0.77      0.77      0.77    318466
weighted avg       0.77      0.77      0.77    318466

Confusion Matrix:
 [[121184  38546]
 [ 33488 125248]]
