# Support vector machine

A support vector machine (SVM) is a supervised machine learning approach that works well for binary classification problems.

In [1]:
import pandas as pd

# Read the combined tweet CSV into a DataFrame; the trailing expression
# previews its first rows as the cell output.
tweets = pd.read_csv(
    'combined_final_14may.csv',
    sep=',',
)
tweets.head()

54093

#  Filter neutral tweets

Keep only the positive and negative tweets (neutral tweets are dropped) for the SVM model.

In [4]:
# Keep rows whose polarityNum is exactly +1.0 or -1.0; every other value is dropped.
tweets_posNeg = tweets[tweets.polarityNum.isin([1.0, -1.0])]

In [11]:
#import the required sklearn packages
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.svm import SVC, LinearSVC
# train_test_split is provided by sklearn.model_selection; the former
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score

In [6]:
# 80% train data, 10% validation data, 10% test data:
# hold out 20% of the rows first, then cut that hold-out in half.
train, validate_test = train_test_split(
    tweets_posNeg, test_size=0.2, random_state=1
)
validate, test = train_test_split(
    validate_test, test_size=0.5, random_state=1
)

# Features: the preprocessed tweet text of each split, as plain arrays.
X_train, X_validate, X_test = (
    part['text_preprocessed'].values for part in (train, validate, test)
)
# Labels: the polarityNum column of each split (left as pandas Series).
y_train, y_validate, y_test = (
    part['polarityNum'] for part in (train, validate, test)
)

# CountVectorizer to convert text into vectors

In [7]:
# Word-level bag-of-words counts over unigrams and bigrams.
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2))

# TFIDF vectorizer

In [18]:
# TF-IDF vectorizer: word unigrams, English stop words removed, sublinear TF, L2-normalised rows.
# min_df=1 keeps every term seen in the training text, which is what min_df=0 did;
# newer scikit-learn releases reject an integer min_df below 1.
v = TfidfVectorizer(
    norm='l2',
    min_df=1,
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
    analyzer='word',
    ngram_range=(1, 1),
)

# Fit once, on the training text only, and reuse the fitted vocabulary/IDF weights
# for the validation and test splits (the original fitted the vectorizer twice).
train_features = v.fit_transform(X_train)
train_features_model = v  # fit() returns the vectorizer itself; name kept for compatibility
validate_features = v.transform(X_validate)
test_features = v.transform(X_test)

In [13]:
# Stratified 5-fold cross-validation splitter, shuffled with a fixed seed.
kfolds = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=7,
)

# SVM with parameter tuning

In [14]:
# Linear SVM on CountVectorizer features, tuning C by cross-validated grid search.
import numpy as np

np.random.seed(1)

# The pipeline takes raw text: the vectorizer runs inside each CV fold.
pipeline_svm = make_pipeline(
    vectorizer,
    SVC(kernel="linear", class_weight="balanced", probability=True),
)

grid_svm = GridSearchCV(
    pipeline_svm,
    param_grid={'svc__C': [0.01, 0.1, 1, 5, 10]},
    scoring="f1_macro",
    cv=kfolds,
    n_jobs=-1,
    verbose=1,
)

grid_svm.fit(X_train, y_train)
# Only the last expression is displayed; the training score is computed but not shown.
grid_svm.score(X_train, y_train)
grid_svm.score(X_validate, y_validate)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed: 320.9min finished


0.8361425918446617

In [21]:
#with Tfidf vectorizer
# NOTE: this cell rebinds pipeline_svm and grid_svm, replacing the count-vectorizer search.
# The search fitted here consumes TF-IDF matrices (train_features / validate_features),
# not raw text.
import numpy as np
np.random.seed(1)

pipeline_svm = make_pipeline(SVC(probability=True, kernel="linear", class_weight="balanced"))

# make_pipeline names the step "svc", so the SVC's C has to be addressed as "svc__C".
# A bare "C" is not a parameter of the Pipeline and makes every fit fail with
# "Invalid parameter".
grid_svm = GridSearchCV(pipeline_svm,
                    param_grid = {'svc__C': [0.01, 0.1, 1,5, 10]},
                    cv = kfolds,
                    scoring="f1_macro",
                    verbose=1,
                    n_jobs=-1)

grid_svm.fit(train_features, y_train)
grid_svm.score(validate_features, y_validate)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


[Parallel(n_jobs=-1)]: Done  25 out of  25 | elapsed: 97.7min finished


0.8214707701955565

In [15]:
# Parameter setting selected by whichever grid search is currently bound to grid_svm.
grid_svm.best_params_

{'svc__C': 0.1}

In [16]:
# Cross-validated score of the selected setting (the searches above use scoring="f1_macro").
grid_svm.best_score_

0.8411454016175527

# Results on test dataset

In [17]:
def report_results(model, X, y):
    """Evaluate a fitted binary classifier and return its metrics as a dict.

    Args:
        model: fitted estimator exposing ``predict`` and ``predict_proba``.
        X: inputs in whatever form ``model`` accepts.
        y: true labels aligned with ``X``.

    Returns:
        dict with keys ``'auc'``, ``'f1'``, ``'acc'``, ``'precision'`` and ``'recall'``.
    """
    # The second predict_proba column is the score fed to ROC AUC.
    scores = model.predict_proba(X)[:, 1]
    labels = model.predict(X)

    return {
        'auc': roc_auc_score(y, scores),
        'f1': f1_score(y, labels),
        'acc': accuracy_score(y, labels),
        'precision': precision_score(y, labels),
        'recall': recall_score(y, labels),
    }

In [19]:
# Test-set metrics for the best estimator of the current grid search.
# NOTE(review): X_test is raw text, so grid_svm must be the count-vectorizer pipeline search here;
# the TF-IDF cell rebinds grid_svm to a search over pre-vectorized features — confirm cell order.
report_results(grid_svm.best_estimator_, X_test, y_test)

{'auc': 0.9182542594177818,
 'f1': 0.8399311531841652,
 'acc': 0.845,
 'precision': 0.8472222222222222,
 'recall': 0.8327645051194539}

In [52]:
# Predict labels for the TF-IDF test matrix.
# NOTE(review): test_features is already vectorized, so grid_svm must be the TF-IDF search
# (the one fitted on train_features) when this runs — confirm cell order.
pred = grid_svm.best_estimator_.predict(test_features)

# Demo

A few examples of SVM predictions using the CountVectorizer pipeline.

In [20]:
# Predicted label for one raw-text example (the grid search must wrap the text pipeline).
grid_svm.predict([" with samsung is always a great experience"])

array([1.])

In [23]:
# Second predict_proba column for the single input row; with labels -1.0/1.0 this is
# presumably the probability of the positive class — verify against grid_svm.classes_.
grid_svm.predict_proba([" with samsung is always a great experience"])[0][1]

0.89425705275774

In [30]:
# Predicted label for a second raw-text example.
grid_svm.predict([" i hate iphone"])

array([-1.])

In [29]:
# Second predict_proba column for the same example (first row, column index 1).
grid_svm.predict_proba([" i hate iphone"])[0][1]

0.18613973904431746

In [31]:
# Second predict_proba column for a longer, multi-sentence example (first row, column index 1).
grid_svm.predict_proba(["Any phone with a lot of gb are available, but I prefer Iphone! I know you will do the best of the best in whatever you choose"])[0][1]

0.9778101530414561