In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import numpy as np

In [2]:
file_path = '../Sup/genres.csv'
dataset = pd.read_csv(file_path)

dataset.head()

Unnamed: 0,review,genre,sentiment
0,Confidently directed dark brooding and pack...,Action,Positive
1,Nolans have given the character this great man...,Action,Positive
2,there is action explosions and stunt work but...,Action,Positive
3,The acting story atmosphere and actions scenes...,Action,Positive
4,I recommend it to anyone who likes action movi...,Action,Positive


In [3]:
dataset['binary'] = dataset['sentiment'].apply(lambda x: 1 if x=='Positive' else 0)

In [4]:
#reviews = dataset['review'].apply(lambda x:BeautifulSoup(x,'lxml').text)

In [5]:
data = dataset.loc[:,['review','binary']]

In [6]:
from sklearn.model_selection import train_test_split
train,test = train_test_split(data,test_size=0.1,random_state=1)
X_train = train['review'].values
X_test = test['review'].values
y_train = train['binary']
y_test = test['binary']

In [7]:
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

en_stopwords = set(stopwords.words("english"))

from sklearn.feature_extraction.text import CountVectorizer

vectorize = CountVectorizer(analyzer='word',
                            tokenizer=word_tokenize,
                            lowercase=True,
                            ngram_range=(1,1),
                            stop_words=en_stopwords)



In [8]:
from sklearn.model_selection import StratifiedKFold

kFold = StratifiedKFold(n_splits=5,shuffle=True,random_state=1)

In [9]:
from sklearn.pipeline import make_pipeline
from sklearn.svm import SVC
np.random.seed(42);

pipeline_svm = make_pipeline(vectorize,
                             SVC(probability=True,
                                 kernel='linear',
                                 class_weight='balanced'))


In [10]:
from sklearn.model_selection import GridSearchCV

svm_grid = GridSearchCV(pipeline_svm,
                        param_grid= {'svc__C' : [0.01,0.1,1]},
                        cv=kFold,
                        scoring='roc_auc',
                        verbose=1,
                        n_jobs=1)

In [11]:
svm_grid.fit(X_train,y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
  'stop_words.' % sorted(inconsistent))
[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:    5.3s finished
  'stop_words.' % sorted(inconsistent))


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=1, shuffle=True),
             estimator=Pipeline(steps=[('countvectorizer',
                                        CountVectorizer(stop_words={'a',
                                                                    'about',
                                                                    'above',
                                                                    'after',
                                                                    'again',
                                                                    'against',
                                                                    'ain',
                                                                    'all', 'am',
                                                                    'an', 'and',
                                                                    'any',
                                                                    'are',
                   

In [12]:
svm_grid.score(X_test,y_test)

0.9243697478991597

In [13]:
print(svm_grid.best_index_)
print(svm_grid.best_params_)
print(svm_grid.best_score_)

model = svm_grid.best_estimator_

1
{'svc__C': 0.1}
0.9128345520480352


In [14]:
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import confusion_matrix, roc_auc_score, recall_score, precision_score

pred_proba = model.predict_proba(X_test)[:,1]
pred = model.predict(X_test)

y = y_test
auc = roc_auc_score(y, pred_proba)
acc = accuracy_score(y, pred)
f1 = f1_score(y, pred)
prec = precision_score(y, pred)
rec = recall_score(y, pred)
result = {'auc': auc, 'f1': f1, 'acc': acc, 'precision': prec, 'recall': rec}
print(result)

{'auc': 0.923969587835134, 'f1': 0.86, 'acc': 0.86, 'precision': 0.8775510204081632, 'recall': 0.8431372549019608}


In [15]:
import pickle
with open('../Sup/sen_svm_model1.0.sav','wb') as pf:
    pickle.dump(svm_grid,pf)