In [3]:
##Importing IMDB Dataset and cleaning reviews

#Importing libraries
import nltk
import pandas as pd
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

In [4]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\merhbaa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\merhbaa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
#Importing dataset and replacing labels with 0 and 1 for classification
df = pd.read_csv('IMDB Dataset.csv', encoding = 'Latin-1')
df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [7]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
49995,I thought this movie did a down right good job...,1
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0
49997,I am a Catholic taught in parochial elementary...,0
49998,I'm going to have to disagree with the previou...,0


In [8]:
df['review'][0]

"One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would say the main appeal of the show is due to the fa

In [9]:
#Defining stop_words and lemmatizer
stop_words = set(stopwords.words("english")) 
lemmatizer = WordNetLemmatizer()

In [10]:
#Removing the html strips
def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

In [11]:
#Defining clean_text function
def clean_text(text):
    text = strip_html(text)
    text = re.sub(r'[^A-Za-z0-9]+',' ',text)
    text = text.lower()
    text = [lemmatizer.lemmatize(token) for token in text.split(" ")]
    text = [lemmatizer.lemmatize(token, "v") for token in text]
    text = [word for word in text if not word in stop_words]
    text = " ".join(text)
    return text

In [12]:
#Creating new column for processed reviews
df['Processed_Reviews'] = df.review.apply(lambda x: clean_text(x))

In [13]:
df

Unnamed: 0,review,sentiment,Processed_Reviews
0,One of the other reviewers has mentioned that ...,1,one reviewer ha mention watch 1 oz episode hoo...
1,A wonderful little production. <br /><br />The...,1,wonderful little production film technique una...
2,I thought this was a wonderful way to spend ti...,1,think wa wonderful way spend time hot summer w...
3,Basically there's a family where a little boy ...,0,basically family little boy jake think zombie ...
4,"Petter Mattei's ""Love in the Time of Money"" is...",1,petter mattei love time money visually stun fi...
...,...,...,...
49995,I thought this movie did a down right good job...,1,think movie right good job creative original f...
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",0,bad plot bad dialogue bad act idiotic direct a...
49997,I am a Catholic taught in parochial elementary...,0,catholic teach parochial elementary school nun...
49998,I'm going to have to disagree with the previou...,0,go disagree previous comment side maltin one s...


In [14]:
df['Processed_Reviews'][0]

'one reviewer ha mention watch 1 oz episode hook right exactly happen first thing strike oz wa brutality unflinching scene violence set right word go trust show faint hearted timid show pull punch regard drug sex violence hardcore classic use word call oz nickname give oswald maximum security state penitentary focus mainly emerald city experimental section prison cell glass front face inwards privacy high agenda em city home many aryan muslim gangsta latino christian italian irish scuffle death stare dodgy deal shady agreement never far away would say main appeal show due fact go show dare forget pretty picture paint mainstream audience forget charm forget romance oz mess around first episode ever saw strike nasty wa surreal say wa ready watch develop taste oz get accustom high level graphic violence violence injustice crook guard sell nickel inmate kill order get away well mannered middle class inmate turn prison bitch due lack street skill prison experience watch oz may become comfor

# Training

In [15]:
##Deploying SVM model on available data

#Importing libraries
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pickle

In [16]:
#Defining input and target variable
x = df['Processed_Reviews']
y = df['sentiment']

In [22]:
#Training and splitting
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [23]:
#Vectorization and Bag of words method with default parameters
count_vect = CountVectorizer().fit(df['Processed_Reviews'].values.astype('U'))
bow_train = count_vect.transform(X_train.values.astype('U'))
bow_test = count_vect.transform(X_test.values.astype('U'))

In [30]:
bow_train.shape

(40000, 83866)

In [31]:
#instantiate the model (using the default parameters)
SVM = SVC()

In [32]:
# fit the model with pre-processed data
SVM.fit(bow_train, y_train)

SVC()

In [33]:
#perform classification and prediction on samples in tf_test
predicted_SVM = SVM.predict(bow_test)
print(classification_report(y_test, predicted_SVM))

              precision    recall  f1-score   support

           0       0.89      0.85      0.87      5035
           1       0.86      0.89      0.87      4965

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



# gridSearch

In [12]:
#Importing libraries
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score

In [13]:
#Creating a Pipeline
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('SVM', SVC())
])

In [22]:
#Defining hyperparameters
parameters = {
    'vect__max_df':[0.3,0.4,0.5,0.6,0.7],
    'vect__ngram_range':  [(1,1), (1,2), (1,3)],
    'SVM__kernel': ['poly', 'rbf', 'sigmoid'],
    'SVM__C': [50, 10, 1.0]}

In [23]:
# define grid search
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
grid_search = GridSearchCV(pipeline, param_grid=parameters, refit = True, verbose = 3)
grid_result = grid_search.fit(df.loc[:7000, 'Processed_Reviews'].values.astype('U'), df.loc[:7000, 'sentiment'].values.astype('U'))

Fitting 5 folds for each of 135 candidates, totalling 675 fits
[CV 1/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.3, vect__ngram_range=(1, 1);, score=0.770 total time= 1.0min
[CV 2/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.3, vect__ngram_range=(1, 1);, score=0.793 total time= 1.0min
[CV 3/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.3, vect__ngram_range=(1, 1);, score=0.731 total time= 1.0min
[CV 4/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.3, vect__ngram_range=(1, 1);, score=0.791 total time= 1.0min
[CV 5/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.3, vect__ngram_range=(1, 1);, score=0.764 total time=  51.4s
[CV 1/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.3, vect__ngram_range=(1, 2);, score=0.652 total time= 1.8min
[CV 2/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.3, vect__ngram_range=(1, 2);, score=0.676 total time= 1.6min
[CV 3/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.3, vect__ngram_range=(1, 2);, score=0.630 total time= 

[CV 5/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.7, vect__ngram_range=(1, 2);, score=0.713 total time= 1.9min
[CV 1/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.7, vect__ngram_range=(1, 3);, score=0.680 total time= 2.5min
[CV 2/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.7, vect__ngram_range=(1, 3);, score=0.686 total time= 2.4min
[CV 3/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.7, vect__ngram_range=(1, 3);, score=0.647 total time= 2.4min
[CV 4/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.7, vect__ngram_range=(1, 3);, score=0.699 total time= 2.4min
[CV 5/5] END SVM__C=50, SVM__kernel=poly, vect__max_df=0.7, vect__ngram_range=(1, 3);, score=0.666 total time= 2.5min
[CV 1/5] END SVM__C=50, SVM__kernel=rbf, vect__max_df=0.3, vect__ngram_range=(1, 1);, score=0.851 total time=  49.7s
[CV 2/5] END SVM__C=50, SVM__kernel=rbf, vect__max_df=0.3, vect__ngram_range=(1, 1);, score=0.839 total time=  49.3s
[CV 3/5] END SVM__C=50, SVM__kernel=rbf, vect__max_df=0.3,

[CV 5/5] END SVM__C=50, SVM__kernel=rbf, vect__max_df=0.7, vect__ngram_range=(1, 1);, score=0.850 total time=  53.6s
[CV 1/5] END SVM__C=50, SVM__kernel=rbf, vect__max_df=0.7, vect__ngram_range=(1, 2);, score=0.859 total time= 1.6min
[CV 2/5] END SVM__C=50, SVM__kernel=rbf, vect__max_df=0.7, vect__ngram_range=(1, 2);, score=0.861 total time= 1.6min
[CV 3/5] END SVM__C=50, SVM__kernel=rbf, vect__max_df=0.7, vect__ngram_range=(1, 2);, score=0.854 total time= 1.6min
[CV 4/5] END SVM__C=50, SVM__kernel=rbf, vect__max_df=0.7, vect__ngram_range=(1, 2);, score=0.876 total time= 1.6min
[CV 5/5] END SVM__C=50, SVM__kernel=rbf, vect__max_df=0.7, vect__ngram_range=(1, 2);, score=0.858 total time= 1.6min
[CV 1/5] END SVM__C=50, SVM__kernel=rbf, vect__max_df=0.7, vect__ngram_range=(1, 3);, score=0.864 total time= 2.1min
[CV 2/5] END SVM__C=50, SVM__kernel=rbf, vect__max_df=0.7, vect__ngram_range=(1, 3);, score=0.866 total time= 2.1min
[CV 3/5] END SVM__C=50, SVM__kernel=rbf, vect__max_df=0.7, vect_

[CV 4/5] END SVM__C=50, SVM__kernel=sigmoid, vect__max_df=0.6, vect__ngram_range=(1, 3);, score=0.697 total time=  47.2s
[CV 5/5] END SVM__C=50, SVM__kernel=sigmoid, vect__max_df=0.6, vect__ngram_range=(1, 3);, score=0.721 total time=  47.3s
[CV 1/5] END SVM__C=50, SVM__kernel=sigmoid, vect__max_df=0.7, vect__ngram_range=(1, 1);, score=0.664 total time=  21.7s
[CV 2/5] END SVM__C=50, SVM__kernel=sigmoid, vect__max_df=0.7, vect__ngram_range=(1, 1);, score=0.666 total time=  21.6s
[CV 3/5] END SVM__C=50, SVM__kernel=sigmoid, vect__max_df=0.7, vect__ngram_range=(1, 1);, score=0.695 total time=  21.8s
[CV 4/5] END SVM__C=50, SVM__kernel=sigmoid, vect__max_df=0.7, vect__ngram_range=(1, 1);, score=0.664 total time=  21.4s
[CV 5/5] END SVM__C=50, SVM__kernel=sigmoid, vect__max_df=0.7, vect__ngram_range=(1, 1);, score=0.673 total time=  21.3s
[CV 1/5] END SVM__C=50, SVM__kernel=sigmoid, vect__max_df=0.7, vect__ngram_range=(1, 2);, score=0.692 total time=  36.9s
[CV 2/5] END SVM__C=50, SVM__ker

[CV 3/5] END SVM__C=10, SVM__kernel=poly, vect__max_df=0.6, vect__ngram_range=(1, 2);, score=0.724 total time= 1.7min
[CV 4/5] END SVM__C=10, SVM__kernel=poly, vect__max_df=0.6, vect__ngram_range=(1, 2);, score=0.791 total time= 1.8min
[CV 5/5] END SVM__C=10, SVM__kernel=poly, vect__max_df=0.6, vect__ngram_range=(1, 2);, score=0.763 total time= 1.6min
[CV 1/5] END SVM__C=10, SVM__kernel=poly, vect__max_df=0.6, vect__ngram_range=(1, 3);, score=0.669 total time= 2.1min
[CV 2/5] END SVM__C=10, SVM__kernel=poly, vect__max_df=0.6, vect__ngram_range=(1, 3);, score=0.735 total time= 2.1min
[CV 3/5] END SVM__C=10, SVM__kernel=poly, vect__max_df=0.6, vect__ngram_range=(1, 3);, score=0.691 total time= 2.3min
[CV 4/5] END SVM__C=10, SVM__kernel=poly, vect__max_df=0.6, vect__ngram_range=(1, 3);, score=0.753 total time= 2.2min
[CV 5/5] END SVM__C=10, SVM__kernel=poly, vect__max_df=0.6, vect__ngram_range=(1, 3);, score=0.714 total time= 2.3min
[CV 1/5] END SVM__C=10, SVM__kernel=poly, vect__max_df=0

[CV 3/5] END SVM__C=10, SVM__kernel=rbf, vect__max_df=0.6, vect__ngram_range=(1, 1);, score=0.834 total time=  52.6s
[CV 4/5] END SVM__C=10, SVM__kernel=rbf, vect__max_df=0.6, vect__ngram_range=(1, 1);, score=0.859 total time=  52.6s
[CV 5/5] END SVM__C=10, SVM__kernel=rbf, vect__max_df=0.6, vect__ngram_range=(1, 1);, score=0.846 total time=  53.1s
[CV 1/5] END SVM__C=10, SVM__kernel=rbf, vect__max_df=0.6, vect__ngram_range=(1, 2);, score=0.859 total time= 1.6min
[CV 2/5] END SVM__C=10, SVM__kernel=rbf, vect__max_df=0.6, vect__ngram_range=(1, 2);, score=0.862 total time= 1.6min
[CV 3/5] END SVM__C=10, SVM__kernel=rbf, vect__max_df=0.6, vect__ngram_range=(1, 2);, score=0.848 total time= 1.6min
[CV 4/5] END SVM__C=10, SVM__kernel=rbf, vect__max_df=0.6, vect__ngram_range=(1, 2);, score=0.875 total time= 1.6min
[CV 5/5] END SVM__C=10, SVM__kernel=rbf, vect__max_df=0.6, vect__ngram_range=(1, 2);, score=0.858 total time= 1.6min
[CV 1/5] END SVM__C=10, SVM__kernel=rbf, vect__max_df=0.6, vect_

[CV 2/5] END SVM__C=10, SVM__kernel=sigmoid, vect__max_df=0.5, vect__ngram_range=(1, 3);, score=0.779 total time= 1.0min
[CV 3/5] END SVM__C=10, SVM__kernel=sigmoid, vect__max_df=0.5, vect__ngram_range=(1, 3);, score=0.799 total time= 1.0min
[CV 4/5] END SVM__C=10, SVM__kernel=sigmoid, vect__max_df=0.5, vect__ngram_range=(1, 3);, score=0.784 total time= 1.0min
[CV 5/5] END SVM__C=10, SVM__kernel=sigmoid, vect__max_df=0.5, vect__ngram_range=(1, 3);, score=0.790 total time= 1.1min
[CV 1/5] END SVM__C=10, SVM__kernel=sigmoid, vect__max_df=0.6, vect__ngram_range=(1, 1);, score=0.683 total time=  21.5s
[CV 2/5] END SVM__C=10, SVM__kernel=sigmoid, vect__max_df=0.6, vect__ngram_range=(1, 1);, score=0.684 total time=  21.4s
[CV 3/5] END SVM__C=10, SVM__kernel=sigmoid, vect__max_df=0.6, vect__ngram_range=(1, 1);, score=0.696 total time=  21.5s
[CV 4/5] END SVM__C=10, SVM__kernel=sigmoid, vect__max_df=0.6, vect__ngram_range=(1, 1);, score=0.681 total time=  21.3s
[CV 5/5] END SVM__C=10, SVM__ker

[CV 1/5] END SVM__C=1.0, SVM__kernel=poly, vect__max_df=0.5, vect__ngram_range=(1, 2);, score=0.667 total time= 1.6min
[CV 2/5] END SVM__C=1.0, SVM__kernel=poly, vect__max_df=0.5, vect__ngram_range=(1, 2);, score=0.621 total time= 1.6min
[CV 3/5] END SVM__C=1.0, SVM__kernel=poly, vect__max_df=0.5, vect__ngram_range=(1, 2);, score=0.676 total time= 1.6min
[CV 4/5] END SVM__C=1.0, SVM__kernel=poly, vect__max_df=0.5, vect__ngram_range=(1, 2);, score=0.592 total time= 1.6min
[CV 5/5] END SVM__C=1.0, SVM__kernel=poly, vect__max_df=0.5, vect__ngram_range=(1, 2);, score=0.747 total time= 1.6min
[CV 1/5] END SVM__C=1.0, SVM__kernel=poly, vect__max_df=0.5, vect__ngram_range=(1, 3);, score=0.673 total time= 2.1min
[CV 2/5] END SVM__C=1.0, SVM__kernel=poly, vect__max_df=0.5, vect__ngram_range=(1, 3);, score=0.545 total time= 2.2min
[CV 3/5] END SVM__C=1.0, SVM__kernel=poly, vect__max_df=0.5, vect__ngram_range=(1, 3);, score=0.571 total time= 2.2min
[CV 4/5] END SVM__C=1.0, SVM__kernel=poly, vect_

[CV 1/5] END SVM__C=1.0, SVM__kernel=rbf, vect__max_df=0.5, vect__ngram_range=(1, 1);, score=0.850 total time=  45.6s
[CV 2/5] END SVM__C=1.0, SVM__kernel=rbf, vect__max_df=0.5, vect__ngram_range=(1, 1);, score=0.853 total time=  45.1s
[CV 3/5] END SVM__C=1.0, SVM__kernel=rbf, vect__max_df=0.5, vect__ngram_range=(1, 1);, score=0.839 total time=  45.2s
[CV 4/5] END SVM__C=1.0, SVM__kernel=rbf, vect__max_df=0.5, vect__ngram_range=(1, 1);, score=0.853 total time=  45.1s
[CV 5/5] END SVM__C=1.0, SVM__kernel=rbf, vect__max_df=0.5, vect__ngram_range=(1, 1);, score=0.854 total time=  45.5s
[CV 1/5] END SVM__C=1.0, SVM__kernel=rbf, vect__max_df=0.5, vect__ngram_range=(1, 2);, score=0.853 total time= 1.5min
[CV 2/5] END SVM__C=1.0, SVM__kernel=rbf, vect__max_df=0.5, vect__ngram_range=(1, 2);, score=0.858 total time= 1.5min
[CV 3/5] END SVM__C=1.0, SVM__kernel=rbf, vect__max_df=0.5, vect__ngram_range=(1, 2);, score=0.841 total time= 1.5min
[CV 4/5] END SVM__C=1.0, SVM__kernel=rbf, vect__max_df=0

[CV 5/5] END SVM__C=1.0, SVM__kernel=sigmoid, vect__max_df=0.4, vect__ngram_range=(1, 2);, score=0.864 total time= 1.0min
[CV 1/5] END SVM__C=1.0, SVM__kernel=sigmoid, vect__max_df=0.4, vect__ngram_range=(1, 3);, score=0.864 total time= 1.5min
[CV 2/5] END SVM__C=1.0, SVM__kernel=sigmoid, vect__max_df=0.4, vect__ngram_range=(1, 3);, score=0.869 total time= 1.5min
[CV 3/5] END SVM__C=1.0, SVM__kernel=sigmoid, vect__max_df=0.4, vect__ngram_range=(1, 3);, score=0.859 total time= 1.5min
[CV 4/5] END SVM__C=1.0, SVM__kernel=sigmoid, vect__max_df=0.4, vect__ngram_range=(1, 3);, score=0.868 total time= 1.5min
[CV 5/5] END SVM__C=1.0, SVM__kernel=sigmoid, vect__max_df=0.4, vect__ngram_range=(1, 3);, score=0.869 total time= 1.5min
[CV 1/5] END SVM__C=1.0, SVM__kernel=sigmoid, vect__max_df=0.5, vect__ngram_range=(1, 1);, score=0.794 total time=  28.5s
[CV 2/5] END SVM__C=1.0, SVM__kernel=sigmoid, vect__max_df=0.5, vect__ngram_range=(1, 1);, score=0.790 total time=  28.6s
[CV 3/5] END SVM__C=1.0,

In [24]:
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.866162 using {'SVM__C': 1.0, 'SVM__kernel': 'sigmoid', 'vect__max_df': 0.5, 'vect__ngram_range': (1, 3)}
0.769890 (0.022617) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.3, 'vect__ngram_range': (1, 1)}
0.653335 (0.017755) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.3, 'vect__ngram_range': (1, 2)}
0.582632 (0.013588) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.3, 'vect__ngram_range': (1, 3)}
0.771604 (0.019507) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.4, 'vect__ngram_range': (1, 1)}
0.682617 (0.016389) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.4, 'vect__ngram_range': (1, 2)}
0.611485 (0.013326) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.4, 'vect__ngram_range': (1, 3)}
0.777460 (0.019132) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.5, 'vect__ngram_range': (1, 1)}
0.699757 (0.022108) with: {'SVM__C': 50, 'SVM__kernel': 'poly', 'vect__max_df': 0.5, 'vec

# trainingFinal

In [13]:
#Defining input and target variable
X = df['Processed_Reviews']
y = df['sentiment']

In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [21]:
#Vectorization and Bag of words method with default parameters
count_vect = CountVectorizer(max_df = 0.6, ngram_range = (1, 2)).fit(df['Processed_Reviews'].values.astype('U'))
bow_train = count_vect.transform(X_train.values.astype('U'))
bow_test = count_vect.transform(X_test.values.astype('U'))

In [23]:
SVM = SVC(C=50, kernel='rbf')

In [24]:
SVM.fit(bow_train, y_train)

SVC(C=50)

In [25]:
#perform classification and prediction on samples in tf_test
predicted_SVM = SVM.predict(bow_test)
print(classification_report(y_test, predicted_SVM))

              precision    recall  f1-score   support

           0       0.90      0.87      0.88      5035
           1       0.87      0.90      0.89      4965

    accuracy                           0.88     10000
   macro avg       0.88      0.88      0.88     10000
weighted avg       0.88      0.88      0.88     10000



In [30]:
#saving the model
pickle.dump(SVM, open('SVM_model', 'wb'))

In [19]:
pickle.dump(count_vect, open('count_vect', 'wb'))

In [None]:
#loading the model
#loaded_SVM = pickle.load(open('SVM_model', 'rb'))

# Test

In [26]:
test = ['useless']
test = count_vect.transform(test).toarray()
print(SVM.predict(test))

[0]


In [51]:
test = ['good movie']
test = count_vect.transform(test).toarray()
print(loaded_SVM.predict(test))

[1]
