## Imports

In [1]:
import pandas as pd

import nltk
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.svm import SVC

import warnings
warnings.filterwarnings("ignore")

## Downloading dataset

In [2]:
nltk.download('movie_reviews')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/admin/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/admin/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/admin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
corpus = movie_reviews.raw()
corpus = corpus.split("\n")

## Part 1: Bag of Words

In [4]:
unigram_vectorizer = CountVectorizer(analyzer='word', stop_words="english", ngram_range=(1, 1))
bigram_vectorizer = CountVectorizer(analyzer='word', stop_words="english", ngram_range=(2, 2))

In [5]:
unigrams = unigram_vectorizer.fit_transform(corpus)
bigrams = bigram_vectorizer.fit_transform(corpus)

In [6]:
print(unigram_vectorizer.get_feature_names()[3000:3200])

['balk', 'balki', 'ball', 'ballad', 'ballads', 'ballard', 'ballentine', 'ballerina', 'ballet', 'balletic', 'ballhaus', 'ballinagra', 'balliol', 'ballisitic', 'ballistic', 'ballistics', 'balloon', 'ballooning', 'balloons', 'ballot', 'ballroom', 'balls', 'ballstein', 'ballyhoo', 'ballyhooed', 'balm', 'balmy', 'balrogs', 'balsan', 'balthazar', 'baltimore', 'balto', 'baltus', 'baluyev', 'bambi', 'bamboo', 'bamboozled', 'banal', 'banalities', 'banality', 'banana', 'bananas', 'bancroft', 'band', 'bandaged', 'bandages', 'bandanna', 'bandaras', 'banderas', 'banderes', 'bandied', 'bandies', 'bandit', 'banditos', 'bandits', 'bandmate', 'bands', 'bandstand', 'bandwagon', 'bane', 'bang', 'banged', 'bangers', 'bangkok', 'bangs', 'bangy', 'banish', 'banished', 'banishes', 'banishment', 'banisters', 'banji', 'banjo', 'banjos', 'bank', 'bankability', 'bankable', 'banker', 'bankers', 'banking', 'bankole', 'bankroll', 'bankrupt', 'bankruptcy', 'banks', 'banned', 'bannen', 'banner', 'banners', 'bannister

In [7]:
print(bigram_vectorizer.get_feature_names()[3000:3200])

['_disturbing_behavior_ frightening', '_do_ look', '_doctor_dolittle_ continues', '_does_ fully', '_does_ nearly', '_does_ sing', '_dog fancy_', '_don it_', '_don t_', '_double_team_ juicing', '_double_team_ let', '_double_team_ managed', '_double_team_ strange', '_dragon bruce', '_dragon_ jason', '_dragonheart_ gee', '_election good', '_election high', '_election_ contain', '_election_ disappointing', '_election_ matthew', '_election_ plot', '_election_ potential', '_election_ president', '_election_ rely', '_entertainment_weekly_ theresa', '_escape new', '_eve bayou_', '_everybody_ makes', '_exactly_ fuller', '_fear_and_loathing_in_las_vegas_ disastrously', '_ferris bueller_', '_fifty_ whispers', '_film freak', '_flirting disaster_', '_four_ credited', '_full_house_ _america', '_gag spoon_', '_gattaca_ _the', '_gattaca_ overshadowed', '_genius_ wants', '_ghost shell_', '_great_ supporting', '_h20_ _scream_2_', '_halloween _h20_', '_halloween_ _the', '_happen_ carefully', '_hard_ware 

## Part 2: TF-IDF

In [8]:
tfidf_unigram_vec = TfidfVectorizer(analyzer="word", stop_words="english", ngram_range=(1,1))
tfidf_bigram_vec = TfidfVectorizer(analyzer="word", stop_words="english", ngram_range=(2,2))

In [9]:
tfidf_unigrams = tfidf_unigram_vec.fit_transform(corpus)
tfidf_bigrams = tfidf_bigram_vec.fit_transform(corpus)

In [10]:
print(tfidf_unigram_vec.get_feature_names()[3000:3200])

['balk', 'balki', 'ball', 'ballad', 'ballads', 'ballard', 'ballentine', 'ballerina', 'ballet', 'balletic', 'ballhaus', 'ballinagra', 'balliol', 'ballisitic', 'ballistic', 'ballistics', 'balloon', 'ballooning', 'balloons', 'ballot', 'ballroom', 'balls', 'ballstein', 'ballyhoo', 'ballyhooed', 'balm', 'balmy', 'balrogs', 'balsan', 'balthazar', 'baltimore', 'balto', 'baltus', 'baluyev', 'bambi', 'bamboo', 'bamboozled', 'banal', 'banalities', 'banality', 'banana', 'bananas', 'bancroft', 'band', 'bandaged', 'bandages', 'bandanna', 'bandaras', 'banderas', 'banderes', 'bandied', 'bandies', 'bandit', 'banditos', 'bandits', 'bandmate', 'bands', 'bandstand', 'bandwagon', 'bane', 'bang', 'banged', 'bangers', 'bangkok', 'bangs', 'bangy', 'banish', 'banished', 'banishes', 'banishment', 'banisters', 'banji', 'banjo', 'banjos', 'bank', 'bankability', 'bankable', 'banker', 'bankers', 'banking', 'bankole', 'bankroll', 'bankrupt', 'bankruptcy', 'banks', 'banned', 'bannen', 'banner', 'banners', 'bannister

## Part 3: Naive Bayes Classification

### Creating a 2D Array to store (review,category)

In [12]:
document = [(list(movie_reviews.words(fileid)), category)       
                for category in movie_reviews.categories()            
                for fileid in movie_reviews.fileids(category)]

### Creating Dataframe to preprocess reviews as sentences and assign numeric values to categories

In [None]:
df_movie_reviews= pd.DataFrame(document,columns=['movie_review','category'])
df_movie_reviews['category_ind'] = df_movie_reviews.category.map({'neg':0, 'pos':1})
df_movie_reviews.drop(columns=['category'],inplace =True)

In [14]:
df_movie_reviews["movie_review"] = df_movie_reviews["movie_review"].apply(lambda x: " ".join(x))
df_movie_reviews.head()

Unnamed: 0,movie_review,category_ind
0,"plot : two teen couples go to a church party ,...",0
1,the happy bastard ' s quick movie review damn ...,0
2,it is movies like these that make a jaded movi...,0
3,""" quest for camelot "" is warner bros . ' first...",0
4,synopsis : a mentally unstable man undergoing ...,0


### Splitting data in train and test

In [15]:
X_train,X_test,y_train,y_test = train_test_split(df_movie_reviews["movie_review"],
                                                 df_movie_reviews["category_ind"], 
                                                 test_size=0.2, 
                                                 random_state=42)

### BOW Vectorizer

In [16]:
bow_unigram_vectorizer = CountVectorizer(analyzer='word', stop_words="english", ngram_range=(1, 1))
bow_bigram_vectorizer = CountVectorizer(analyzer='word', stop_words="english", ngram_range=(2, 2))

### TF-IDF Vectorizer

In [17]:
tfidf_unigram_vec = TfidfVectorizer(analyzer="word", stop_words="english", ngram_range=(1,1))
tfidf_bigram_vec = TfidfVectorizer(analyzer="word", stop_words="english", ngram_range=(2,2))

### Creating unigrams and bigrams using BOW Vectorizer and TF-IDF Vectorizer

In [18]:
X_train_bow_unigrams = bow_unigram_vectorizer.fit_transform(X_train)
X_train_bow_bigrams = bow_bigram_vectorizer.fit_transform(X_train)

X_test_bow_unigrams = bow_unigram_vectorizer.transform(X_test)
X_test_bow_bigrams = bow_bigram_vectorizer.transform(X_test)

X_train_tfidf_unigrams = tfidf_unigram_vec.fit_transform(X_train)
X_train_tfidf_bigrams = tfidf_bigram_vec.fit_transform(X_train)
X_test_tfidf_unigrams = tfidf_unigram_vec.transform(X_test)
X_test_tfidf_bigrams = tfidf_bigram_vec.transform(X_test)

### Picking best parameters using Randomized Search CV and evaluating Naiye Bayes using Precision, Recall and F1

In [22]:
NB_clf = MultinomialNB()
param_grid = [
              {'alpha': [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1], 
               'fit_prior': ['True', 'False']}
              ]
scores = ["precision","recall","f1"]
for score in scores:
    search = RandomizedSearchCV(estimator = NB_clf, param_distributions=param_grid, scoring=score)
    search.fit(X_train_bow_unigrams, y_train)
    print(f"Score: {score}, Best_params: {search.best_params_}")
    print()
    print("Naive Bayes Classification report: BOW Unigrams")
    print()
    y_true, y_pred = y_test, search.predict(X_test_bow_unigrams)
    print(classification_report(y_true, y_pred))
    print()

Score: precision, Best_params: {'fit_prior': 'False', 'alpha': 0.7}

Naive Bayes Classification report: BOW Unigrams

              precision    recall  f1-score   support

           0       0.77      0.85      0.81       199
           1       0.83      0.75      0.79       201

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.80      0.80      0.80       400


Score: recall, Best_params: {'fit_prior': 'False', 'alpha': 0.9}

Naive Bayes Classification report: BOW Unigrams

              precision    recall  f1-score   support

           0       0.78      0.85      0.81       199
           1       0.84      0.76      0.79       201

    accuracy                           0.80       400
   macro avg       0.81      0.80      0.80       400
weighted avg       0.81      0.80      0.80       400


Score: f1, Best_params: {'fit_prior': 'True', 'alpha': 1}

Naive Bayes Classification report: BOW Unigrams

   

In [23]:
for score in scores:
    search = RandomizedSearchCV(estimator = NB_clf, param_distributions=param_grid, scoring=score)
    search.fit(X_train_bow_bigrams, y_train)
    print(f"Score: {score}, Best_params: {search.best_params_}")
    print()
    print("Naive Bayes Classification report: BOW Bigrams")
    print()
    y_true, y_pred = y_test, search.predict(X_test_bow_bigrams)
    print(classification_report(y_true, y_pred))
    print()

Score: precision, Best_params: {'fit_prior': 'True', 'alpha': 0.6}

Naive Bayes Classification report: BOW Bigrams

              precision    recall  f1-score   support

           0       0.76      0.81      0.79       199
           1       0.80      0.75      0.77       201

    accuracy                           0.78       400
   macro avg       0.78      0.78      0.78       400
weighted avg       0.78      0.78      0.78       400


Score: recall, Best_params: {'fit_prior': 'True', 'alpha': 0.0}

Naive Bayes Classification report: BOW Bigrams

              precision    recall  f1-score   support

           0       0.76      0.68      0.72       199
           1       0.71      0.79      0.75       201

    accuracy                           0.73       400
   macro avg       0.74      0.73      0.73       400
weighted avg       0.73      0.73      0.73       400


Score: f1, Best_params: {'fit_prior': 'True', 'alpha': 0.0}

Naive Bayes Classification report: BOW Bigrams

      

In [24]:
for score in scores:
    search = RandomizedSearchCV(estimator = NB_clf, param_distributions=param_grid, scoring=score)
    search.fit(X_train_tfidf_unigrams, y_train)
    print(f"Score: {score}, Best_params: {search.best_params_}")
    print()
    print("Naive Bayes Classification report: TFIDF Unigrams")
    print()
    y_true, y_pred = y_test, search.predict(X_test_tfidf_unigrams)
    print(classification_report(y_true, y_pred))
    print()

Score: precision, Best_params: {'fit_prior': 'True', 'alpha': 1}

Naive Bayes Classification report: TFIDF Unigrams

              precision    recall  f1-score   support

           0       0.77      0.85      0.81       199
           1       0.84      0.75      0.79       201

    accuracy                           0.80       400
   macro avg       0.81      0.80      0.80       400
weighted avg       0.81      0.80      0.80       400


Score: recall, Best_params: {'fit_prior': 'True', 'alpha': 0.3}

Naive Bayes Classification report: TFIDF Unigrams

              precision    recall  f1-score   support

           0       0.78      0.85      0.81       199
           1       0.84      0.76      0.80       201

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.80       400
weighted avg       0.81      0.81      0.80       400


Score: f1, Best_params: {'fit_prior': 'True', 'alpha': 0.8}

Naive Bayes Classification report: TFIDF Unigrams


In [25]:
for score in scores:
    search = RandomizedSearchCV(estimator = NB_clf, param_distributions=param_grid, scoring=score)
    search.fit(X_train_tfidf_bigrams, y_train)
    print(f"Score: {score}, Best_params: {search.best_params_}")
    print()
    print("Naive Bayes Classification report: TFIDF Bigrams")
    print()
    y_true, y_pred = y_test, search.predict(X_test_tfidf_bigrams)
    print(classification_report(y_true, y_pred))
    print()

Score: precision, Best_params: {'fit_prior': 'True', 'alpha': 0.1}

Naive Bayes Classification report: TFIDF Bigrams

              precision    recall  f1-score   support

           0       0.76      0.81      0.79       199
           1       0.80      0.75      0.77       201

    accuracy                           0.78       400
   macro avg       0.78      0.78      0.78       400
weighted avg       0.78      0.78      0.78       400


Score: recall, Best_params: {'fit_prior': 'True', 'alpha': 1}

Naive Bayes Classification report: TFIDF Bigrams

              precision    recall  f1-score   support

           0       0.75      0.81      0.78       199
           1       0.80      0.73      0.76       201

    accuracy                           0.77       400
   macro avg       0.77      0.77      0.77       400
weighted avg       0.77      0.77      0.77       400


Score: f1, Best_params: {'fit_prior': 'False', 'alpha': 0.0}

Naive Bayes Classification report: TFIDF Bigrams

 

## Part 4 : SVM with Linear Kernel

In [26]:
SVC_model = SVC(kernel='linear',random_state =42)
param_grid = {'C': [1,0.2,5,10]}

### Picking best parameters using Randomized Search CV and evaluating SVM Classifier using Precision, Recall and F1

In [27]:
for score in scores:
    search = RandomizedSearchCV(estimator = SVC_model, param_distributions=param_grid, scoring=score, n_iter=20)
    search.fit(X_train_bow_unigrams, y_train)
    print(f"Score: {score}, Best_params: {search.best_params_}")
    print()
    print("SVM Classification report: BOW Unigrams")
    print()
    y_true, y_pred = y_test, search.predict(X_test_bow_unigrams)
    print(classification_report(y_true, y_pred))
    print()

Score: precision, Best_params: {'C': 1}

SVM Classification report: BOW Unigrams

              precision    recall  f1-score   support

           0       0.82      0.82      0.82       199
           1       0.82      0.82      0.82       201

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400


Score: recall, Best_params: {'C': 1}

SVM Classification report: BOW Unigrams

              precision    recall  f1-score   support

           0       0.82      0.82      0.82       199
           1       0.82      0.82      0.82       201

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400


Score: f1, Best_params: {'C': 1}

SVM Classification report: BOW Unigrams

              precision    recall  f1-score   support

           0       0.82      0.82      0.82       199

In [29]:
for score in scores:
    search = RandomizedSearchCV(estimator = SVC_model, param_distributions=param_grid, scoring=score, n_iter=20)
    search.fit(X_train_bow_bigrams, y_train)
    print(f"Score: {score}, Best_params: {search.best_params_}")
    print()
    print("SVM Classification report: BOW Bigrams")
    print()
    y_true, y_pred = y_test, search.predict(X_test_bow_bigrams)
    print(classification_report(y_true, y_pred))
    print()

Score: precision, Best_params: {'C': 1}

SVM Classification report: BOW Bigrams

              precision    recall  f1-score   support

           0       0.69      0.85      0.77       199
           1       0.81      0.63      0.71       201

    accuracy                           0.74       400
   macro avg       0.75      0.74      0.74       400
weighted avg       0.75      0.74      0.74       400


Score: recall, Best_params: {'C': 1}

SVM Classification report: BOW Bigrams

              precision    recall  f1-score   support

           0       0.69      0.85      0.77       199
           1       0.81      0.63      0.71       201

    accuracy                           0.74       400
   macro avg       0.75      0.74      0.74       400
weighted avg       0.75      0.74      0.74       400


Score: f1, Best_params: {'C': 1}

SVM Classification report: BOW Bigrams

              precision    recall  f1-score   support

           0       0.69      0.85      0.77       199
  

In [30]:
for score in scores:
    search = RandomizedSearchCV(estimator = SVC_model, param_distributions=param_grid, scoring=score, n_iter=20)
    search.fit(X_train_tfidf_unigrams, y_train)
    print(f"Score: {score}, Best_params: {search.best_params_}")
    print()
    print("SVM Classification report: TFIDF Unigrams")
    print()
    y_true, y_pred = y_test, search.predict(X_test_tfidf_unigrams)
    print(classification_report(y_true, y_pred))
    print()

Score: precision, Best_params: {'C': 5}

SVM Classification report: TFIDF Unigrams

              precision    recall  f1-score   support

           0       0.83      0.84      0.83       199
           1       0.84      0.83      0.83       201

    accuracy                           0.83       400
   macro avg       0.84      0.84      0.83       400
weighted avg       0.84      0.83      0.83       400


Score: recall, Best_params: {'C': 0.2}

SVM Classification report: TFIDF Unigrams

              precision    recall  f1-score   support

           0       0.82      0.73      0.77       199
           1       0.76      0.84      0.80       201

    accuracy                           0.79       400
   macro avg       0.79      0.78      0.78       400
weighted avg       0.79      0.79      0.78       400


Score: f1, Best_params: {'C': 5}

SVM Classification report: TFIDF Unigrams

              precision    recall  f1-score   support

           0       0.83      0.84      0.83  

In [31]:
for score in scores:
    search = RandomizedSearchCV(estimator = SVC_model, param_distributions=param_grid, scoring=score, n_iter=20)
    search.fit(X_train_tfidf_bigrams, y_train)
    print(f"Score: {score}, Best_params: {search.best_params_}")
    print()
    print("SVM Classification report: TFIDF Bigrams")
    print()
    y_true, y_pred = y_test, search.predict(X_test_tfidf_bigrams)
    print(classification_report(y_true, y_pred))
    print()

Score: precision, Best_params: {'C': 1}

SVM Classification report: TFIDF Bigrams

              precision    recall  f1-score   support

           0       0.78      0.77      0.78       199
           1       0.78      0.79      0.78       201

    accuracy                           0.78       400
   macro avg       0.78      0.78      0.78       400
weighted avg       0.78      0.78      0.78       400


Score: recall, Best_params: {'C': 1}

SVM Classification report: TFIDF Bigrams

              precision    recall  f1-score   support

           0       0.78      0.77      0.78       199
           1       0.78      0.79      0.78       201

    accuracy                           0.78       400
   macro avg       0.78      0.78      0.78       400
weighted avg       0.78      0.78      0.78       400


Score: f1, Best_params: {'C': 1}

SVM Classification report: TFIDF Bigrams

              precision    recall  f1-score   support

           0       0.78      0.77      0.78       

## Observations
* For Naive-Bayes, Unigrams and Precision Metric give the best results with Precision score of 0.81. Rest fall in the range of (0.71-0.79)
* For SVM, TFIDF Unigrams with C=5 proves to be best model, as precision and f1 give score of 0.84.