In [1]:
import numpy as np
import re
import nltk
from sklearn.datasets import load_files #To load text files with categories as subfolder names.

In [2]:
#Loading Review files
movie_data = load_files(r"C:\Users\mahaa\Desktop\review_polarity.tar\txt_sentoken",encoding="utf-8")

In [3]:
X, y = movie_data.data, movie_data.target

In [4]:
#Loading nltk libraries and downloading nltk data
from nltk.tokenize import word_tokenize
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mahaa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
#Pre-process the review files and store in the variable 'documents'
documents=[]
for doc in range(0,len(X)):
    document = re.sub(r'\W', ' ', str(X[doc]))
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    word_tokens=word_tokenize(document)
    document=[lemmatizer.lemmatize(word) for word in word_tokens]
    document = ' '.join(document)
    documents.append(document)

In [6]:
#Building Bag of Words model with Unigrams using CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
'''max_features=15000 was approximated from the research paper published by the data owner at 
https://www.cs.cornell.edu/home/llee/papers/sentiment.pdf'''
from nltk.corpus import stopwords
vectorizer = CountVectorizer(max_features=15000,max_df=0.5,stop_words=stopwords.words('english'))

In [8]:
documents_as_array = vectorizer.fit_transform(documents).toarray()

In [None]:
vectorizer.get_feature_names()

In [10]:
#Apply TF-IDF transform
from sklearn.feature_extraction.text import TfidfTransformer

In [11]:
tfidfconverter = TfidfTransformer()
documents_as_array_tfidf= tfidfconverter.fit_transform(documents_as_array).toarray()

In [12]:
#Building Machine Learning classifier using Linear SVM, Random Forest
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score

In [13]:
#Splitting train and test data with 80-20 ratio
X_train, X_test, y_train, y_test = train_test_split(documents_as_array_tfidf, y, test_size=0.2, random_state=0)

In [14]:
#Linear SVM classifier
lsvm_classifier=LinearSVC(random_state=0,penalty='l2')

In [15]:
lsvm_classifier.fit(X_train,y_train)

LinearSVC(random_state=0)

In [16]:
lsvm_y_train_pred=lsvm_classifier.predict(X_train)
lsvm_y_test_pred = lsvm_classifier.predict(X_test)

In [17]:
print("Training accuracy",accuracy_score(y_train, lsvm_y_train_pred))
print("Testing accuracy",accuracy_score(y_test, lsvm_y_test_pred))
print(confusion_matrix(y_test,lsvm_y_test_pred))

Training accuracy 1.0
Testing accuracy 0.8225
[[163  45]
 [ 26 166]]


In [18]:
#Linear SVM classifier with hypertuned parameters
param_grid = {'C': [0.1, 1, 10, 100, 1000]}
lsvm_grid = GridSearchCV(LinearSVC(),param_grid,cv=10,verbose =1)

In [19]:
lsvm_grid.fit(X_train,y_train)

Fitting 10 folds for each of 5 candidates, totalling 50 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:   50.6s finished


GridSearchCV(cv=10, estimator=LinearSVC(),
             param_grid={'C': [0.1, 1, 10, 100, 1000]}, verbose=1)

In [20]:
lsvm_grid_ytrain_pred=lsvm_grid.predict(X_train)
lsvm_grid_ytest_pred=lsvm_grid.predict(X_test)

In [21]:
print(lsvm_grid.best_params_)

{'C': 1}


In [22]:
print("Training accuracy-Grid search",accuracy_score(y_train, lsvm_grid_ytrain_pred))
print("Testing accuracy-Grid search",accuracy_score(y_test, lsvm_grid_ytest_pred))
print(confusion_matrix(y_test,lsvm_grid_ytest_pred))

Training accuracy-Grid search 1.0
Testing accuracy-Grid search 0.8225
[[163  45]
 [ 26 166]]


In [23]:
#Random forest classifier
rndf = RandomForestClassifier(n_estimators=1000, random_state=0)
rndf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=0)

In [24]:
rndf_y_train_pred=rndf.predict(X_train)
rndf_y_test_pred =rndf.predict(X_test)

In [25]:
print("Training accuracy",accuracy_score(y_train, rndf_y_train_pred))
print("Testing accuracy",accuracy_score(y_test, rndf_y_test_pred))
print(confusion_matrix(y_test, rndf_y_test_pred))

Training accuracy 1.0
Testing accuracy 0.815
[[177  31]
 [ 43 149]]


In [26]:
#Random forest classifier with hyper tuned parameters
rndf = RandomForestClassifier()
param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

In [27]:
rndf_grid= GridSearchCV(estimator = rndf, param_grid = param_grid, 
                          cv = 10, n_jobs = -1, verbose = 2,scoring="accuracy")

In [28]:
rndf_grid.fit(X_train,y_train)

Fitting 10 folds for each of 288 candidates, totalling 2880 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:   57.5s
[Parallel(n_jobs=-1)]: Done 349 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 632 tasks      | elapsed:  4.4min
[Parallel(n_jobs=-1)]: Done 997 tasks      | elapsed:  6.7min
[Parallel(n_jobs=-1)]: Done 1442 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done 1969 tasks      | elapsed: 13.6min
[Parallel(n_jobs=-1)]: Done 2576 tasks      | elapsed: 17.6min
[Parallel(n_jobs=-1)]: Done 2880 out of 2880 | elapsed: 20.0min finished


GridSearchCV(cv=10, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'bootstrap': [True], 'max_depth': [80, 90, 100, 110],
                         'max_features': [2, 3], 'min_samples_leaf': [3, 4, 5],
                         'min_samples_split': [8, 10, 12],
                         'n_estimators': [100, 200, 300, 1000]},
             scoring='accuracy', verbose=2)

In [29]:
print(rndf_grid.best_params_)
print(rndf_grid.best_estimator_)

{'bootstrap': True, 'max_depth': 110, 'max_features': 3, 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 1000}
RandomForestClassifier(max_depth=110, max_features=3, min_samples_leaf=3,
                       min_samples_split=12, n_estimators=1000)


In [30]:
rndf_grid_ytrain_pred=rndf_grid.predict(X_train)
rndf_grid_ytest_pred=rndf_grid.predict(X_test)

In [31]:
print("Training accuracy-Grid search",accuracy_score(y_train, rndf_grid_ytrain_pred))
print("Testing accuracy-Grid search",accuracy_score(y_test, rndf_grid_ytest_pred))

Training accuracy-Grid search 0.971875
Testing accuracy-Grid search 0.7125


In [32]:
print(confusion_matrix(y_test, rndf_grid_ytest_pred))

[[139  69]
 [ 46 146]]


                               Train accuracy    Test accuracy
Linear SVM                       100                 82.25
Linear SVM(GridSearchCV)         100                 82.25
Random Forest                    100                 81.5   
Random Forest(GridSearchCV)      97.18               71.25

It looks to me all the models are over-fitted; The False Positive being higher than False Negative in the models with hypertuned parameters is concerning. The hyper parameter tuning is not really necessary for linear SVM. Is it because I've missed soemthing?

I tried creating Bag of Word Models with Bigrams; The accuracies were as below:

                               Train accuracy    Test accuracy
Linear SVM                       100                 79.75
Linear SVM(GridSearchCV)         100                 79.75
Random Forest                    100                 77.25  
Random Forest(GridSearchCV)      91.9                61.5

I think I should make the Vectorizer more robust and remove the years/numeric during pre-processing; In addition to the numbers, I'm  not sure if the Vectorizer_feature_names() is a good feature set; I can try Lemmatizing by also passing the Parts-Of-Speech(POS) to get context relevant features. 

But choosing the type of POS, choosing stemmer vs lemmatizer looks like a hard call for now.

In addition, I'm wondering how to check if lemmatization results makes sense to our goal.

This was the first time I came across sklearn.utils.Bunch.