## Import the libraries

In [70]:
import numpy as np
import re
import pickle 
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import load_files as lf
import pandas as pd

#### We use `load_files` because the dataset is organised as one folder per class

In [74]:
# Fetch the NLTK stop-word corpus; it backs the stopwords.words('english') call used when building the bag-of-words model.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\ananyya
[nltk_data]     srivastava\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### `load_files` walks the `neg` and `pos` sub-folders of the dataset and generates two classes: 0 for neg and 1 for pos

In [72]:
# Load the review corpus from disk into `dataset`.
# NOTE(review): this is an absolute, machine-specific path — point it at your own copy of txt_sentoken before running.
dataset = lf(r'C:\Users\ananyya srivastava\Desktop\txt_sentoken')

#### Now we need to separate the documents and the classes into two separate lists

In [75]:
# Pull the raw documents and their class labels out of the loaded corpus.
x = dataset.data
y = dataset.target


#### Now the documents and the classes are separated into `x` and `y`
- 0 = neg
- 1 = pos


#### For the large IMDB dataset, `load_files` can take about 15 minutes to load the data; to avoid paying that cost on every run, we store `x` and `y` as pickle files

#### File mode `'wb'`: w = write, b = binary (bytes)

In [76]:
# Cache the documents and labels on disk so later runs can skip load_files.
with open('x.pickle', 'wb') as f:
    # Fixed: this used to dump `X`, a name that is not defined in the
    # notebook as shown; the documents live in lowercase `x`.
    pickle.dump(x, f)

with open('y.pickle', 'wb') as f:
    pickle.dump(y, f)

In [77]:
# Unpickling dataset
# The context managers close both file handles as soon as loading finishes
# (the bare open() calls used before were never closed).
# NOTE: pickle.load can execute arbitrary code — only load pickle files that
# this notebook itself created.
with open('x.pickle', 'rb') as x_file:
    x = pickle.load(x_file)

with open('y.pickle', 'rb') as y_file:
    y = pickle.load(y_file)

#### Loading from the pickle files takes much less time: only a few seconds for 2000 documents

#### Preprocess the dataset
- replace all non-word characters (ASCII symbols, punctuation, etc.) with spaces using `\W`
- remove all single characters that have whitespace before and after them
- remove the single character `b` when it is the start of a document
- collapse extra whitespace into single spaces



In [78]:

def clean_document(raw_doc):
    """Normalise one raw review into lower-case, space-separated text.

    Parameters
    ----------
    raw_doc : object
        One entry of the corpus; it is converted with ``str()`` first.
        NOTE(review): presumably these are bytes objects, in which case
        ``str()`` yields a ``b'...'`` repr and the ``^b\\s+`` step below
        strips that leading ``b`` — confirm against the loader.

    Returns
    -------
    str
        The cleaned document.
    """
    text = re.sub(r'\W', ' ', str(raw_doc))    # non-word characters -> spaces
    text = text.lower()
    text = re.sub(r'^br$', ' ', text)          # a document that is exactly "br"
    text = re.sub(r'\s+br\s+', ' ', text)      # standalone "br" tokens (presumably HTML line-break leftovers)
    text = re.sub(r'\s+[a-z]\s+', ' ', text)   # single letters surrounded by whitespace
    text = re.sub(r'^b\s+', '', text)          # a lone leading "b"
    text = re.sub(r'\s+', ' ', text)           # collapse runs of whitespace
    return text


# Build the cleaned corpus. The loop previously reused the name `dataset` for
# each cleaned string, which overwrote the loaded corpus object and broke any
# re-run of the cell that reads dataset.data / dataset.target.
list_doc = [clean_document(doc) for doc in x]
    

In [87]:
# Creating the BOW model
from sklearn.feature_extraction.text import CountVectorizer as cv

# Vocabulary is capped at 2000 terms; English stop words are dropped, as are
# terms whose document frequency exceeds 0.6.
bag = cv(
    max_features=2000,
    min_df=1,
    max_df=0.6,
    stop_words=stopwords.words('english'),
)

# Fit on the cleaned documents and densify the resulting matrix.
x = bag.fit_transform(list_doc).toarray()

#### `cv` (CountVectorizer) creates a simple count-based bag-of-words model: each entry is the number of times a word occurs in a document

#### `max_features=2000` keeps only the 2000 most frequent words in our dataset
- `min_df=1`: minimum document frequency; a word must appear in at least 1 document, so with this value no word is excluded for being too rare
- `max_df=0.6`: excludes all words that appear in more than 60% of the documents (e.g. "the", "that"), so we can focus only on the important words
- `stop_words`: removes all words found in the English stop-word list

In [122]:
# Creating the Tf-Idf Model
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
# Re-weight the bag-of-words matrix held in `x`.
# Fixed: this used to read `X`, a name that is not defined in the notebook as
# shown, so the cell failed on a fresh kernel.
# NOTE: `x` is overwritten here, so re-run the bag-of-words cell before
# re-running this one.
x = transformer.fit_transform(x).toarray()

In [123]:
# Display the TF-IDF feature matrix (NumPy abbreviates the large array with "...").
x

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.04815829, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.14153766, 0.        , 0.07844951, ..., 0.        , 0.        ,
        0.        ]])

#### Now `x` contains fractional TF-IDF weights rather than integer word counts


In [124]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split as tt

# 20% of the samples are held out for testing; the fixed random_state keeps
# the split the same on every run.
x_train, x_test, y_train, y_test = tt(
    x,
    y,
    test_size=0.20,
    random_state=0,
)


In [125]:
# Training the classifier
from sklearn.linear_model import LogisticRegression
# Logistic regression with the library's default settings.
classifier = LogisticRegression()
# Fit on the training split; as the cell's last expression, the fitted
# estimator's repr is what gets displayed.
classifier.fit(x_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [126]:
 #Testing model performance
from sklearn.metrics import confusion_matrix

# Predicted class label for every held-out review.
sent_pred = classifier.predict(x_test)

# Cross-tabulate the true labels against the predictions.
cm = confusion_matrix(y_test, sent_pred)

In [127]:
# Display the predicted labels for the test split.
sent_pred

array([0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1,
       1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1,
       0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1,

In [128]:
# Sanity check: number of test labels.
y_test.shape

(400,)

In [129]:
# Sanity check: (test samples, features).
x_test.shape

(400, 2000)

In [130]:
# Sanity check: number of training labels.
y_train.shape

(1600,)

In [131]:
# Sanity check: (training samples, features).
x_train.shape

(1600, 2000)

In [135]:
from sklearn.metrics import accuracy_score as accu

In [136]:
# Fraction of test reviews classified correctly.
# Fixed: this used to pass `sent_test`, a name that is not defined in the
# notebook as shown; the true labels for the test split are in `y_test`.
accu(y_test, sent_pred)

0.8275

In [137]:
# Display the confusion matrix computed from y_test and sent_pred.
cm

array([[166,  42],
       [ 27, 165]], dtype=int64)

In [156]:
# Saving our classifier
with open('classifier.pickle', 'wb') as f:
    pickle.dump(classifier, f)

# Saving the Tf-Idf model
# Fixed: this used to dump `vectorizer`, a name that is not defined in the
# notebook as shown; the fitted TF-IDF object is `transformer`.
# NOTE(review): `transformer` expects bag-of-words counts as input, so scoring
# new raw text also needs the fitted CountVectorizer (`bag`) — consider
# persisting that as well.
with open('tfidfmodel.pickle', 'wb') as f:
    pickle.dump(transformer, f)
