In [0]:
#All the necessary imports
import numpy as np
import nltk
import pandas as pd
import os
import re

### This is an example of a movie review.

In [2]:
# Print one positive training review as a sample. The context manager closes
# the file handle as soon as the text has been read, instead of leaving it
# open for the rest of the kernel session.
with open('./aclImdb/train/pos/10327_7.txt', encoding="utf8") as f:
    print(f.read())

FileNotFoundError: ignored

#### Putting the train and test data into pandas DataFrames. The rows of each DataFrame are shuffled as well.

In [0]:
# Load the movie reviews from disk into one shuffled DataFrame per split.
# Make sure you put the data folder in the same directory as this jupyter notebook file.
directory = "./aclImdb"

# Fixed seed so the shuffled row order is the same on every run.
SHUFFLE_SEED = 42
shuffle_rng = np.random.RandomState(SHUFFLE_SEED)

labeledData = {}
for i in ["train", "test"]:
    rows = []  # [review text, sentiment score] pairs for this split
    for sentiment in ["pos", "neg"]:
        score = 1 if sentiment == "pos" else 0
        path = os.path.join(directory, i, sentiment)
        # os.listdir returns names in arbitrary order; sorting gives the seeded
        # shuffle the same starting sequence on every machine.
        for filename in sorted(os.listdir(path)):
            with open(os.path.join(path, filename), encoding="utf8") as f:
                rows.append([f.read(), score])

    shuffle_rng.shuffle(rows)  # mix the positive and negative reviews together
    labeledData[i] = pd.DataFrame(rows, columns=['text', 'sentiment'])

labeledData["train"], labeledData["test"]  # displays both pandas DataFrames

The first column contains the movie reviews in separated rows.
The second column indicates whether the review is a positive or negative review. 
A positive review has 7-10 stars and a negative review has 1-4 stars. Reviews with 5-6 stars are disregarded.

In [0]:
# Display the shuffled training DataFrame (columns: 'text', 'sentiment')
labeledData["train"] 

In [0]:
# Text of the training review stored under index label 0
labeledData["train"].loc[0, "text"]

## Converting the movie reviews into Specific Bag of Words Vectors

In [0]:
#sklearn imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

# Hand-picked stop words that the vectorizer removes from every review
stop_words = ['in', 'of', 'at', 'a', 'the', 'and']

# Vectorizer settings, spelled out one option at a time
vectorizer_options = {
    "stop_words": stop_words,  # tokens to discard
    "binary": True,            # record presence (1) / absence (0) rather than counts
    "ngram_range": (1, 2),     # single words as well as pairs of words
}
vectorizer = CountVectorizer(**vectorizer_options)

# Fit the vectorizer on the training reviews and vectorize them, then
# vectorize the test reviews with that already-fitted vectorizer
train_reviews = labeledData["train"]["text"]
test_reviews = labeledData["test"]["text"]
x_train = vectorizer.fit_transform(train_reviews)
x_test = vectorizer.transform(test_reviews)





## Creating a model, training, and applying the model

In [0]:
# Two candidate classifiers: `model` is the one trained below, `model2` is an
# alternative that is created here but not fitted in this cell
model, model2 = LinearSVC(C=.01), LogisticRegression()

# Pull the sentiment labels out once so the fit / scoring calls read cleanly
train_labels = labeledData["train"]["sentiment"]
test_labels = labeledData["test"]["sentiment"]

# Train on the vectorized training reviews, then predict a sentiment for each
# vectorized test review
y_pred = model.fit(x_train, train_labels).predict(x_test)

# Accuracy: compares the actual test sentiments with the model's predictions
acc = accuracy_score(test_labels, y_pred)
print("Accuracy score of model: "+str(acc))

#This is the representation of the matrix of all the vectorized movie reviews.
#The data is very sparse, so the printout lists only the positions where a word (n-gram) occurs in a movie review.
#The first number in each ordered pair identifies the movie review.
#The second number in each ordered pair identifies a particular word (n-gram).
#The value 1 next to a pair shows that the word is present in that movie review.
#Everything else is 0.

In [0]:
# Print the vectorized training reviews produced by the vectorizer
print(x_train)
# Uncomment to see only the column indices that are set for the first review:
#x_train[0].nonzero()[1]

In [0]:
# Print the first 10 n-grams present in the first training review, together
# with the corresponding column index of each n-gram.
N_PREVIEW = 10

# Look the vocabulary up once, outside the loop: every call rebuilds the full
# list of feature names. get_feature_names() was deprecated in scikit-learn 1.0
# and removed in 1.2, so prefer get_feature_names_out() and fall back to the
# old name only on older installations.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# nonzero()[1] holds the column indices set for row 0; slicing caps the preview
# and simply yields fewer lines when the review has fewer than N_PREVIEW n-grams.
for i in x_train[0].nonzero()[1][:N_PREVIEW]:
    print(str(i)+": " + feature_names[i])

## Using a pipeline to make it easier (combines the vectorizer and the model into a single object)
Tf-idf weighting is applied by passing the CountVectorizer output through a TfidfTransformer.

In [0]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
# Stop words for the pipeline's vectorizer. Note that this rebinds
# `stop_words` and that this list does not contain 'and'.
stop_words = ['in', 'of', 'at', 'a', 'the']

# Alternative pipelines tried previously, kept for reference:
#   CountVectorizer(binary=True, ngram_range=(1,2)) -> LogisticRegression(C=.05)
#   CountVectorizer(binary=True, ngram_range=(1,2), stop_words=stop_words) -> LinearSVC(C=.01)

# Pipeline stages, in order: vectorizer -> tf-idf transform -> MultinomialNB model
pipeline_steps = [
    ('vect', CountVectorizer(binary=True, ngram_range=(1,2), stop_words=stop_words)),
    ('tfid', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
pipe = Pipeline(pipeline_steps)

In [0]:
# The pipeline takes raw review text, so grab the two DataFrames directly
train_split, test_split = labeledData["train"], labeledData["test"]

# Fit the whole pipeline: training text is the x value, training sentiment the y value
pipe.fit(train_split["text"], train_split["sentiment"])

# Predict a sentiment score for every review in the test DataFrame
y_pred = pipe.predict(test_split["text"])

# Compare the predicted sentiment scores with the actual ones
acc = accuracy_score(test_split["sentiment"], y_pred)
print("Accuracy score of the model: "+ str(acc))