In [1]:
# Importing relevant packages
import os
import string
import re
import numpy as np
import pandas as pd

#### Negative Reviews

In [2]:
nfilenames = [] # negative filenames
review_type = "neg"

#negative file path
negFilePath = "../../../stanford-movie-review-dataset/train/neg/"
for root, dirs, files in os.walk(negFilePath):
    for f in files: 
        nfilenames.append(f)

In [3]:
# Check paths, no. of files
nfilenames[:5], len(nfilenames)

(['1821_4.txt', '10402_1.txt', '1062_4.txt', '9056_1.txt', '5392_3.txt'],
 12500)

#### Positive Reviews

In [4]:
pfilenames = [] # positive filenames
review_type = "pos"

#positive file path
posFilePath = "../../../stanford-movie-review-dataset/train/pos/"
for root, dirs, files in os.walk(posFilePath):
    for f in files: 
        pfilenames.append(f)

In [5]:
# Check paths, no. of files
pfilenames[:5], len(pfilenames)

(['4715_9.txt', '12390_8.txt', '8329_7.txt', '9063_8.txt', '3092_10.txt'],
 12500)

#### DataFrame

In [6]:
# creating new dataframe
data = pd.DataFrame()

##### File ID

In [7]:
file_id = []
review_type = "neg"
# review_type = "pos"
# We only want a 2000 negative/positive reviews for training
for i in range(1000):
    file_id.append(review_type+"/"+nfilenames[i])

In [8]:
# Check
len(file_id), file_id[:5]

(1000,
 ['neg/1821_4.txt',
  'neg/10402_1.txt',
  'neg/1062_4.txt',
  'neg/9056_1.txt',
  'neg/5392_3.txt'])

In [9]:
# review_type = "neg"
review_type = "pos"
# We only want a 2000 negative/positive reviews for training
for i in range(1000):
    file_id.append(review_type+"/"+pfilenames[i])

In [10]:
len(file_id), file_id[-5:-1]

(2000,
 ['pos/1328_10.txt', 'pos/11024_9.txt', 'pos/4065_10.txt', 'pos/10852_10.txt'])

In [11]:
# Appending the file IDs to dataframe
data["file_ID"] = file_id

In [12]:
data.head() # check

Unnamed: 0,file_ID
0,neg/1821_4.txt
1,neg/10402_1.txt
2,neg/1062_4.txt
3,neg/9056_1.txt
4,neg/5392_3.txt


##### Raw Text

In [13]:
raw_text = []

# Negative Reviews
for i in range(1000):
    with open(negFilePath+nfilenames[i]) as f:
        review = f.readlines()
        review = review[0]
        raw_text.append(review)
        
# check
len(raw_text)

1000

In [14]:
# Positive Reviews
for i in range(1000):
    with open(posFilePath+pfilenames[i]) as f:
        review = f.readlines()
        review = review[0]
        raw_text.append(review)
        
# check
len(raw_text)

2000

In [15]:
# Appending raw text to dataframe
data["raw_text"] = raw_text

# check
data.head()

Unnamed: 0,file_ID,raw_text
0,neg/1821_4.txt,Working with one of the best Shakespeare sourc...
1,neg/10402_1.txt,"Well...tremors I, the original started off in ..."
2,neg/1062_4.txt,Ouch! This one was a bit painful to sit throug...
3,neg/9056_1.txt,"I've seen some crappy movies in my life, but t..."
4,neg/5392_3.txt,"""Carriers"" follows the exploits of two guys an..."


##### "is_good" target column

In [16]:
# check whether the raw text came from the positive review folder. 
data['is_good'] = data['file_ID'].str.split('/').str.get(0) == 'pos'
data # check

Unnamed: 0,file_ID,raw_text,is_good
0,neg/1821_4.txt,Working with one of the best Shakespeare sourc...,False
1,neg/10402_1.txt,"Well...tremors I, the original started off in ...",False
2,neg/1062_4.txt,Ouch! This one was a bit painful to sit throug...,False
3,neg/9056_1.txt,"I've seen some crappy movies in my life, but t...",False
4,neg/5392_3.txt,"""Carriers"" follows the exploits of two guys an...",False
...,...,...,...
1995,pos/1328_10.txt,"Without ""mental anachronism"", this film which ...",True
1996,pos/11024_9.txt,This movie is just great. It's entertaining fr...,True
1997,pos/4065_10.txt,I've seen the original English version on vide...,True
1998,pos/10852_10.txt,"Hello, I was alanrickmaniac. I'm a Still Crazy...",True


### Bag of Words

In [17]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(stop_words = 'english')
vec = CountVectorizer(max_df = 0.2, min_df = 30, stop_words = 'english')

vec

CountVectorizer(max_df=0.2, min_df=30, stop_words='english')

In [18]:
counts = vec.fit_transform(data['raw_text'])
# bag of words
bow = pd.DataFrame(counts.toarray(), columns = vec.get_feature_names())
bow



Unnamed: 0,10,100,15,20,30,50,70,80,90,ability,...,written,wrong,wrote,yeah,year,years,yes,york,young,younger
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,1,1,0,1,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1998,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [43]:
# Drop all columns with non-alphabet characters

cols_to_drop = []
for name in bow.columns:
    # If name does not consist of letters
    # drop column from data
    if name.isalpha() == False:
        cols_to_drop.append(name)

In [44]:
# drop columns
cols_to_drop

['10', '100', '15', '20', '30', '50', '70', '80', '90']

In [45]:
bow = bow.drop(cols_to_drop, axis=1)
bow

Unnamed: 0,ability,able,absolutely,accent,accidentally,act,acted,action,actor,actors,...,written,wrong,wrote,yeah,year,years,yes,york,young,younger
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,1,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1996,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
1998,0,1,0,0,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,0


In [46]:
# Joining two dataframes

data = pd.concat((data, bow), axis = 1)
data

Unnamed: 0,file_ID,raw_text,is_good,ability,able,absolutely,accent,accidentally,act,acted,...,written,wrong,wrote,yeah,year,years,yes,york,young,younger
0,neg/1821_4.txt,Working with one of the best Shakespeare sourc...,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,neg/10402_1.txt,"Well...tremors I, the original started off in ...",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,neg/1062_4.txt,Ouch! This one was a bit painful to sit throug...,False,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,neg/9056_1.txt,"I've seen some crappy movies in my life, but t...",False,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
4,neg/5392_3.txt,"""Carriers"" follows the exploits of two guys an...",False,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,pos/1328_10.txt,"Without ""mental anachronism"", this film which ...",True,0,2,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1996,pos/11024_9.txt,This movie is just great. It's entertaining fr...,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,pos/4065_10.txt,I've seen the original English version on vide...,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
1998,pos/10852_10.txt,"Hello, I was alanrickmaniac. I'm a Still Crazy...",True,0,1,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


### Sentiment Analysis with Logistic Regression

In [47]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(data, test_size = 0.3)

X_train = train_data.drop(['file_ID', 'raw_text', 'is_good'], axis = 1)
y_train = train_data['is_good']

X_test = test_data.drop(['file_ID', 'raw_text', 'is_good'], axis = 1)
y_test = test_data['is_good']

In [48]:
from sklearn.linear_model import LogisticRegression

LR = LogisticRegression()
LR.fit(X_train, y_train)
LR.score(X_train, y_train)

0.9964285714285714

In [49]:
from sklearn.model_selection import cross_val_score

cross_val_score(LR, X_train, y_train, cv = 5).mean()

0.7842857142857144

We see that we are overfitting the data above -- so we add in a regularization parameter that controls the complexity of logistic regression!

We will use trial and error to determine a good $\lambda$ value!

In [52]:
for Lambda in np.linspace(0.005, 0.05, 25):
    print(str(np.round(Lambda, 4)), end = ": ")
    LR = LogisticRegression(C = Lambda)
    cv_score = cross_val_score(LR, X_train, y_train, cv = 5).mean()
    print(np.round(cv_score, 3))

0.005: 0.793
0.0069: 0.796
0.0088: 0.799
0.0106: 0.801
0.0125: 0.803
0.0144: 0.804
0.0162: 0.807
0.0181: 0.809
0.02: 0.81
0.0219: 0.81
0.0238: 0.811
0.0256: 0.814
0.0275: 0.814
0.0294: 0.814
0.0312: 0.814
0.0331: 0.814
0.035: 0.814
0.0369: 0.816
0.0388: 0.816
0.0406: 0.816
0.0425: 0.816
0.0444: 0.816
0.0462: 0.816
0.0481: 0.816
0.05: 0.816


In [65]:
# Lets try lambda = 0.05!!

Lambda = 0.05
LR = LogisticRegression(C = Lambda)
LR.fit(X_train, y_train)

# testing our model!
LR.score(X_test, y_test)

0.8316666666666667

So, our simple logistic model is able to correctly identify vs. negative movie reviews about 83% of the time after implementing a regularization parameter of 0.05!

Did our model successfully determine positive and negative words? 

In [66]:
result_df = pd.DataFrame({"coef" : LR.coef_[0], "word" : X_train.columns})
result_df

Unnamed: 0,coef,word
0,-0.072825,ability
1,0.135662,able
2,0.040169,absolutely
3,-0.058536,accent
4,-0.031171,accidentally
...,...,...
1041,0.253032,years
1042,-0.073100,yes
1043,0.008791,york
1044,0.009784,young


In [67]:
# Most negative words
result_df.sort_values('coef', ascending = True).head(10)

Unnamed: 0,coef,word
1028,-0.663908,worst
995,-0.492605,waste
58,-0.432932,awful
85,-0.419547,boring
683,-0.414302,poor
782,-0.403122,script
911,-0.376109,terrible
874,-0.361498,stupid
584,-0.354994,minutes
107,-0.349907,care


In [68]:
# Most positive words
result_df.sort_values('coef', ascending = False).head(10)

Unnamed: 0,coef,word
289,0.496796,excellent
655,0.411074,perfect
542,0.403838,loved
92,0.329121,brilliant
1019,0.323709,wonderful
1025,0.310471,works
270,0.305323,enjoy
1026,0.300488,world
927,0.28398,today
421,0.282976,highly
