In [1]:
# Importing relevant packages
import os
import string
import re
import numpy as np
import pandas as pd

### Negative Reviews

In [2]:
nfilenames = [] # negative filenames
neg = "neg"

#negative file path
negFilePath = "../../../stanford-movie-review-dataset/train/neg/"
for root, dirs, files in os.walk(negFilePath):
    for f in files: 
        nfilenames.append(f)

### Positive Reviews

In [3]:
pfilenames = [] # positive filenames
pos = "pos"

#positive file path
posFilePath = "../../../stanford-movie-review-dataset/train/pos/"
for root, dirs, files in os.walk(posFilePath):
    for f in files: 
        pfilenames.append(f)

In [4]:
# Check
len(nfilenames), len(pfilenames)

(12500, 12500)

### Creating Data Set

In [5]:
data = pd.DataFrame()

#### File ID

In [6]:
file_id = []

# We only want a 1000 negative reviews for training
for i in range(1000):
    file_id.append(neg+"/"+nfilenames[i])

In [7]:
# We only want a 1000 positive reviews for training
for i in range(1000):
    file_id.append(pos+"/"+pfilenames[i])

In [8]:
# Check
len(file_id)

2000

In [12]:
# Append file ids to data frame
data["file_ID"] = file_id
# Check:
data.head()

Unnamed: 0,file_ID
0,neg/1821_4.txt
1,neg/10402_1.txt
2,neg/1062_4.txt
3,neg/9056_1.txt
4,neg/5392_3.txt


#### Raw Text

In [13]:
raw_text = []

# Negative Reviews
for i in range(1000):
    with open(negFilePath+nfilenames[i]) as f:
        review = f.readlines()
        review = review[0]
        raw_text.append(review)

In [14]:
# Positive reviews
for i in range(1000):
    with open(posFilePath+pfilenames[i]) as f:
        review = f.readlines()
        review = review[0]
        raw_text.append(review)

In [15]:
# Check
len(raw_text)

2000

In [16]:
# Append raw text to data frame
data['raw_text'] = raw_text

# Check:
data.tail()

Unnamed: 0,file_ID,raw_text
1995,pos/1328_10.txt,"Without ""mental anachronism"", this film which ..."
1996,pos/11024_9.txt,This movie is just great. It's entertaining fr...
1997,pos/4065_10.txt,I've seen the original English version on vide...
1998,pos/10852_10.txt,"Hello, I was alanrickmaniac. I'm a Still Crazy..."
1999,pos/7833_8.txt,"In some ways, The Wrath of Kriemhild surpasses..."


#### "Good" Target Column 
###### "True" for good review and "False" for not good review (i.e., bad review) 

In [29]:
good = []

for fid in list(data['file_ID']):
    if fid.split("/")[0] == "pos":
        good.append(True)
    else:
        good.append(False)

In [30]:
len(good)

2000

In [34]:
data['Good'] = good

data.head()

Unnamed: 0,file_ID,raw_text,Good
0,neg/1821_4.txt,Working with one of the best Shakespeare sourc...,False
1,neg/10402_1.txt,"Well...tremors I, the original started off in ...",False
2,neg/1062_4.txt,Ouch! This one was a bit painful to sit throug...,False
3,neg/9056_1.txt,"I've seen some crappy movies in my life, but t...",False
4,neg/5392_3.txt,"""Carriers"" follows the exploits of two guys an...",False


### Bag of Words

**We will use the `CountVectorizer` function to vectorize the raw textual data.**


The `max_df` parameter is used to remove terms that are too frequent. In this case, setting `max_df = 0.50` results in ignoring terms that are present in more than 50% of the documents. <br>
The `min_df` parameter is used to remove terms that are too infrequent. In this case, setting `min_df = 0.01` results in ignoring terms that are present in less than 1% of the documents. <br>
The `stop_words` parameter removes words like 'the', 'that', 'an', etc., that do not add much meaning to a particular piece of textual data. 

In [43]:
from sklearn.feature_extraction.text import CountVectorizer
vec = CountVectorizer(max_df = 0.5, min_df = 0.01, stop_words = 'english')

In [44]:
counts = vec.fit_transform(data['raw_text'])

# bag of words
bow = pd.DataFrame(counts.toarray(), columns = vec.get_feature_names())
bow # checking the bag of words data frame



Unnamed: 0,10,100,11,12,13,15,20,30,40,50,...,yeah,year,years,yes,york,young,younger,zero,zombie,zombies
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1,0,0,0,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1998,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
bow.columns # checking the columns

Index(['10', '100', '11', '12', '13', '15', '20', '30', '40', '50',
       ...
       'yeah', 'year', 'years', 'yes', 'york', 'young', 'younger', 'zero',
       'zombie', 'zombies'],
      dtype='object', length=1644)

In [46]:
# Drop all columns with non-alphabet characters
cols_to_drop = []
for name in bow.columns:
    # If name does not consist of letters
    # drop column from data
    if name.isalpha() == False:
        cols_to_drop.append(name)

In [48]:
bow = bow.drop(cols_to_drop, axis=1)
bow

Unnamed: 0,ability,able,absolute,absolutely,accent,accept,accident,accidentally,according,accurate,...,yeah,year,years,yes,york,young,younger,zero,zombie,zombies
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0,2,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1996,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1998,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [49]:
# Joining two dataframes
data = pd.concat((data, bow), axis = 1)
data

Unnamed: 0,file_ID,raw_text,Good,ability,able,absolute,absolutely,accent,accept,accident,...,yeah,year,years,yes,york,young,younger,zero,zombie,zombies
0,neg/1821_4.txt,Working with one of the best Shakespeare sourc...,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,neg/10402_1.txt,"Well...tremors I, the original started off in ...",False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,neg/1062_4.txt,Ouch! This one was a bit painful to sit throug...,False,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,neg/9056_1.txt,"I've seen some crappy movies in my life, but t...",False,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,neg/5392_3.txt,"""Carriers"" follows the exploits of two guys an...",False,1,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,pos/1328_10.txt,"Without ""mental anachronism"", this film which ...",True,0,2,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0
1996,pos/11024_9.txt,This movie is just great. It's entertaining fr...,True,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1997,pos/4065_10.txt,I've seen the original English version on vide...,True,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1998,pos/10852_10.txt,"Hello, I was alanrickmaniac. I'm a Still Crazy...",True,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [50]:
# saving data
data.to_csv("movie-reviews-vectorized-data.csv")