# Movie Reviews - Setup - AITAMALIK Amine

The purpose of the following code is to extract and split the reviews into a list of words which we will then use to create a vocabulary along with a Bag of Words representation. This will allow us to easily classify the reviews.
We here have 3 types of data:
- The first one is with a 1000 word vocabulary
- The second has a vocabulary of a 2000 words
- The last one has a vocabulary of 1000 words but the words have been stemmed and stopwords are discarded

### Imports

In [1]:
import numpy as np
import os
import collections
from nltk.stem import PorterStemmer

# PART 1 - NORMAL DATA

### Read the Data

In [2]:
PUNCT = "!#$%&()''*+-/.:;?@[]{}|^_`~<>=\"" # all punctuation we discard
TABLE = str.maketrans(PUNCT, " " * len(PUNCT)) # replace punctuation by space

def read_document(filename):
    f = open(filename, encoding="utf-8") # specify encoding to avoid unreadable documents
    text = f.read()
    f.close()
    
    text = text.lower() # all words to lowercase
    text = text.translate(TABLE)
    words = text.split() # separate the document into list of words
    
    return words

#### Test basic word splitting on one file

In [3]:
for f in os.listdir("aclImdb/test/pos"):
    path = "aclImdb/test/pos/" + f
    words = read_document(path)
    
    print(words)
    break

['i', 'went', 'and', 'saw', 'this', 'movie', 'last', 'night', 'after', 'being', 'coaxed', 'to', 'by', 'a', 'few', 'friends', 'of', 'mine', 'i', 'll', 'admit', 'that', 'i', 'was', 'reluctant', 'to', 'see', 'it', 'because', 'from', 'what', 'i', 'knew', 'of', 'ashton', 'kutcher', 'he', 'was', 'only', 'able', 'to', 'do', 'comedy', 'i', 'was', 'wrong', 'kutcher', 'played', 'the', 'character', 'of', 'jake', 'fischer', 'very', 'well,', 'and', 'kevin', 'costner', 'played', 'ben', 'randall', 'with', 'such', 'professionalism', 'the', 'sign', 'of', 'a', 'good', 'movie', 'is', 'that', 'it', 'can', 'toy', 'with', 'our', 'emotions', 'this', 'one', 'did', 'exactly', 'that', 'the', 'entire', 'theater', 'which', 'was', 'sold', 'out', 'was', 'overcome', 'by', 'laughter', 'during', 'the', 'first', 'half', 'of', 'the', 'movie,', 'and', 'were', 'moved', 'to', 'tears', 'during', 'the', 'second', 'half', 'while', 'exiting', 'the', 'theater', 'i', 'not', 'only', 'saw', 'many', 'women', 'in', 'tears,', 'but', 

## I. Build a Vocabulary

In [None]:
vocabulary = collections.Counter()

# positive reviews
for f in os.listdir("aclImdb/smalltrain/pos"):
    path = "aclImdb/smalltrain/pos/" + f
    words = read_document(path)
    
    vocabulary.update(words) # count words and add to vocabulary
    
# negative reviews
for f in os.listdir("aclImdb/smalltrain/neg"):
    path = "aclImdb/smalltrain/neg/" + f
    words = read_document(path)
    
    vocabulary.update(words) # count words and add to vocabulary

# Save vocabulary in "vocabulary.txt" file
f = open("vocabulary.txt", "w", encoding="utf-8")

for word, count in vocabulary.most_common(1000): # 1000 most common words
    print(word, file=f)
f.close()

In [4]:
def load_vocabulary(filename):
    f = open(filename, encoding="utf-8")
    text = f.read()
    f.close()
    words = text.split()
    
    # Create index for each word
    voc = {}
    index = 0
    for word in words:
        voc[word] = index
        index += 1
    
    return voc

In [19]:
vocabulary = load_vocabulary("vocabulary.txt")
list(vocabulary.items())[:5]

[('the', 0), ('a', 1), ('and', 2), ('of', 3), ('to', 4)]

## II. Bag of Words Representation

In [None]:
PUNCT = "!#$%&()''*+-/.:;?@[]{}|^_`~<>=\"" # all punctuation we discard
TABLE = str.maketrans(PUNCT, " " * len(PUNCT)) # replace punctuation by space

def read_document_bow(filename, voc):
    f = open(filename, encoding="utf-8") # specify encoding to avoid unreadable documents
    text = f.read()
    f.close()
    
    text = text.lower() # all words to lowercase
    text = text.translate(TABLE)
    words = text.split() # separate the document into list of words
    
    # Bag of Words
    bow = np.zeros(len(voc))
    for word in words:
        if word in voc:
            index = voc[word]
            bow[index] += 1
    
    return bow

#### Create Train Set

In [None]:
documents = []
labels = []

# Positive Reviews
for f in os.listdir("aclImdb/train/pos"):
    path = "aclImdb/train/pos/" + f
    bow = read_document_bow(path, vocabulary)
    documents.append(bow)
    labels.append(1)

# Negative Reviews
for f in os.listdir("aclImdb/train/neg"):
    path = "aclImdb/train/neg/" + f
    bow = read_document_bow(path, vocabulary)
    documents.append(bow)
    labels.append(0)

X = np.stack(documents)
Y = np.array(labels)

data = np.concatenate([X, Y[:, None]], 1)

np.savetxt("train.txt.gz", data) # Save into a file / .gz compresses the file

#### Create Test Set

In [None]:
documents = []
labels = []

# Positive Reviews
for f in os.listdir("aclImdb/test/pos"):
    path = "aclImdb/test/pos/" + f
    bow = read_document_bow(path, vocabulary)
    documents.append(bow)
    labels.append(1)

# Negative Reviews
for f in os.listdir("aclImdb/test/neg"):
    path = "aclImdb/test/neg/" + f
    bow = read_document_bow(path, vocabulary)
    documents.append(bow)
    labels.append(0)

X = np.stack(documents)
Y = np.array(labels)

data = np.concatenate([X, Y[:, None]], 1)

np.savetxt("test.txt.gz", data) # Save into a file / .gz compresses the file

# PART 2 - BIGGER VOCABULARY

In [20]:
bigger_vocabulary = collections.Counter()

# positive reviews
for f in os.listdir("aclImdb/smalltrain/pos"):
    path = "aclImdb/smalltrain/pos/" + f
    words = read_document(path)
    
    bigger_vocabulary.update(words) # count words and add to bigger_vocabulary
    
# negative reviews
for f in os.listdir("aclImdb/smalltrain/neg"):
    path = "aclImdb/smalltrain/neg/" + f
    words = read_document(path)
    
    bigger_vocabulary.update(words) # count words and add to bigger_vocabulary

# Save bigger_vocabulary in "bigger_vocabulary.txt" file
f = open("bigger_vocabulary.txt", "w", encoding="utf-8")

for word, count in bigger_vocabulary.most_common(2000): # 2000 most common words
    print(word, file=f)
f.close()

In [26]:
bigger_vocabulary = load_vocabulary("bigger_vocabulary.txt")
print(f"Length - First Vocabulary: {len(vocabulary)}")
print(f"Length - Bigger vocabulary: {len(bigger_vocabulary)}")

Length - First Vocabulary: 1000
Length - Bigger vocabulary: 2000


## II. Bag of Words Representation

#### Create Train Set

In [None]:
documents = []
labels = []

# Positive Reviews
for f in os.listdir("aclImdb/train/pos"):
    path = "aclImdb/train/pos/" + f
    bow = read_document_bow(path, bigger_vocabulary)
    documents.append(bow)
    labels.append(1)

# Negative Reviews
for f in os.listdir("aclImdb/train/neg"):
    path = "aclImdb/train/neg/" + f
    bow = read_document_bow(path, bigger_vocabulary)
    documents.append(bow)
    labels.append(0)

X = np.stack(documents)
Y = np.array(labels)

data = np.concatenate([X, Y[:, None]], 1)

np.savetxt("big_voc_train.txt.gz", data) # Save into a file / .gz compresses the file

#### Create Test Set

In [None]:
documents = []
labels = []

# Positive Reviews
for f in os.listdir("aclImdb/test/pos"):
    path = "aclImdb/test/pos/" + f
    bow = read_document_bow(path, bigger_vocabulary)
    documents.append(bow)
    labels.append(1)

# Negative Reviews
for f in os.listdir("aclImdb/test/neg"):
    path = "aclImdb/test/neg/" + f
    bow = read_document_bow(path, bigger_vocabulary)
    documents.append(bow)
    labels.append(0)

X = np.stack(documents)
Y = np.array(labels)

data = np.concatenate([X, Y[:, None]], 1)

np.savetxt("big_voc_test.txt.gz", data) # Save into a file / .gz compresses the file

# PART 3  - Without Stopwords - Stemmed

In [28]:
os.listdir("aclImdb/")
sw_path = "aclImdb/stopwords.txt"
stopwords = read_document(sw_path)

stopwords[:5]

['a', 'about', 'above', 'across', 'after']

## I. Stem and Discard Stopwords

In [32]:
def read_document_stem_sw(filename):
    words = read_document(filename)
    
    # Removing Stopwords
    words_without_sw = [i for i in words if i not in stopwords]
    
    #Stemming
    ps = PorterStemmer()
    stemmed = [ps.stem(word) for word in words_without_sw]
    
    return stemmed

#### Before

In [30]:
for f in os.listdir("aclImdb/smalltrain/pos"):
    path = "aclImdb/smalltrain/pos/" + f
    words = read_document(path)
    
    print(words)
    break

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life,', 'such', 'as', 'teachers', 'my', '35', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', 'high', 's', 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', 'teachers', 'the', 'scramble', 'to', 'survive', 'financially,', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', 'pomp,', 'the', 'pettiness', 'of', 'the', 'whole', 'situation,', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school,', 'i', 'immediately', 'recalled', 'at', 'high', 'a', 'classic', 'line', 'inspector', 'i', 'm', 'here', 'to', 'sack', 'one', 'of', 'your', 'teachers', 'student', 'welcome', '

#### After

In [33]:
for f in os.listdir("aclImdb/smalltrain/pos"):
    path = "aclImdb/smalltrain/pos/" + f
    words = read_document_stem_sw(path)
    
    print(words)
    break

['bromwel', 'high', 'cartoon', 'comedi', 'ran', 'time', 'program', 'school', 'life,', 'teacher', '35', 'year', 'teach', 'profess', 'lead', 'believ', 'bromwel', 'high', 's', 'satir', 'closer', 'realiti', 'teacher', 'scrambl', 'surviv', 'financially,', 'insight', 'student', 'right', 'pathet', 'teacher', 'pomp,', 'petti', 'situation,', 'remind', 'school', 'knew', 'student', 'saw', 'episod', 'student', 'repeatedli', 'tri', 'burn', 'school,', 'immedi', 'recal', 'high', 'classic', 'line', 'inspector', 'm', 'sack', 'teacher', 'student', 'welcom', 'bromwel', 'high', 'expect', 'adult', 'age', 'think', 'bromwel', 'high', 'far', 'fetch', 'piti', 'isn', 't']


## II. New Vocabulary

In [None]:
vocabulary_stem_sw = collections.Counter()

# positive reviews
for f in os.listdir("aclImdb/smalltrain/pos"):
    path = "aclImdb/smalltrain/pos/" + f
    words = read_document_stem_sw(path)
    
    vocabulary_stem_sw.update(words) # count words and add to vocabulary
    
# negative reviews
for f in os.listdir("aclImdb/smalltrain/neg"):
    path = "aclImdb/smalltrain/neg/" + f
    words = read_document_stem_sw(path)
    
    vocabulary_stem_sw.update(words) # count words and add to vocabulary
    
# Save vocabulary in "vocabulary_stem_sw.txt" file
f = open("vocabulary_stem_sw.txt", "w", encoding="utf-8")

for word, count in vocabulary_stem_sw.most_common(1000): # 1000 most common words
    print(word, file=f)
f.close()

In [35]:
vocabulary_stem_sw = load_vocabulary("vocabulary_stem_sw.txt")
list(vocabulary_stem_sw.items())[:5]

[('br', 0), ('s', 1), ('movi', 2), ('film', 3), ('t', 4)]

## III. Bag of Words - Train and Test Sets 

In [None]:
def read_document_stem_sw_bow(filename, voc):
    words = read_document(filename)
    
    # Removing Stopwords
    words_without_sw = [i for i in words if i not in stopwords]
    
    #Stemming
    stemmed = [ps.stem(word) for word in words_without_sw]
    
    # Bag of Words
    bow = np.zeros(len(voc))
    for word in words:
        if word in voc:
            index = voc[word]
            bow[index] += 1
    
    return bow

#### Train Set

In [None]:
documents = []
labels = []

# Positive Reviews
for f in os.listdir("aclImdb/train/pos"):
    path = "aclImdb/train/pos/" + f
    bow = read_document_stem_sw_bow(path, vocabulary_stem_sw)
    documents.append(bow)
    labels.append(1)

# Negative Reviews
for f in os.listdir("aclImdb/train/neg"):
    path = "aclImdb/train/neg/" + f
    bow = read_document_stem_sw_bow(path, vocabulary_stem_sw)
    documents.append(bow)
    labels.append(0)

X = np.stack(documents)
Y = np.array(labels)

data = np.concatenate([X, Y[:, None]], 1)

np.savetxt("train_stem_sw.txt.gz", data) # Save into a file / .gz compresses the file

#### Test Set

In [None]:
documents = []
labels = []

# Positive Reviews
for f in os.listdir("aclImdb/test/pos"):
    path = "aclImdb/test/pos/" + f
    bow = read_document_stem_sw_bow(path, vocabulary_stem_sw)
    documents.append(bow)
    labels.append(1)

# Negative Reviews
for f in os.listdir("aclImdb/test/neg"):
    path = "aclImdb/test/neg/" + f
    bow = read_document_stem_sw_bow(path, vocabulary_stem_sw)
    documents.append(bow)
    labels.append(0)

X = np.stack(documents)
Y = np.array(labels)

data = np.concatenate([X, Y[:, None]], 1)

np.savetxt("test_stem_sw.txt.gz", data) # Save into a file / .gz compresses the file