In [1]:
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from numpy import exp, array, random, dot
from sklearn.model_selection import train_test_split
from math import ceil
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from typing import List, Dict
import nltk
from nltk.corpus import stopwords
from nltk.tokenize.punkt import PunktSentenceTokenizer
from sklearn.cluster import KMeans
import time

# Seed NumPy's global random number generator with a fixed value.
np.random.seed(1)

In [2]:
# Download only the NLTK resources this notebook uses: the stop-word lists
# (read through nltk.corpus.stopwords) and the Punkt sentence tokenizer models
# (loaded from 'tokenizers/punkt/english.pickle').
# Calling nltk.download() without arguments opens the interactive downloader,
# which blocks an unattended "Restart & Run All".
nltk.download("stopwords")
nltk.download("punkt")

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

## Load data

The labeled data set consists of 50,000 IMDB movie reviews, specially selected for sentiment analysis. The sentiment of reviews is binary: an IMDB rating < 5 results in a sentiment score of 0, and a rating >= 7 results in a sentiment score of 1. No individual movie has more than 30 reviews. The 25,000-review labeled training set does not include any of the same movies as the 25,000-review test set. In addition, there are another 50,000 IMDB reviews provided without any rating labels.


In [3]:
# Labeled training reviews, parsed with pandas' default header and quoting rules.
train = pd.read_csv(
    "../data/word2vec-nlp/labeledTrainData.tsv",
    sep="\t",
)

# Unlabeled reviews and the test set: the first row is the header and
# quoting=3 (csv.QUOTE_NONE) keeps quote characters as ordinary text.
train_unlabeled = pd.read_csv(
    "../data/word2vec-nlp/unlabeledTrainData.tsv",
    header=0,
    sep="\t",
    quoting=3,
)
test = pd.read_csv(
    "../data/word2vec-nlp/testData.tsv",
    header=0,
    sep="\t",
    quoting=3,
)

# Report (rows, columns) of each frame.
print(f"Size of train dataset: {train.shape}")
print(f"Size of unlabeled train dataset: {train_unlabeled.shape}")
print(f"Size of test dataset: {test.shape}")

Size of train dataset: (25000, 3)
Size of unlabeled train dataset: (50000, 2)
Size of test dataset: (25000, 2)


In [4]:
# Preview the first rows of the labeled training set.
train.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [5]:
# Show the raw text of the review stored under index label 0.
train.loc[0, "review"]

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

## Preprocess data

Data preprocessing includes the following steps:
* Remove HTML tags
* Remove non-letter symbols        
* Convert to lower case, split into individual words
* Optionally remove stop words with the help of the NLTK library


In [6]:
# Cache of stop-word sets keyed by language, so the corpus file is read once
# instead of once per review.
_STOPWORD_CACHE: Dict[str, frozenset] = {}


def _english_stopwords() -> frozenset:
    """Returns NLTK's English stop words, loading the corpus only on first use."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def review_to_wordlist(raw_review: str, remove_stopwords: bool = False) -> List[str]:
    """Converts a raw review into a list of lower-case words.

    Args:
        raw_review: Review text, possibly containing HTML markup.
        remove_stopwords: If True, drop NLTK's English stop words.

    Returns:
        The words of the review in their original order, lower-cased and
        consisting only of the letters a-z.
    """
    # Name the parser explicitly: otherwise BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning about it.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()
    # Every non-letter character becomes a space, so it acts as a word separator.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    words = letters_only.lower().split()
    if remove_stopwords:
        stops = _english_stopwords()
        words = [w for w in words if w not in stops]
    return words

In [7]:
# Sanity-check the cleaner on the review stored under index label 0.
clean_review = review_to_wordlist(train.loc[0, "review"])
print(clean_review)

['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again', 'maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent', 'moonwalker', 'is', 'part', 'biography', 'part', 'feature', 'film', 'which', 'i', 'remember', 'going', 'to', 'see', 'at', 'the', 'cinema', 'when', 'it', 'was', 'originally', 'released', 'some', 'of', 'it', 'has', 'subtle', 'messages', 'about', 'mj', 's', 'feeling', 'towards', 'the', 'press', 'and', 'also', 'the', 'obvious', 'message', 'of', 'drugs', 'are', 'bad', 'm', 'kay', 'visually', 'impressive', 'but', 'of', 'course', 'this', 'is', 'all', 'about', 

In [8]:
# Clean every labeled review, reporting progress after each block of 5000.
num_reviews = len(train["review"])
clean_train_reviews = []

for i, raw_review in enumerate(train["review"]):
    if (i + 1) % 5000 == 0:
        print(f"Review {i+1} of {num_reviews}")
    clean_train_reviews.append(review_to_wordlist(raw_review))
print("All reviews have been cleaned out.")



Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000
Review 25000 of 25000
All reviews have been cleaned out.


In [9]:
# Load NLTK's pickled English Punkt sentence tokenizer from the resource path
# 'tokenizers/punkt/english.pickle'.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [10]:
# Define a function to split a review into parsed sentences
def review_to_sentences(review: str, tokenizer: PunktSentenceTokenizer,
                        remove_stopwords: bool = False) -> List[List[str]]:
    """Splits a review into sentences, each returned as a list of words.

    Args:
        review: Raw review text.
        tokenizer: Sentence tokenizer whose ``tokenize`` method splits the
            stripped text into sentence strings.
        remove_stopwords: Forwarded to ``review_to_wordlist``.

    Returns:
        One word list per non-empty sentence, in the order the tokenizer
        produced the sentences.
    """
    # Split the paragraph into sentence strings first; each sentence is then
    # cleaned and tokenized into words on its own.
    raw_sentences = tokenizer.tokenize(review.strip())

    return [
        review_to_wordlist(raw_sentence, remove_stopwords)
        for raw_sentence in raw_sentences
        if len(raw_sentence) > 0
    ]

In [11]:
# Flat list that receives the word-tokenized sentences of both corpora
sentences = []

# Each entry pairs the progress message with the reviews to parse
corpora = (
    ("Parsing sentences from training set", train["review"]),
    ("Parsing sentences from unlabeled set", train_unlabeled["review"]),
)

for message, reviews in corpora:
    print(message)
    for review in reviews:
        sentences.extend(review_to_sentences(review, tokenizer))


Parsing sentences from training set




Parsing sentences from unlabeled set


In [12]:
# Show the first parsed sentence (a list of words) as a sanity check.
print(sentences[0])

['with', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'mj', 'i', 've', 'started', 'listening', 'to', 'his', 'music', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watched', 'the', 'wiz', 'and', 'watched', 'moonwalker', 'again']


In [13]:
import logging
# Route gensim's progress messages to the notebook output with timestamps
logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO)

# Set values for various parameters
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
from gensim.models import word2vec
print("Training model...")
model = word2vec.Word2Vec(
    sentences, workers=num_workers, vector_size=num_features,
    min_count=min_word_count, window=context, sample=downsampling)

# Scale every word vector to unit length, in place. This is the effect the
# deprecated init_sims(replace=True) call had in gensim 4; it no longer saves
# memory, so the normalization is requested directly instead.
model.wv.unit_normalize_all()

# Save the model; it can be loaded later using Word2Vec.load()
model_name = "300features_40minwords_10context"
model.save(model_name)

2022-10-01 20:01:36,808 : INFO : collecting all words and their counts
2022-10-01 20:01:36,809 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-10-01 20:01:36,832 : INFO : PROGRESS: at sentence #10000, processed 225803 words, keeping 17776 word types
2022-10-01 20:01:36,858 : INFO : PROGRESS: at sentence #20000, processed 451842 words, keeping 24946 word types
2022-10-01 20:01:36,882 : INFO : PROGRESS: at sentence #30000, processed 671054 words, keeping 30029 word types


Training model...


2022-10-01 20:01:36,907 : INFO : PROGRESS: at sentence #40000, processed 897625 words, keeping 34348 word types
2022-10-01 20:01:36,932 : INFO : PROGRESS: at sentence #50000, processed 1120159 words, keeping 37805 word types
2022-10-01 20:01:36,957 : INFO : PROGRESS: at sentence #60000, processed 1340914 words, keeping 40769 word types
2022-10-01 20:01:36,978 : INFO : PROGRESS: at sentence #70000, processed 1564764 words, keeping 43362 word types
2022-10-01 20:01:37,010 : INFO : PROGRESS: at sentence #80000, processed 1784083 words, keeping 45745 word types
2022-10-01 20:01:37,036 : INFO : PROGRESS: at sentence #90000, processed 2007591 words, keeping 48167 word types
2022-10-01 20:01:37,059 : INFO : PROGRESS: at sentence #100000, processed 2228932 words, keeping 50220 word types
2022-10-01 20:01:37,086 : INFO : PROGRESS: at sentence #110000, processed 2449108 words, keeping 52105 word types
2022-10-01 20:01:37,114 : INFO : PROGRESS: at sentence #120000, processed 2671053 words, keepin

2022-10-01 20:01:38,762 : INFO : PROGRESS: at sentence #760000, processed 17015183 words, keeping 121044 word types
2022-10-01 20:01:38,791 : INFO : PROGRESS: at sentence #770000, processed 17244246 words, keeping 121775 word types
2022-10-01 20:01:38,821 : INFO : PROGRESS: at sentence #780000, processed 17473909 words, keeping 122466 word types
2022-10-01 20:01:38,852 : INFO : PROGRESS: at sentence #790000, processed 17701995 words, keeping 123164 word types
2022-10-01 20:01:38,865 : INFO : collected 123504 word types from a corpus of 17798269 raw words and 794335 sentences
2022-10-01 20:01:38,866 : INFO : Creating a fresh vocabulary
2022-10-01 20:01:38,917 : INFO : Word2Vec lifecycle event {'msg': 'effective_min_count=40 retains 16490 unique words (13.35% of original 123504, drops 107014)', 'datetime': '2022-10-01T20:01:38.917141', 'gensim': '4.2.0', 'python': '3.10.6 (tags/v3.10.6:9c7b4bd, Aug  1 2022, 21:53:49) [MSC v.1932 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 

2022-10-01 20:02:16,071 : INFO : Word2Vec lifecycle event {'fname_or_handle': '300features_40minwords_10context', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2022-10-01T20:02:16.071527', 'gensim': '4.2.0', 'python': '3.10.6 (tags/v3.10.6:9c7b4bd, Aug  1 2022, 21:53:49) [MSC v.1932 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22000-SP0', 'event': 'saving'}
2022-10-01 20:02:16,071 : INFO : not storing attribute cum_table
2022-10-01 20:02:16,100 : INFO : saved 300features_40minwords_10context


In [14]:
# Inspect the learned embedding: words most similar to "man" with their similarity scores.
print(model.wv.most_similar("man"))

[('woman', 0.6104881763458252), ('lad', 0.5949812531471252), ('lady', 0.5708168745040894), ('millionaire', 0.5419608354568481), ('farmer', 0.5168347954750061), ('monk', 0.5143375396728516), ('guy', 0.5069648623466492), ('person', 0.5006126761436462), ('men', 0.4968082010746002), ('soldier', 0.49505871534347534)]


In [15]:
def get_feature_vector(words: List[str], model: "word2vec.Word2Vec", num_features: int) -> np.ndarray:
    """Averages the word vectors of all in-vocabulary words of a paragraph.

    Args:
        words: Tokens of one review.
        model: Trained model; only ``model.wv`` is used.
        num_features: Length of the returned vector; expected to equal the
            dimensionality of the model's word vectors.

    Returns:
        A float32 array of shape (num_features,). It is all zeros when none
        of the words is in the model's vocabulary.
    """
    # Pre-initialize the accumulator (for speed)
    feature_vector = np.zeros((num_features,), dtype="float32")

    # key_to_index maps every vocabulary word to its row, so membership tests
    # are constant-time without rebuilding a set on every call.
    vocabulary = model.wv.key_to_index

    nwords = 0
    for word in words:
        if word in vocabulary:
            nwords += 1
            feature_vector += model.wv[word]

    if nwords == 0:
        # Dividing by zero would fill the vector with NaN, which downstream
        # estimators reject; an all-zero vector is returned instead.
        return feature_vector

    return feature_vector / nwords


def get_avg_feature_vectors(reviews: List[List[str]], model: word2vec.Word2Vec, num_features: int) -> np.array:
    """Builds a matrix holding one averaged word vector per review.

    Row ``i`` of the returned float32 array of shape
    ``(len(reviews), num_features)`` is the result of ``get_feature_vector``
    for ``reviews[i]``. A progress line is printed every 5000 reviews.
    """
    total = len(reviews)
    averaged = np.zeros((total, num_features), dtype="float32")

    for row, tokens in enumerate(reviews):
        if row % 5000 == 0:
            print("Review %d of %d" % (row, total))
        averaged[row] = get_feature_vector(tokens, model, num_features)

    return averaged

In [16]:
# Build averaged word-vector features for the labeled and the test reviews,
# with stop words removed from both.

clean_train_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in train["review"]
]
trainDataVecs = get_avg_feature_vectors(clean_train_reviews, model, num_features)

print("Creating average feature vecs for test reviews")
clean_test_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in test["review"]
]
test_data_vectors = get_avg_feature_vectors(clean_test_reviews, model, num_features)



Review 0 of 25000
Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000
Creating average feature vecs for test reviews
Review 0 of 25000
Review 5000 of 25000
Review 10000 of 25000
Review 15000 of 25000
Review 20000 of 25000


## Build a model


Let's use a Random Forest classifier with 100 trees:

In [17]:
# Train a 100-tree random forest on the averaged word vectors; fit() returns
# the fitted estimator, so construction and fitting are chained.
print("Fitting a random forest to labeled training data...")
forest = RandomForestClassifier(n_estimators=100).fit(trainDataVecs, train["sentiment"])

Fitting a random forest to labeled training data...


## Submit the prediction

In [18]:
# Predict a sentiment label for every test review
result = forest.predict(test_data_vectors)

# Pair each test id with its prediction and write the file without any
# quoting added by the CSV writer (quoting=3)
submission = pd.DataFrame({"id": test["id"], "sentiment": result})
submission.to_csv(
    "../data/word2vec-nlp/Word2Vec_AverageVectors.csv",
    index=False,
    quoting=3,
)
submission.head()

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",1
3,"""7186_2""",0
4,"""12128_7""",1


## Build a model (Clustering)


In [19]:
start = time.time()

# Choose k (num_clusters) as one fifth of the vocabulary size, i.e. about
# five words per cluster on average
word_vectors = model.wv.vectors
num_clusters = len(word_vectors) // 5

# Fit k-means on the word vectors and keep the cluster label of each vector
kmeans_clustering = KMeans(n_clusters=num_clusters)
idx = kmeans_clustering.fit_predict(word_vectors)

print(f"K Means clustering took {time.time() - start:.2f} seconds.")

K Means clustering took 938.73 seconds.


In [20]:
# Look-up table from each vocabulary word to the label k-means assigned to it
word_centroid_map = {
    word: cluster_id for word, cluster_id in zip(model.wv.index_to_key, idx)
}

In [21]:
# For the first 10 clusters, list the vocabulary words assigned to each.
# One pass over the mapping per cluster replaces rebuilding list(values())
# and list(keys()) for every vocabulary index, which was quadratic in the
# vocabulary size. Word order within a cluster is unchanged.
for cluster in range(0, 10):
    print(f"\nCluster {cluster}")
    words = [
        word
        for word, centroid in word_centroid_map.items()
        if centroid == cluster
    ]
    print(words)


Cluster 0
['fulfilled', 'purity']

Cluster 1
['fields', 'shanghai', 'buffalo', 'stagecoach', 'ka', 'monument']

Cluster 2
['fatal', 'drowning', 'unconscious', 'suicidal', 'casually', 'paralyzed', 'comatose', 'poisoning', 'katrina', 'gutted']

Cluster 3
['bateman']

Cluster 4
['vacuous', 'mismatched', 'colorless']

Cluster 5
['background', 'bands', 'instruments']

Cluster 6
['nielsen', 'caron', 'comedienne']

Cluster 7
['julian', 'noah', 'domino', 'shane', 'milo', 'einstein', 'meadows', 'reyes']

Cluster 8
['drive', 'driving', 'drives', 'passes', 'drove']

Cluster 9
['caught', 'picked', 'signed']


In [22]:
def create_bag_of_centroids(wordlist: List[str], word_centroid_map: Dict[str, int]) -> np.array:
    """
    Counts, per cluster, how many words of the review belong to it.

    Words missing from ``word_centroid_map`` are ignored. The result is a
    float32 vector with one entry per cluster index, from 0 up to the largest
    index found in the map.
    """

    centroid_count = 1 + max(word_centroid_map.values())
    counts = np.zeros(centroid_count, dtype="float32")

    for token in wordlist:
        cluster_id = word_centroid_map.get(token)
        if cluster_id is not None:
            counts[cluster_id] += 1

    return counts

In [23]:
# Bag-of-centroids features for the training reviews: the matrix is allocated
# up front (for speed) and filled one row per review
train_centroids = np.zeros((len(train["review"]), num_clusters), dtype="float32")
for row, review in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

# Same transformation for the test reviews
test_centroids = np.zeros((len(test["review"]), num_clusters), dtype="float32")
for row, review in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

## Submit the prediction (Clustering)

In [24]:
# Train a fresh 100-tree random forest on the bag-of-centroids features.
# Fitting the forest may take a few minutes
print("Fitting a random forest to labeled training data...")
forest = RandomForestClassifier(n_estimators=100).fit(train_centroids, train["sentiment"])

Fitting a random forest to labeled training data...


In [25]:
# Predict a sentiment label for every test review from its bag of centroids
result = forest.predict(test_centroids)

# Pair each test id with its prediction and write the file without any
# quoting added by the CSV writer (quoting=3)
submission_clustering = pd.DataFrame({"id": test["id"], "sentiment": result})
submission_clustering.to_csv(
    "../data/word2vec-nlp/BagOfCentroids.csv",
    index=False,
    quoting=3,
)
submission_clustering.head()

Unnamed: 0,id,sentiment
0,"""12311_10""",1
1,"""8348_2""",0
2,"""5828_4""",1
3,"""7186_2""",0
4,"""12128_7""",1
