# HW3 - Q1 Xianghui Gu (xgu72, 903248583)

### 1. Import data

- Read data from labeledTrainData, testData and unlabeledTrainData
- This blog uses labeledTrainData as training set and testData as testing set for submission
- There are 25000 labeled train reviews, 25000 labeled test reviews, and 50000 unlabeled reviews

In [1]:
import pandas as pd
import numpy as np
# Load the labeled training reviews from the tab-separated file.
# quoting=3 is csv.QUOTE_NONE: quote characters inside reviews are kept as-is.
train = pd.read_csv(
    "labeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

In [2]:
# Load the test reviews with the same TSV settings as the training file.
test = pd.read_csv(
    "testData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

In [3]:
# Load the unlabeled reviews with the same TSV settings as the other files.
unlabeled_train = pd.read_csv(
    "unlabeledTrainData.tsv",
    header=0,
    delimiter="\t",
    quoting=3,
)

- Save the training labels (the `sentiment` column of `train`) to `label.csv` for later use

In [86]:
# Write only the "sentiment" column of the training frame, without the index.
train.to_csv(
    'label.csv',
    columns=["sentiment"],
    index=False,
)

### 2. String data cleaning

- Clean html, non-letter characters, stop words
- lower case all letters
- split words

In [5]:
# Import various modules for string cleaning
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords

def review_to_wordlist( review, remove_stopwords=False ):
    """Convert one document into a list of lower-case words.

    Parameters
    ----------
    review : str
        Raw text, possibly containing HTML markup.
    remove_stopwords : bool, optional
        When True, drop every word found in NLTK's English stop-word list
        (False by default).

    Returns
    -------
    list of str
        The words of the document, in their original order.
    """
    # 1. Remove HTML (the parser is named explicitly to avoid a bs4 warning)
    review_text = BeautifulSoup(review, "html.parser").get_text()
    #
    # 2. Replace every non-letter character with a space
    review_text = re.sub("[^a-zA-Z]", " ", review_text)
    #
    # 3. Convert words to lower case and split them
    words = review_text.lower().split()
    #
    # 4. Optionally remove stop words
    if remove_stopwords:
        # This function is called once per sentence, so the stop-word set is
        # built on the first call only and cached on the function object
        # instead of being reloaded from the corpus every time.
        stops = getattr(review_to_wordlist, "_english_stops", None)
        if stops is None:
            stops = frozenset(stopwords.words("english"))
            review_to_wordlist._english_stops = stops
        words = [w for w in words if w not in stops]
    #
    # 5. Return a list of words
    return words

- Use the NLTK English punkt tokenizer to split each review into sentences; each sentence is then converted into a list of words

In [6]:
# Download the punkt tokenizer for sentence splitting
import nltk.data
# nltk.download()   

# Load the English punkt sentence tokenizer through nltk.data.load.
tokenizer = nltk.data.load(
    'tokenizers/punkt/english.pickle'
)

# Define a function to split a review into parsed sentences
def review_to_sentences( review, tokenizer, remove_stopwords=False ):
    """Split a review into sentences and turn each sentence into a word list.

    `tokenizer.tokenize` is applied to the stripped review text; every
    non-empty piece it returns is passed to `review_to_wordlist` together
    with `remove_stopwords`.

    Returns a list of sentences, where each sentence is a list of words
    (i.e. a list of lists).
    """
    pieces = tokenizer.tokenize(review.strip())
    return [
        review_to_wordlist(piece, remove_stopwords)
        for piece in pieces
        if len(piece) > 0   # skip empty sentences
    ]

- Here, we use training set (labeled data) only.
- In addition, we remove stop words, for consistency with the methods that follow.
- After breaking down sentences, we obtain 266551 sentences.

In [7]:
# Accumulate the tokenized sentences of every labeled training review
# (stop words removed).
sentences = []

print("Parsing sentences from training set")
for review in train["review"]:
    sentences.extend(review_to_sentences(review, tokenizer, True))

# The unlabeled reviews are deliberately left out of the corpus here; to add
# them, extend `sentences` the same way over unlabeled_train["review"].

Parsing sentences from training set


  'Beautiful Soup.' % markup)
  ' that document to Beautiful Soup.' % decoded_markup


In [8]:
# Number of sentences collected from the training reviews.
print(len(sentences))

266551


### 3. Word to Vector

In [9]:
# Configure the standard logging module so that Word2Vec's INFO-level
# progress messages are shown with a timestamp and level name.
import logging

logging.basicConfig(
    format='%(asctime)s : %(levelname)s : %(message)s',
    level=logging.INFO,
)

- Set parameters

In [10]:
# Word2Vec hyper-parameters
num_features = 100      # word vector dimensionality
min_word_count = 40     # minimum word count
num_workers = 4         # number of threads to run in parallel
context = 5             # context window size
# downsampling = 1e-3   # downsample setting for frequent words (not used)


- Train the word2vec model

In [11]:
# Fit Word2Vec on the tokenized sentences (this will take some time).
# NOTE(review): `size=` is the pre-4.0 gensim keyword; confirm against the
# installed gensim version before upgrading.
from gensim.models import word2vec

print("Training model...")
model = word2vec.Word2Vec(
    sentences,
    workers=num_workers,
    size=num_features,
    min_count=min_word_count,
    window=context,
)


2017-04-17 15:53:15,360 : INFO : 'pattern' package not found; tag filters are not available for English
2017-04-17 15:53:15,367 : INFO : collecting all words and their counts
2017-04-17 15:53:15,368 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2017-04-17 15:53:15,400 : INFO : PROGRESS: at sentence #10000, processed 114931 words, keeping 17627 word types
2017-04-17 15:53:15,430 : INFO : PROGRESS: at sentence #20000, processed 228988 words, keeping 24797 word types
2017-04-17 15:53:15,460 : INFO : PROGRESS: at sentence #30000, processed 339533 words, keeping 29883 word types
2017-04-17 15:53:15,490 : INFO : PROGRESS: at sentence #40000, processed 453983 words, keeping 34196 word types
2017-04-17 15:53:15,517 : INFO : PROGRESS: at sentence #50000, processed 565006 words, keeping 37609 word types
2017-04-17 15:53:15,550 : INFO : PROGRESS: at sentence #60000, processed 676637 words, keeping 40571 word types


Training model...


2017-04-17 15:53:15,581 : INFO : PROGRESS: at sentence #70000, processed 789005 words, keeping 43180 word types
2017-04-17 15:53:15,614 : INFO : PROGRESS: at sentence #80000, processed 899771 words, keeping 45561 word types
2017-04-17 15:53:15,648 : INFO : PROGRESS: at sentence #90000, processed 1013453 words, keeping 47982 word types
2017-04-17 15:53:15,681 : INFO : PROGRESS: at sentence #100000, processed 1125135 words, keeping 50054 word types
2017-04-17 15:53:15,712 : INFO : PROGRESS: at sentence #110000, processed 1236261 words, keeping 51928 word types
2017-04-17 15:53:15,747 : INFO : PROGRESS: at sentence #120000, processed 1348541 words, keeping 53966 word types
2017-04-17 15:53:15,780 : INFO : PROGRESS: at sentence #130000, processed 1461911 words, keeping 55694 word types
2017-04-17 15:53:15,810 : INFO : PROGRESS: at sentence #140000, processed 1568503 words, keeping 57193 word types
2017-04-17 15:53:15,846 : INFO : PROGRESS: at sentence #150000, processed 1682622 words, keep

- Save the word2vec model

In [84]:

# A descriptive file name built from the hyper-parameters; the saved model
# can be reloaded later with Word2Vec.load().
model_name = "100features_40minwords_5context_training"

# No further training is planned, so call init_sims(replace=True) to make
# the model much more memory-efficient before writing it out.
model.init_sims(replace=True)
model.save(model_name)

2017-04-17 14:29:33,052 : INFO : precomputing L2-norms of word weight vectors
2017-04-17 14:29:33,129 : INFO : saving Word2Vec object under 100features_40minwords_5context_training, separately None
2017-04-17 14:29:33,130 : INFO : not storing attribute syn0norm
2017-04-17 14:29:33,131 : INFO : not storing attribute cum_table
2017-04-17 14:29:33,231 : INFO : saved 100features_40minwords_5context_training


### 4. Numeric Representations of Words

In [2]:
# Reload the Word2Vec model that was saved under this name above.
from gensim.models import Word2Vec

model = Word2Vec.load(
    "100features_40minwords_5context_training"
)

- get word_vectors (a matrix of word, word features) with shape: (8160, 100)
- set the num_clusters to 10 for k-means

In [3]:
# Number of k-means clusters to fit.
num_clusters = 10
# Word-vector matrix of the trained model (one row per vocabulary word).
word_vectors = model.wv.syn0

- fit k-means model using word_vectors as features

In [4]:
from sklearn.cluster import KMeans
import time

start = time.time() # Start time

# Initialize a k-means object and use it to extract centroids.
# random_state is fixed so that the cluster ids, and every feature derived
# from them further down, are reproducible across runs.
kmeans_clustering = KMeans( n_clusters = num_clusters, random_state = 0 )
# idx[i] is the cluster id assigned to row i of word_vectors
idx = kmeans_clustering.fit_predict( word_vectors )

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
print ("Time taken for K Means clustering: ", elapsed, "seconds.")

Time taken for K Means clustering:  1.377249002456665 seconds.


- index2word converts the index to real word
- create word_centroid_map to record (word, classification)

In [5]:
# Word -> cluster-number dictionary: pair each vocabulary word with the
# cluster id found at the same position in `idx`.
word_centroid_map = {
    vocab_word: cluster_id
    for vocab_word, cluster_id in zip(model.wv.index2word, idx)
}

- For each cluster, list the twenty words closest to its centroid

(We assume that the 20 points closest to the centroid of cluster $i$ all belong to cluster $i$, and we check this inside the for loop. If the data violates the assumption, as in the Google dataset, we have to extract the points of each cluster first.)

In [64]:
# For every cluster, print the (up to) twenty vocabulary words that lie
# closest to its centroid.
#
# The word-to-centroid distance matrix does not depend on the cluster being
# printed, so it is computed once instead of once per loop iteration.
centroid_dists = kmeans_clustering.transform(word_vectors)
# Never ask for more words than exist.
top_n = min(20, centroid_dists.shape[0])

# One column of centroid_dists per fitted cluster.
for cluster in range(centroid_dists.shape[1]):
    #
    # Print the cluster number
    print ("\nCluster %d" % cluster)
    #
    # Distance of every word to the centroid of this cluster
    dist = centroid_dists[:, cluster]
    # Indices of the top_n smallest distances (not sorted among themselves).
    # np.argpartition needs kth < len(dist); otherwise take everything.
    if top_n < len(dist):
        close_ind = np.argpartition(dist, top_n)[:top_n]
    else:
        close_ind = np.arange(len(dist))
    # Keep only the words that were actually assigned to this cluster; this
    # checks the assumption that the nearest words all belong to it.
    words = []
    for ind in close_ind:
        word = model.wv.index2word[ind]
        if word_centroid_map[word] == cluster:
            words.append(word)
    if len(words) != 20:
        print("Cluster is not well-behaved")
    print (words)
    


Cluster 0
['lang', 'brazilian', 'earliest', 'regarded', 'ealing', 'fifties', 'imho', 'landmark', 'manga', 'masterpieces', 'influential', 'kurosawa', 'sf', 'laputa', 'sergio', 'comparable', 'update', 'bakshi', 'acclaim', 'fellini']

Cluster 1
['ambiance', 'enhance', 'enhanced', 'layered', 'pleasing', 'delicious', 'strained', 'unusually', 'coupled', 'moody', 'understated', 'framing', 'lively', 'tones', 'distinctive', 'maintains', 'mesmerizing', 'precise', 'rendering', 'combining']

Cluster 2
['client', 'protective', 'scheming', 'frankie', 'warden', 'warns', 'creasy', 'avenge', 'salesman', 'threatened', 'convict', 'preacher', 'rescued', 'politician', 'befriends', 'estranged', 'pursued', 'slave', 'ritual', 'buys']

Cluster 3
['keith', 'glenda', 'moss', 'wright', 'gilbert', 'plummer', 'lauren', 'neal', 'burgess', 'miranda', 'gloria', 'downey', 'tyler', 'arkin', 'everett', 'kathleen', 'bauer', 'ian', 'thelma', 'armstrong']

Cluster 4
['sharks', 'racing', 'knocks', 'axe', 'carpet', 'wheel', 

- Convert a word list (one cleaned review) into the number of its words that fall in each cluster

In [17]:
def create_bag_of_centroids( wordlist, word_centroid_map, num_centroids=None ):
    """Count how many words of `wordlist` fall into each cluster.

    Parameters
    ----------
    wordlist : iterable of str
        The words of one document.
    word_centroid_map : dict
        Maps a vocabulary word to its (0-based) cluster index.
    num_centroids : int, optional
        Length of the returned vector. When omitted it is derived as the
        highest cluster index in `word_centroid_map` plus one; pass it
        explicitly to avoid rescanning the whole map on every call.

    Returns
    -------
    numpy.ndarray
        float32 vector of length `num_centroids`; entry i is the number of
        words in `wordlist` mapped to cluster i. Words missing from the
        map are ignored.
    """
    if num_centroids is None:
        # default=-1 turns an empty map into a zero-length vector instead of
        # letting max() raise ValueError.
        num_centroids = max( word_centroid_map.values(), default=-1 ) + 1
    #
    # Pre-allocate the bag of centroids vector (for speed)
    bag_of_centroids = np.zeros( num_centroids, dtype="float32" )
    #
    # Increment the count of the cluster each known word belongs to
    for word in wordlist:
        index = word_centroid_map.get(word)
        if index is not None:
            bag_of_centroids[index] += 1
    #
    # Return the "bag of centroids"
    return bag_of_centroids

- clean train and test dataset

In [18]:
# Turn every training review into a stop-word-free word list.
clean_train_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in train["review"]
]

# Same cleaning for the test reviews.
clean_test_reviews = [
    review_to_wordlist(review, remove_stopwords=True)
    for review in test["review"]
]


- save the cleaned version of train and test data 

In [57]:
import csv

In [59]:
# One CSV row per review, one cell per word.
# newline='' is what the csv module requires of files it writes to; without
# it, platforms that translate "\n" end up with an extra "\r" on every row.
with open("clean_train_reviews.csv", 'w', newline='') as f:
    csv.writer(f).writerows(clean_train_reviews)

In [60]:
# One CSV row per review, one cell per word.
# newline='' is what the csv module requires of files it writes to; without
# it, platforms that translate "\n" end up with an extra "\r" on every row.
with open("clean_test_reviews.csv", 'w', newline='') as f:
    csv.writer(f).writerows(clean_test_reviews)

- create bag of centroids for both train and test dataset

In [21]:
# Bag-of-centroids matrix for the training set: one row per review, one
# column per cluster, pre-allocated for speed.
train_centroids = np.zeros(
    (train["review"].size, num_clusters),
    dtype="float32",
)

# Fill each row with the cluster counts of the corresponding cleaned review.
for row, review in enumerate(clean_train_reviews):
    train_centroids[row] = create_bag_of_centroids(review, word_centroid_map)


In [22]:
# Bag-of-centroids matrix for the test set: one row per review, one column
# per cluster.
test_centroids = np.zeros(
    (test["review"].size, num_clusters),
    dtype="float32",
)

# Fill each row with the cluster counts of the corresponding cleaned review.
for row, review in enumerate(clean_test_reviews):
    test_centroids[row] = create_bag_of_centroids(review, word_centroid_map)

- save the processed data (# counts in each cluster per record) as X1

In [23]:
# Write both feature matrices as comma-separated text.
np.savetxt('X1.out', train_centroids, delimiter=',')
np.savetxt('X1_test.out', test_centroids, delimiter=',')

### 5. Pre-trained data from Google

In [19]:
# Load Google's pre-trained Word2Vec model.
from gensim.models.keyedvectors import KeyedVectors

In [20]:
# Read the pre-trained vectors from the binary word2vec-format file.
google_model = KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

- The dataset has shape (3000000, 300), i.e. 3,000,000 words with 300 features each.
That is far too large for plain k-means. For comparability with method 1, we therefore keep only the words that also appear in the method 1 vocabulary.

In [21]:
# Vocabulary of the model trained above, as a set for fast membership tests.
training_words = set(model.wv.index2word)
# Vocabulary of the Google model, as an index-ordered list.
google_words = google_model.index2word
# Size of the training vocabulary (displayed as the cell output).
numWordsTotal = len(training_words)
numWordsTotal

8160

- After intersecting with the training word list, there are 7810 words left. The loss might come from difference in stop words.

In [22]:
# Walk Google's vocabulary in its stored order and keep the words that also
# occur in the training vocabulary.
intersection = []        # positions of the kept words in the Google vocabulary
intersection_words = []  # the kept words, aligned with `intersection`
for pos, word in enumerate(google_words):
    if word in training_words:
        intersection.append(pos)
        intersection_words.append(word)
        # Stop as soon as every training word has been matched. The earlier
        # `>` comparison could never be true (assuming Google's vocabulary
        # holds no duplicate entries), so the scan always ran to the end.
        if len(intersection) >= numWordsTotal:
            break
len(intersection)

7810

- extract feature vectors of the intersected words

In [23]:
# Rows of the Google vector matrix that belong to the intersected words.
google_word_vectors = google_model.syn0[intersection, :]

- fit k-means for google dataset

In [40]:
start = time.time() # Start time

# Initialize a k-means object and use it to extract centroids.
# random_state is fixed so that the cluster ids, and every feature derived
# from them further down, are reproducible across runs.
google_kmeans_clustering = KMeans( n_clusters = num_clusters, random_state = 0 )
# google_idx[i] is the cluster id assigned to row i of google_word_vectors
google_idx = google_kmeans_clustering.fit_predict( google_word_vectors )

# Get the end time and print how long the process took
end = time.time()
elapsed = end - start
print ("Time taken for K Means clustering: ", elapsed, "seconds.")

Time taken for K Means clustering:  5.314241886138916 seconds.


In [41]:
# Word -> cluster-number dictionary for the intersected Google vocabulary.
google_word_centroid_map = {
    vocab_word: cluster_id
    for vocab_word, cluster_id in zip(intersection_words, google_idx)
}

- Obtain the twenty words closest to the centroid of each cluster

In [52]:
# build word list and index list for each cluster
# the index is w.r.t. the intersection_words
google_cluster_words = [[] for cluster in range(0,10)]
google_cluster_word_indices = [[] for cluster in range(0,10)]
for new_pos, word in enumerate(intersection_words):
    c = google_word_centroid_map[word]
    google_cluster_words[c].append(word)
    google_cluster_word_indices[c].append(new_pos)

In [63]:
# For every cluster, print the (up to) twenty of its member words that lie
# closest to its centroid.
#
# The word-to-centroid distance matrix does not depend on the cluster being
# printed, so it is computed once instead of once per loop iteration.
google_centroid_dists = google_kmeans_clustering.transform(google_word_vectors)

# Iterate over the per-cluster lists built earlier rather than a fixed 10.
for cluster in range(len(google_cluster_word_indices)):
    #
    # Print the cluster number
    print ("\nGoogle Cluster %d" % cluster)
    #
    # Distances to this centroid, restricted to the words of this cluster
    member_rows = np.array(google_cluster_word_indices[cluster], dtype='int64')
    dist_in_cluster = google_centroid_dists[member_rows, cluster]
    # Indices of the 20 smallest distances (not sorted among themselves).
    # np.argpartition needs kth < len(...); a cluster with 20 or fewer
    # members is therefore taken whole instead of raising.
    if len(dist_in_cluster) > 20:
        close_ind = np.argpartition(dist_in_cluster, 20)[:20]
    else:
        close_ind = np.arange(len(dist_in_cluster))
    # Collect the words, re-checking that each one is mapped to this cluster
    words = []
    for ind in close_ind:
        word = google_cluster_words[cluster][ind]
        if google_word_centroid_map[word] == cluster:
            words.append(word)
    if len(words) != 20:
        print("Cluster is not well-behaved")
    print (words)


Google Cluster 0
['sense', 'philo', 'great', 'phantasm', 'genuinely', 'life', 'genuine', 'surely', 'brilliant', 'truly', 'moment', 'finally', 'true', 'world', 'happily', 'love', 'glorious', 'undeniably', 'mind', 'inspiring']

Google Cluster 1
['boyer', 'clooney', 'akshay', 'also', 'kidman', 'travolta', 'even', 'sheridan', 'one', 'darth', 'erika', 'actually', 'corbett', 'mgm', 'however', 'though', 'vance', 'although', 'either', 'dalton']

Google Cluster 2
['claus', 'nicholson', 'gunga', 'sutherland', 'jabba', 'vivian', 'downey', 'jodie', 'marcel', 'mccoy', 'kaufman', 'aniston', 'kirsten', 'locke', 'scarface', 'cunningham', 'atlantis', 'vader', 'nora', 'casper']

Google Cluster 3
['man', 'old', 'young', 'people', 'mother', 'family', 'father', 'someone', 'person', 'men', 'woman', 'children', 'teenage', 'cousin', 'husband', 'house', 'son', 'city', 'police', 'parents']

Google Cluster 4
['knows', 'sees', 'comes', 'makes', 'adds', 'thinks', 'stands', 'takes', 'turns', 'puts', 'gets', 'seems

In [46]:
# Bag-of-centroids matrix for the training set, using the Google-based
# clusters: one row per review, one column per cluster.
google_train_centroids = np.zeros(
    (train["review"].size, num_clusters),
    dtype="float32",
)

# Fill each row with the cluster counts of the corresponding cleaned review.
for row, review in enumerate(clean_train_reviews):
    google_train_centroids[row] = create_bag_of_centroids(
        review, google_word_centroid_map
    )

In [47]:
# Bag-of-centroids matrix for the test set, using the Google-based clusters:
# one row per review, one column per cluster.
google_test_centroids = np.zeros(
    (test["review"].size, num_clusters),
    dtype="float32",
)

# Fill each row with the cluster counts of the corresponding cleaned review.
for row, review in enumerate(clean_test_reviews):
    google_test_centroids[row] = create_bag_of_centroids(
        review, google_word_centroid_map
    )

In [48]:
# Write both Google-based feature matrices as comma-separated text.
np.savetxt('X2.out', google_train_centroids, delimiter=',')
np.savetxt('X2_test.out', google_test_centroids, delimiter=',')