In [1]:
# -*- coding: utf-8 -*- 

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

In [2]:
train_data = pd.read_csv(r"C:\Users\52054867\machine learning\Word2Vec\word2vec-nlp-tutorial\labeledTrainData.tsv\labeledTrainData.tsv", delimiter="\t", header=0)
train_data.shape

(25000, 3)

In [3]:
test_data = pd.read_csv(r"C:\Users\52054867\machine learning\Word2Vec\word2vec-nlp-tutorial\testData.tsv\testData.tsv", delimiter="\t", header=0)
test_data.shape

(25000, 2)

In [4]:
## clean the data using BeatufulSoap module
import re
import nltk

from bs4 import BeautifulSoup
from nltk.corpus import stopwords

def review_to_wordlist(review):
    
    ##remove html tags
    review_text = BeautifulSoup(review).get_text()
    
    # remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)
    
    ##convert word to lower case and split them
    words = review_text.lower().split()
    
    ## make a set of stopwords as set is much faster than a list. filter all the stopwords
    stops = set(stopwords.words("english"))
    words = [w for w in words if not w in stops ]
    
    return words
    
    

In [5]:
clean_train_reviews = []

for i in range ( 0, len(train_data["review"]) ):
    clean_train_reviews.append( " ".join(review_to_wordlist(train_data["review"][i])))

In [6]:
clean_train_reviews[0]

u'stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate worki

CountVectorization
The Bag of Words model learns a vocabulary from all of the documents, then models each document by counting the number of times each word appears. For example, consider the following two sentences:

Sentence 1: "The cat sat on the hat"

Sentence 2: "The dog ate the cat and the hat"

From these two sentences, our vocabulary is as follows:

{ the, cat, sat, on, hat, dog, ate, and }

To get our bags of words, we count the number of times each word occurs in each sentence. In Sentence 1, "the" appears twice, and "cat", "sat", "on", and "hat" each appear once, so the feature vector for Sentence 1 is:

{ the, cat, sat, on, hat, dog, ate, and }

Sentence 1: { 2, 1, 1, 1, 1, 0, 0, 0 }

Similarly, the features for Sentence 2 are: { 3, 1, 0, 0, 1, 1, 1, 1}

In the IMDB data, we have a very large number of reviews, which will give us a large vocabulary. To limit the size of the feature vectors, we should choose some maximum vocabulary size. Below, we use the 5000 most frequent words (remembering that stop words have already been removed).

In [7]:
cv = CountVectorizer(analyzer = "word",   \
                         tokenizer = None,    \
                         preprocessor = None, \
                         stop_words = None,   \
                         max_features = 5000)

train_data_features = cv.fit_transform(clean_train_reviews)

# Numpy arrays are easy to work with, so convert the result to an
# array

np.asarray(train_data_features)

array(<25000x5000 sparse matrix of type '<type 'numpy.int64'>'
	with 1975006 stored elements in Compressed Sparse Row format>,
      dtype=object)

In [8]:
# train with random forest
rf = RandomForestClassifier(n_estimators=100)

forest = rf.fit(train_data_features,train_data["sentiment"])

In [9]:
clean_test_review = []

for i in range ( 0, len(test_data["review"]) ):
    clean_test_review.append( " ".join(review_to_wordlist(test_data["review"][i])))
    

In [10]:
test_data_features = cv.fit_transform(clean_test_review)

np.asarray(test_data_features)

result = forest.predict(test_data_features)

#from sklearn.metrics import confusion_matrix
#confusion_matrix()



In [11]:
print test_data['review'][12]
print result[12]

print test_data['review'][1]
print result[1]

\The Love Letter\" is one of those movies that could have been really clever, but they wasted it. Focusing on a letter wreaking havoc in a small town, the movie has an all-star cast with nothing to do. Tom Selleck and Alice Drummond had so recently co-starred in the super-hilarious \"In & Out\" (also about an upset in a small town), in which they were both great, but here they look as though they're getting drug all over the place. I can't tell what the people behind the camera are trying to do here (if anything), but they sure didn't accomplish anything. How tragic, that a potential laugh riot got so sorrowfully wasted."
0
This movie is a disaster within a disaster film. It is full of great action scenes, which are only meaningful if you throw away all sense of reality. Let's see, word to the wise, lava burns you; steam burns you. You can't stand next to lava. Diverting a minor lava flow is difficult, let alone a significant one. Scares me to think that some might actually believe wha

TF-IDF Vectorization
If we are working with movie reviews, the word “movie” will be frequent but not useful. If we were working with email data, on the other hand, the word “movie” may not be frequent and would be useful.

The simplest way to account for these overrepresented words is to divide word count by the proportion of text documents each word appeared in. For example, the document:

“I loved this movie! It was great, great, great.”

contains the word “loved” and “movie” once each. Now, let’s suppose that we look at all the other documents and find that, in total, “loved” appears in 1% of text documents and “movie” appears in 33%. We could now weight our scores as

“loved” = times it appears in text / proportion of texts it appears in = 1 / 1% “movie” = times it appears in text / proportion of texts it appears in = 1 / 33%

Before applying weights, both “loved” and “movie” had a score of 1 (since each word appeared in the sentence once). After we apply weights, “loved” has a score of 100 and “movie” has a score of 3. The score for “loved” is much higher relative to “movie”, indicating that we care about the word “loved” much more than “movie”.

In fact, our score for “loved” is now 33 times larger than our score for “movie”. While we suspect that “movie” should be less important than “loved” for predicting whether a review is positive or negative, this relative difference might be too big. Very rare words — perhaps, misspelled words — will receive too much relative weight in our current weighting scheme.

We need to strike a balance between downweighting very frequent words without overweighting rare words. This is what term frequency–inverse document frequency (tf-idf) weighting does for us. In the simple weighting scheme, we used the formula:

times a word appears in text * (1 / proportion of texts it appears in)

tf-idf weighting alters this formula slightly by taking the log of the second term:

times a word appears in text * log(1 / proportion of texts it appears in)

By taking the log, we ensure that our weight changes slowly in relation to how frequently a word appears in all our documents. This means that while common words are downweighted, they aren’t downweighted too much.



To read more about tf-idf see this medium post: https://medium.com/civis-analytics/an-intro-to-natural-language-processing-in-python-framing-text-classification-in-familiar-terms-33778d1aa3ca

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('count', cv),('tfidf',TfidfTransformer())]).fit(clean_train_reviews)

train_data_features = pipeline.transform(clean_train_reviews)

In [13]:
train_data_features.shape

(25000, 5000)

In [14]:
np.asarray(train_data_features)

array(<25000x5000 sparse matrix of type '<type 'numpy.float64'>'
	with 1975006 stored elements in Compressed Sparse Row format>,
      dtype=object)

In [15]:
forest = rf.fit(train_data_features,train_data["sentiment"])

In [16]:
print test_data['review'][12]
print result[12]

print test_data['review'][78]
print result[78]

\The Love Letter\" is one of those movies that could have been really clever, but they wasted it. Focusing on a letter wreaking havoc in a small town, the movie has an all-star cast with nothing to do. Tom Selleck and Alice Drummond had so recently co-starred in the super-hilarious \"In & Out\" (also about an upset in a small town), in which they were both great, but here they look as though they're getting drug all over the place. I can't tell what the people behind the camera are trying to do here (if anything), but they sure didn't accomplish anything. How tragic, that a potential laugh riot got so sorrowfully wasted."
0
I come from Bangladesh, and here, C.C.Costigan is a goddess of awesome sex. All kidding aside, a friend and I were awake in the middle of the night, watching movies on the Encore: Action channel, when we came across a series of sci-fi-esquire flicks. There was RoboCop 2 (not bad,...not bad at all) ... then Judge Dredd, (Stalone almost ruins his career) then a movie 

Part 2
Distributed Word Vectors
Word2vec, published by Google in 2013, is a neural network implementation that learns distributed representations for words. Word2Vec does not need labels in order to create meaningful representations. This is useful, since most data in the real world is unlabeled. If the network is given enough training data (tens of billions of words), it produces word vectors with intriguing characteristics. Words with similar meanings appear in clusters, and clusters are spaced such that some word relationships, such as analogies, can be reproduced using vector math.

Using word vectors

First, we read in the data with pandas, as we did above. But, we now use unlabeledTrain.tsv, which contains 50,000 additional reviews with no labels. When we built the Bag of Words model as above, extra unlabeled training reviews were not useful. However, since Word2Vec can learn from unlabeled data, these extra 50,000 reviews can now be used.

In [17]:
from nltk.corpus import stopwords
import nltk.data
from gensim.models import Word2Vec
from sklearn.ensemble import RandomForestClassifier

In [18]:
unlabeled_train_data = pd.read_csv(r"C:\Users\52054867\machine learning\Word2Vec\word2vec-nlp-tutorial\unlabeledTrainData.tsv\unlabeledTrainData.tsv", delimiter="\t", header=0, quoting=3)
#train_data =           pd.read_csv(r"C:\Users\52054867\machine learning\Word2Vec\word2vec-nlp-tutorial\labeledTrainData.tsv\labeledTrainData.tsv", delimiter="\t", header=0)
unlabeled_train_data.shape

(50000, 2)

In [19]:
unlabeled_train_data.columns

Index([u'id', u'review'], dtype='object')

In [20]:
unlabeled_train_data['review'][0]

'"Watching Time Chasers, it obvious that it was made by a bunch of friends. Maybe they were sitting around one day in film school and said, \\"Hey, let\'s pool our money together and make a really bad movie!\\" Or something like that. What ever they said, they still ended up making a really bad movie--dull story, bad script, lame acting, poor cinematography, bottom of the barrel stock music, etc. All corners were cut, except the one that would have prevented this film\'s release. Life\'s like that."'

In [21]:
from nltk.tokenize import sent_tokenize

## break into sentences
def review_to_sentences(review, tokenizer):
    raw_sentences = tokenizer.tokenize(review.strip())
    sentences = []
    for raw_sentence in raw_sentences:
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            sentences.append( review_to_wordlist( raw_sentence ))
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists
    return sentences

In [22]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\52054867\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [23]:
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

sentences = []

for review in unlabeled_train_data["review"]:
    sentences += review_to_sentences(unicode(review),tokenizer)

In [24]:
len(sentences)

528987

In [25]:
sentences[528986]

[u'pathmark', u'means', u'savings']

In [26]:
num_features = 300    # Word vector dimensionality
min_word_count = 40   # Minimum word count
num_workers = 4       # Number of threads to run in parallel
context = 10          # Context window size
downsampling = 1e-3   # Downsample setting for frequent words

# Initialize and train the model (this will take some time)
print ("Training Word2Vec model...")
model = Word2Vec(sentences, workers=num_workers, \
                 size=num_features, \
                 min_count = min_word_count,\
                 window = context, \
                 sample = downsampling,\
                 seed=1)



In [27]:
model.save("word2vec_imdb_movie_review")

In [28]:
model = Word2Vec.load("word2vec_imdb_movie_review")

In [29]:
model.wv.vectors.shape

(12907L, 300L)

In [53]:
clean_train_reviews = []
for review in train_data["review"]:
    clean_train_reviews.append( review_to_wordlist( review))

In [54]:
len(clean_train_reviews)

25000

In [55]:
def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    index2word_set = set(model.wv.index2word)

    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec
            featureVec = np.add(featureVec,model.wv.__getitem__([word]))

    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec


def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    
    # 
    # Loop through the reviews
    for review in reviews:
        #
        # Print a status message every 1000th review
        if counter%1000. == 0.:
            print counter
        # 
        # Call the function (defined above) that makes average feature vectors
        reviewFeatureVecs[counter] = makeFeatureVec(review, model, num_features)
        #
        # Increment the counter
        counter = counter + 1
    return reviewFeatureVecs


trainDataVecs = getAvgFeatureVecs( clean_train_reviews, model, num_features )

In [56]:
trainDataVecs.shape

(25000L, 300L)

In [57]:
len(clean_train_reviews)

25000

In [58]:
trainDataVecs.dtype
trainDataVecs_new = np.nan_to_num(trainDataVecs.astype(np.float64))
trainDataVecs_new.dtype

dtype('float64')

In [59]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
forest = rf.fit(trainDataVecs_new,train_data['sentiment'])

In [60]:
## convert test data to vetors
clean_test_reviews = []
for review in (test_data["review"][0], test_data["review"][1]):
    clean_test_reviews.append( review_to_wordlist( review ))

testDataVecs = getAvgFeatureVecs( clean_test_reviews, model, num_features )

# Test & extract results 
result = forest.predict( testDataVecs )

In [70]:
print test_data["review"][0]
print result[0]
print test_data["review"][1]
print result[1]