In [2]:
import json
import struct
from collections import Counter, defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models.word2vec import Word2Vec
from nltk.tokenize import word_tokenize
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from tabulate import tabulate

try:
    # Legacy location (scikit-learn < 0.20); kept first so existing
    # environments keep resolving exactly the same objects as before.
    from sklearn.cross_validation import cross_val_score
    from sklearn.cross_validation import StratifiedShuffleSplit
except ImportError:
    # sklearn.cross_validation was removed in scikit-learn 0.20.
    from sklearn.model_selection import cross_val_score
    from sklearn.model_selection import StratifiedShuffleSplit



In [3]:
# Input locations and the text encoding shared by the loading cells.
GLOVE_6B_300D_PATH = "D:/1.001/project/glove.6B/glove.6B.300d.txt"
encoding = "utf-8"
filename = "D:/1.001/project/neptune_oyster_test_data.json"

# Read the reviews with an explicit encoding so the result does not depend on
# the platform's default code page.
with open(filename, "r", encoding=encoding) as f:
    set_data = json.load(f)

# Tokenize each review, drop every token that is not purely alphabetic and
# lower-case the rest. Stop words are deliberately kept.
reviews, ratings = [], []
for record in set_data:
    tokens = [tok.lower()
              for tok in word_tokenize(record["review_detail"])
              if tok.isalpha()]
    reviews.append(tokens)
    ratings.append(record["review_rating"])

# Reviews have different lengths, so store them in a 1-D object array filled
# element by element. np.array(list_of_lists) raises on ragged input in recent
# NumPy releases and would silently become 2-D if all lengths happened to match.
X = np.empty(len(reviews), dtype=object)
for idx, tokens in enumerate(reviews):
    X[idx] = tokens
y = np.array(ratings)

print("total examples %s" % len(y))   # labelled dataset size
print("y",y[:20])

total examples 3665
y ['5.0' '5.0' '5.0' '4.0' '5.0' '4.0' '5.0' '5.0' '5.0' '4.0' '4.0' '4.0'
 '5.0' '5.0' '4.0' '5.0' '5.0' '4.0' '5.0' '5.0']


In [4]:
# Vocabulary of the corpus: every distinct token found in any review.
all_words = {token for review in X for token in review}

# Keep only the GloVe vectors whose word occurs in that vocabulary. The file is
# read as bytes: the first whitespace-separated field is the word (decoded with
# `encoding`), the remaining fields are the vector components.
glove_small = {}
with open(GLOVE_6B_300D_PATH, "rb") as glove_file:
    for raw_line in glove_file:
        fields = raw_line.split()
        token = fields[0].decode(encoding)
        if token not in all_words:
            continue
        glove_small[token] = np.array(fields[1:], dtype=np.float32)

In [5]:
class MeanEmbeddingVectorizer(object):
    '''
    Vectorize tokenized texts by averaging their word embeddings.

    word2vec is a word -> vector mapping. Each text (an iterable of
    tokens) becomes the element-wise mean of the vectors of those of
    its tokens that are present in the mapping.
    '''
    def __init__(self, word2vec):
        self.word2vec = word2vec
        if len(word2vec) > 0:
            # Take the dimensionality from the mapping this instance was
            # given, not from a notebook-level global, so the class works
            # with any embedding dictionary.
            self.dim = len(word2vec[next(iter(word2vec))])
        else:
            self.dim = 0

    def fit(self, X, y=None):
        '''Nothing to learn; returns self so the object works in a Pipeline.'''
        return self

    def transform(self, X):
        '''
        Return an array with one averaged embedding per text in X.

        A text with no token in the mapping (including an empty text)
        is represented by a zero vector of length self.dim.
        '''
        return np.array([
            # `or` substitutes a single zero vector when the list of known
            # word vectors is empty, so np.mean never sees an empty input.
            np.mean([self.word2vec[w] for w in words if w in self.word2vec]
                    or [np.zeros(self.dim)], axis=0)
            for words in X])

    
# and a tf-idf version of the same
class TfidfEmbeddingVectorizer(object):
    '''
    Vectorize tokenized texts by averaging idf-weighted word embeddings.

    word2vec is a word -> vector mapping. fit() learns one idf weight per
    token from the training texts; transform() multiplies each known
    word vector by its weight and takes the plain mean of the results
    (the mean is not normalized by the sum of the weights).
    '''
    def __init__(self, word2vec):
        self.word2vec = word2vec
        # Filled in by fit(): token -> idf weight.
        self.word2weight = None
        if len(word2vec) > 0:
            # Take the dimensionality from the mapping this instance was
            # given, not from a notebook-level global, so the class works
            # with any embedding dictionary.
            self.dim = len(word2vec[next(iter(word2vec))])
        else:
            self.dim = 0

    def fit(self, X, y=None):
        '''Learn idf weights from the tokenized texts in X; returns self.'''
        # The texts are already tokenized, so the analyzer is the identity.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # if a word was never seen - it must be at least as infrequent
        # as any of the known words - so the default idf is the max of
        # known idf's
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def transform(self, X):
        '''
        Return an array with one weighted-average embedding per text in X.

        A text with no token in the mapping (including an empty text)
        is represented by a zero vector of length self.dim.
        '''
        return np.array([
                # `or` substitutes a single zero vector when no token of
                # the text has an embedding.
                np.mean([self.word2vec[w] * self.word2weight[w]
                         for w in words if w in self.word2vec] or
                        [np.zeros(self.dim)], axis=0)
                for words in X])

In [6]:
def _extra_trees_on(vectorizer):
    """Chain the given embedding vectorizer with a 200-tree ExtraTrees classifier."""
    return Pipeline([("glove vectorizer", vectorizer),
                     ("extra trees", ExtraTreesClassifier(n_estimators=200))])


# Same classifier on top of two feature extractors: plain mean of the GloVe
# vectors versus idf-weighted mean.
etree_w2v = _extra_trees_on(MeanEmbeddingVectorizer(glove_small))
etree_w2v_tfidf = _extra_trees_on(TfidfEmbeddingVectorizer(glove_small))

# (label, estimator) pairs consumed by the benchmarking cell.
all_models = [("w2v", etree_w2v), ("w2v_tfidf", etree_w2v_tfidf)]
print("model done", type(etree_w2v), type(etree_w2v_tfidf))

model done <class 'sklearn.pipeline.Pipeline'> <class 'sklearn.pipeline.Pipeline'>


In [7]:
def benchmark(model, X, y, test_size=0.2, n_iter=5):
    """Mean hold-out accuracy of `model` over stratified shuffle splits.

    Parameters
    ----------
    model : estimator with fit(X, y) and predict(X); it is re-fitted in
        place on every split.
    X, y : arrays of samples and labels that support indexing with an
        array of integer positions.
    test_size : fraction of the samples held out in each split.
    n_iter : number of random splits to average over.

    Returns
    -------
    The mean of the per-split accuracy scores.
    """
    if hasattr(StratifiedShuffleSplit, "split"):
        # sklearn.model_selection API: the labels go to split(), not to the
        # constructor. Only the number of rows of the first argument is
        # used, so a placeholder avoids validating the ragged token lists.
        splitter = StratifiedShuffleSplit(n_splits=n_iter, test_size=test_size)
        splits = splitter.split(np.zeros(len(y)), y)
    else:
        # Legacy sklearn.cross_validation API: the object itself iterates
        # over (train, test) index pairs.
        splits = StratifiedShuffleSplit(y, n_iter=n_iter, test_size=test_size)

    scores = []
    for train, test in splits:
        model.fit(X[train], y[train])
        # accuracy_score is documented as (y_true, y_pred).
        scores.append(accuracy_score(y[test], model.predict(X[test])))
    return np.mean(scores)

# Benchmark every registered model once and collect one record per model.
table = []
for name, model in all_models:
    record = dict(model=name, accuracy=benchmark(model, X, y))
    table.append(record)

# One row per model, with columns 'model' and 'accuracy'.
df = pd.DataFrame(table)

In [8]:
# Set the plotting context before anything is drawn: set_context only affects
# artists created afterwards, so calling it after the plot would leave the
# first run of this cell unscaled.
sns.set_context("notebook", font_scale=1.5)
plt.figure(figsize=(15, 6))

# sns.pointplot returns a matplotlib Axes; the name `fig` is kept because it
# is a notebook-level variable.
fig = sns.pointplot(y='accuracy', hue='model',
                    data=df[df.model.isin(["w2v", "w2v_tfidf"])])

# NOTE(review): the x label and the title look inherited from a different
# benchmark - confirm they describe this dataset before publishing the figure.
fig.set(xlabel="labeled training examples",
        ylabel="accuracy",
        title="R8 benchmark");

[<matplotlib.text.Text at 0x1fdfab67f98>]