In [5]:
import gensim
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.svm import SVR
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict

# Read only the two columns this notebook uses: the label and the comment text.
train_df = pd.read_csv(
    'resources/train.csv',
    usecols=['target', 'comment_text'],
)

In [6]:
def clean_sentence(text):
    """Lower-case ``text`` and return the tokens ``word_tokenize`` yields for it."""
    return word_tokenize(text.lower())

class MeanEmbeddingVectorizer(object):
    """Represent each tokenised document by the mean of its word embeddings.

    Parameters
    ----------
    word2vec : mapping-like
        Must support ``token in word2vec`` and ``word2vec[token]`` (returning
        a 1-D vector), e.g. a gensim ``KeyedVectors`` or a ``dict`` of arrays.

    Attributes
    ----------
    word2vec : the embedding lookup passed in.
    dim : int, length of one embedding vector.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.dim = self._infer_dim(word2vec)

    @staticmethod
    def _infer_dim(word2vec):
        """Return the embedding length without assuming a specific token exists."""
        # gensim-style containers advertise their dimensionality directly.
        size = getattr(word2vec, 'vector_size', None)
        if size is not None:
            return int(size)
        # Plain mappings: measure the first stored vector.
        values = getattr(word2vec, 'values', None)
        if callable(values):
            for vector in values():
                return len(vector)
            raise ValueError(
                'word2vec is empty; cannot infer the embedding dimension')
        # Last resort for objects exposing only item access: the original probe.
        return word2vec['hello'].shape[0]

    def fit(self, X, y=None):
        """No-op: there is nothing to learn. ``y`` is accepted and ignored.

        Returns ``self`` so the call can be chained.
        """
        return self

    def _mean_vector(self, words):
        """Average the embeddings of the known tokens in ``words``.

        Tokens missing from ``word2vec`` are skipped; if none are known the
        result is a zero vector of length ``dim``.
        """
        vectors = [self.word2vec[w] for w in words if w in self.word2vec]
        if not vectors:
            return np.zeros(self.dim)
        return np.mean(vectors, axis=0)

    def transform(self, X):
        """Map an iterable of token sequences to an array of shape ``(n_docs, dim)``."""
        rows = [self._mean_vector(words) for words in X]
        if not rows:
            # Keep the result 2-D even when there are no documents.
            return np.zeros((0, self.dim))
        return np.array(rows)

class TfidfEmbeddingVectorizer(object):
    """Represent each tokenised document by the mean of IDF-scaled embeddings.

    Parameters
    ----------
    word2vec : mapping-like
        Must support ``token in word2vec`` and ``word2vec[token]`` (returning
        a 1-D vector), e.g. a gensim ``KeyedVectors`` or a ``dict`` of arrays.

    Attributes
    ----------
    word2vec : the embedding lookup passed in.
    word2weight : ``None`` until ``fit`` is called, then a ``defaultdict``
        mapping token -> IDF weight.
    dim : int, length of one embedding vector.
    """

    def __init__(self, word2vec):
        self.word2vec = word2vec
        self.word2weight = None
        self.dim = self._infer_dim(word2vec)

    @staticmethod
    def _infer_dim(word2vec):
        """Return the embedding length without assuming a specific token exists."""
        # gensim-style containers advertise their dimensionality directly.
        size = getattr(word2vec, 'vector_size', None)
        if size is not None:
            return int(size)
        # Plain mappings: measure the first stored vector.
        values = getattr(word2vec, 'values', None)
        if callable(values):
            for vector in values():
                return len(vector)
            raise ValueError(
                'word2vec is empty; cannot infer the embedding dimension')
        # Last resort for objects exposing only item access: the original probe.
        return word2vec['hello'].shape[0]

    def fit(self, X, y=None):
        """Learn one IDF weight per token from the tokenised documents ``X``.

        ``y`` is accepted and ignored. Returns ``self``.
        """
        # Documents are already token lists, so the analyzer passes them through.
        tfidf = TfidfVectorizer(analyzer=lambda x: x)
        tfidf.fit(X)
        # Tokens never seen while fitting fall back to the largest IDF observed.
        max_idf = max(tfidf.idf_)
        self.word2weight = defaultdict(
            lambda: max_idf,
            [(w, tfidf.idf_[i]) for w, i in tfidf.vocabulary_.items()])

        return self

    def _weighted_mean_vector(self, words):
        """Average ``embedding * weight`` over the known tokens in ``words``.

        Tokens missing from ``word2vec`` are skipped; if none are known the
        result is a zero vector of length ``dim``.
        """
        vectors = [self.word2vec[w] * self.word2weight[w]
                   for w in words if w in self.word2vec]
        if not vectors:
            return np.zeros(self.dim)
        return np.mean(vectors, axis=0)

    def transform(self, X):
        """Map an iterable of token sequences to an array of shape ``(n_docs, dim)``."""
        rows = [self._weighted_mean_vector(words) for words in X]
        if not rows:
            # Keep the result 2-D even when there are no documents.
            return np.zeros((0, self.dim))
        return np.array(rows)


In [8]:
# Load the word vectors from a binary word2vec-format file.
w2v = gensim.models.KeyedVectors.load_word2vec_format(
    'resources/GoogleNews-vectors-negative300-SLIM.bin.gz',
    binary=True,
)


In [9]:
# `target` is the value to predict; `comment_text` is the model input.
y = train_df['target']
X_train = train_df['comment_text']

In [14]:
# Fetch NLTK's 'punkt' package before tokenising — presumably the resource
# `word_tokenize` relies on (TODO confirm).
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to /Users/petersaur/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [15]:
# Tokenise every comment once up front; X holds one token list per comment.
X = [clean_sentence(comment) for comment in X_train]

In [None]:
# Mean-of-embeddings features feeding a support vector regressor.
# The final step was previously labelled "extra trees" although the estimator
# is SVR; the label now matches what `named_steps` and the repr actually hold.
w2v_pipe = Pipeline([
    ("word2vec vectorizer", MeanEmbeddingVectorizer(w2v)),
    ("svr", SVR()),
])
w2v_pipe.fit(X, y)

In [None]:
# Spot-check: predict for the tokenised comment at position 4, wrapped in a
# list so the pipeline receives a batch of one document.
w2v_pipe.predict([X[4]])

In [None]:
# The `target` value stored under index label 4, to compare with the prediction.
y[4]

In [None]:
# Show the token list of the comment at position 4.
print(X[4])

In [None]:
# Inspect the embedding vector stored for the token 'guy'.
w2v['guy']

In [None]:
# Inspect the embedding vector stored for the token 'something'.
w2v['something']

In [None]:
# IDF-weighted embedding features feeding a support vector regressor.
# The final step was previously labelled "extra trees" although the estimator
# is SVR; the label now matches what `named_steps` and the repr actually hold.
# (The variable keeps its historical name because later cells refer to it.)
etree_w2v_tfidf = Pipeline([
    ("word2vec vectorizer", TfidfEmbeddingVectorizer(w2v)),
    ("svr", SVR()),
])

etree_w2v_tfidf.fit(X, y)

In [None]:
# Spot-check the IDF-weighted pipeline on the tokenised comment at position 4,
# wrapped in a list so the pipeline receives a batch of one document.
etree_w2v_tfidf.predict([X[4]])