In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sys
import re
from sklearn.manifold import TSNE
sys.path.append('/home/machao/anaconda2/lib/python2.7/site-packages')
import gensim
from gensim import corpora
from nltk.corpus import stopwords
% matplotlib inline

from gensim.models.word2vec import LineSentence  
from gensim.models import Word2Vec  

In [23]:
# Load the training set; later cells read its 'question1', 'question2' and
# 'is_duplicate' columns.
data = pd.read_csv('../train.csv')
# English stop words from NLTK, kept in a set for fast membership tests.
stoplist = set(stopwords.words("english"))

In [26]:
# Matches every run of characters that are neither whitespace nor word
# characters, plus underscores (which \w would otherwise keep).
_NON_WORD_RE = re.compile(r'([^\s\w]|_)+')


def clean_sentence(val, stop_words=None):
    """Normalise one question into a lower-cased, stop-word-free string.

    Parameters
    ----------
    val : object
        Raw cell value. Missing values (``None`` / ``NaN``) yield ``''``
        instead of being stringified into the bogus token ``'nan'``.
        Anything else is converted with ``str``.
    stop_words : collection of str, optional
        Words to drop. Defaults to the notebook-level ``stoplist``.

    Returns
    -------
    str
        Remaining tokens joined by single spaces. Punctuation and
        underscores are removed, and any whitespace run (including tabs and
        newlines) collapses to one space so the result is always one line.
    """
    # Must be checked before str(): str(float('nan')) is the truthy 'nan'.
    if pd.isnull(val):
        return ''
    if stop_words is None:
        stop_words = stoplist
    text = _NON_WORD_RE.sub('', str(val)).lower()
    # split() without an argument also separates on tabs/newlines, so stop
    # words next to them are still recognised.
    return " ".join(word for word in text.split() if word not in stop_words)

def clean_dataframe(data, columns=('question1', 'question2')):
    """Return a copy of ``data`` with the text columns passed through clean_sentence.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the raw question text. It is left untouched.
    columns : iterable of str, optional
        Names of the columns to clean; defaults to the two question columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns, where each listed
        column has been mapped element-wise through ``clean_sentence``.
    """
    # Work on a copy so the caller's raw frame is not clobbered and the cell
    # can be re-run from the same input.
    cleaned = data.copy()
    for col in columns:
        cleaned[col] = cleaned[col].map(clean_sentence)
    return cleaned


In [27]:
# Clean both question columns, then report the frame's dimensions.
data = clean_dataframe(data)
# Function-call form prints the same text on Python 2 and also runs on Python 3.
print(data.shape)

(404290, 6)


In [28]:
# Stack all first questions followed by all second questions into one 1-D array.
sentences1, sentences2 = (data[col].values for col in ('question1', 'question2'))
sentences = np.concatenate([sentences1, sentences2])
sentences.shape

(808580,)

In [29]:
# Dump the cleaned questions to disk, one per line.
with open('sentences', 'w') as out_file:
    out_file.writelines(str(line) + '\n' for line in sentences)

In [30]:
# Tokenise every sentence on whitespace, lower-cased, with stop words removed.
texts = []
for raw_sentence in sentences:
    tokens = str(raw_sentence).lower().split()
    texts.append([tok for tok in tokens if tok not in stoplist])

In [31]:
# Map every token in the corpus to an integer id.
dictionary = corpora.Dictionary(texts)
# Build the reverse (id -> token) lookup eagerly from token2id.
dictionary.id2token = dict(
    (token_id, token) for token, token_id in dictionary.token2id.items()
)
dictionary.save('sentences.dict')
# A previously saved dictionary can be restored with
# corpora.Dictionary.load('sentences.dict').

In [32]:
from gensim.models import TfidfModel

# Bag-of-words vector for every tokenised sentence, persisted in Matrix Market format.
corpus = list(map(dictionary.doc2bow, texts))
corpora.MmCorpus.serialize('corpus.mm', corpus)

# Fit the TF-IDF weighting on the full corpus and save it.
tfidf = TfidfModel(corpus)
tfidf.save('tfidf')

In [33]:
# Stream the sentences file written earlier. A separate name is used so the
# numpy array bound to `sentences` is not replaced by a LineSentence, which
# would break re-running the cells that write the file or build `texts`.
line_sentences = LineSentence('sentences')
# 128-dimensional embeddings; words seen fewer than 5 times are ignored.
word2Vec = Word2Vec(line_sentences, size=128, window=5, min_count=5, workers=8)
word2Vec.save('w2c')

In [34]:
def handel_sentence(word2Vec, tfidf, sentence):
    """Embed one sentence as the TF-IDF-weighted sum of its word vectors.

    Parameters
    ----------
    word2Vec : trained Word2Vec model (or bare keyed vectors)
        Supplies the word vectors and ``vector_size``.
    tfidf : TF-IDF model
        Indexed with a bag-of-words list; must yield ``(token_id, weight)`` pairs.
    sentence : object
        Sentence text; converted with ``str``, lower-cased, split on
        whitespace and filtered against the notebook-level ``stoplist``.

    Returns
    -------
    numpy.ndarray
        1-D float vector of length ``word2Vec.vector_size``. It is all zeros
        when no token is both in the notebook-level ``dictionary`` and in the
        embedding vocabulary.
    """
    # Use the model's ``.wv`` vectors when present (indexing the model object
    # itself is deprecated in gensim); otherwise treat the argument as the
    # vector store.
    vectors = getattr(word2Vec, 'wv', word2Vec)
    tokens = [word for word in str(sentence).lower().split() if word not in stoplist]
    res = np.zeros(word2Vec.vector_size)
    # Iterate over the TF-IDF output itself: the model may omit terms whose
    # weight is zero or negligible, so looking weights up by the bag-of-words
    # ids could raise KeyError.
    for token_id, weight in tfidf[dictionary.doc2bow(tokens)]:
        word = dictionary[token_id]
        if word in vectors:
            res += weight * vectors[word]
    return res

def transform():
    """Build the feature matrix for every question pair in ``data``.

    Reads the notebook-level ``data``, ``word2Vec`` and ``tfidf`` objects.
    Each row is the ``handel_sentence`` vector of 'question1' followed by the
    one of 'question2'.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``data`` and ``2 * word2Vec.vector_size`` columns.
        The result carries the index of ``data``, so index-aligned assignment
        of further columns from ``data`` stays correct.
    """
    dim = word2Vec.vector_size
    values = np.zeros((len(data), 2 * dim))
    # Fill by position, not by index label: labels only coincide with row
    # positions when the frame has an untouched default RangeIndex.
    pairs = zip(data['question1'], data['question2'])
    for row, (q1, q2) in enumerate(pairs):
        values[row, :dim] = handel_sentence(word2Vec, tfidf, q1)
        values[row, dim:] = handel_sentence(word2Vec, tfidf, q2)
    return pd.DataFrame(values, index=data.index)
    
# Build the feature matrix: one row per row of `data`.
res_data = transform()

In [35]:
# Attach the target column. pandas aligns this assignment on index labels,
# so res_data and data must share the same index.
res_data['label'] = data['is_duplicate']
# Persist features + label; the index is not written.
res_data.to_csv('done_data.csv', index = False)

In [None]:
def tsne_plot(model):
    """Project every vocabulary vector to 2-D with t-SNE and draw a labelled scatter plot.

    ``model`` is a trained Word2Vec model; each word in ``model.wv.vocab``
    becomes one annotated point.
    """
    labels = list(model.wv.vocab)
    tokens = [model[word] for word in labels]

    reducer = TSNE(perplexity=40, n_components=2, init='pca', n_iter=2500, random_state=23)
    coords = reducer.fit_transform(tokens)

    x = [point[0] for point in coords]
    y = [point[1] for point in coords]

    plt.figure(figsize=(16, 16))
    # One scatter call per word, each immediately followed by its text label.
    for px, py, label in zip(x, y, labels):
        plt.scatter(px, py)
        plt.annotate(label,
                     xy=(px, py),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.show()
# Visualise the trained embedding.
tsne_plot(word2Vec)

In [None]:
# Preview the first rows only instead of rendering the whole feature frame.
res_data.head()