In [1]:
import numpy as np
import pandas as pd
import multiprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix, log_loss
from sklearn.linear_model import LogisticRegression
import gensim
from gensim import utils
from gensim.models.doc2vec import TaggedDocument, Doc2Vec
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"
from tqdm import tqdm
from random import shuffle
from utils import labelize_reviews, get_learned_vectors
import matplotlib.pyplot as plt

tqdm.pandas(desc="progress-bar")

## Paragraph Vector (Doc2Vec)

In this notebook, we'll explore the [Paragraph Vector](https://cs.stanford.edu/~quocle/paragraph_vector.pdf) a.k.a Dov2Vec algorithm on ~3 million Yelp reviews. Doc2Vec is an extension to word2vec for learning document embeddings and basically acts as  if a document has another floating word-like vector, which contributes to all training predictions, and is updated like other word-vectors, but we will call it a doc-vector. Gensim’s Doc2Vec class implements this algorithm.

To recap, Word2Vec is a model from 2013 that embeds words in a lower-dimensional vector space using a shallow neural network. The result is a set of word-vectors where vectors close together in vector space have similar meanings based on context, and word-vectors distant to each other have differing meanings

There are two approaches within `doc2vec:` `dbow` and `dmpv`. 

`dbow (Paragraph Vector - Distributed Bag of Words)` works in the same way as `skip-gram` in word2vec ,except that the input is replaced by a special token representing the document (i.e. $v_{wI}$ is a vector representing the document). In this architecture, the order of words in the document is ignored; hence the name distributed bag of words. The doc-vectors are obtained by training a neural network on the synthetic task of predicting a center word based an average of both context word-vectors and the full document's doc-vector.

`dmpv (Paragraph Vector - Distributed Memory)` works in a similar way to `cbow` in word2vec. For the input, dmpv introduces an additional document token in addition to multiple target words. Unlike cbow, however, these vectors are not summed but concatenated (i.e. $v_{wI}$ is a concatenated vector containing the document token and several target words). The objective is again to predict a context word given the concatenated document and word vectors. The doc-vectors are obtained by training a neural network on the synthetic task of predicting a target word just from the full document's doc-vector. (It is also common to combine this with skip-gram testing, using both the doc-vector and nearby word-vectors to predict a single target word, but only one at a time.) There are 2 DM models, specifically: 
*  one which averages context vectors (dm_mean)
*  one which concatenates them (dm_concat, resulting in a much larger, slower, more data-hungry model)


In [2]:
df = pd.read_csv('allcat_clean_reviews.csv',index_col=0)
df.head()

Unnamed: 0,reviews,target
0,the rooms are big but the hotel is not good as...,0
1,second time with ocp saturday night pm not bus...,0
2,food is still great since they remodeled but t...,0
3,dirty location and very high prices but they d...,0
4,so first the off stood outside for mins to try...,0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3085663 entries, 0 to 3086007
Data columns (total 2 columns):
 #   Column   Dtype 
---  ------   ----- 
 0   reviews  object
 1   target   int64 
dtypes: int64(1), object(1)
memory usage: 70.6+ MB


In [4]:
SEED = 1000

x = df.reviews
y = df.target

#defining our training, validation and test set
x_train, x_validation_test, y_train, y_validation_test = train_test_split(x, y, test_size=.06, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_test, y_validation_test, test_size=.5, random_state=SEED)

In [5]:

print('The Training set has {0} reviews with {1:.2f}% negative, {2:.2f}% positive reviews'.format(len(x_train),
                                                                             (len(x_train[y_train == 0]) / (len(x_train)*1))*100,
                                                                            (len(x_train[y_train == 1]) / (len(x_train)*1))*100))

print('The Validation set has {0} entries with {1:.2f}% negative, {2:.2f}% positive reviews'.format(len(x_validation),
                                                                             (len(x_validation[y_validation == 0]) / (len(x_validation)*1))*100,
                                                                            (len(x_validation[y_validation == 1]) / (len(x_validation)*1))*100))

print('The test set has a total of {0} reviews with {1:.2f}% negative, {2:.2f}% positive reviews'.format(len(x_test),
                                                                             (len(x_test[y_test == 0]) / (len(x_test)*1))*100,
                                                                            (len(x_test[y_test == 1]) / (len(x_test)*1))*100))

The Training set has 2900523 reviews with 50.00% negative, 50.00% positive reviews
The Validation set has 92570 entries with 50.06% negative, 49.94% positive reviews
The test set has a total of 92570 reviews with 49.94% negative, 50.06% positive reviews


Now, we label each review with a unique ID using Gensim's `TaggedDocument()` function. Then, we'll concatenate the training and validation and test sets for word representation. For training, I have decided to use the whole data set. The rationale behind this is that the Doc2Vec training is completely unsupervised (unlabelled) and thus there is no need to hold out any data.

In [6]:
df = pd.DataFrame()
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 0 entries
Empty DataFrame

In [7]:
%%time
from utils import labelize_reviews
full = pd.concat([x_train,x_validation,x_test])
full_tagged = list(labelize_reviews(full,'all'))

Wall time: 1min 23s


In [8]:
%%time
cores = multiprocessing.cpu_count() #12

init_kwargs = dict(
    vector_size=150, epochs=10, min_count=2,
    sample=0, workers=cores, negative=5, hs=0,
    alpha=0.05, min_alpha=0.0001, window=5
)
#The learning rate, alpha decreases linearly per epoch from the initial rate to the minimum rate. I will use alpha = 0.0025 and min_alpha = 0.0001 as implemented by Le and Mikolov
#plain DBOW
model_dbow = Doc2Vec(dm=0, **init_kwargs)

model_dbow.build_vocab(full_tagged)

Wall time: 13min 36s


In [13]:
%%time
model_dbow.build_vocab(full_tagged)

Wall time: 15min 5s


In [9]:
%%time
model_dbow.train(full_tagged, total_examples=len(full_tagged), epochs=model_dbow.epochs)

Wall time: 1h 29min 23s


In [10]:
model_dbow.save("dbow.model")

In [8]:
import os
dbow_path = os.path.join("~\Desktop\DATA\ghdata\Yelp\model",'dbow','dbow.model')
print(dbow_path)

~/Desktop/DATA/ghdata/Yelp/model\dbow\dbow.model


In [24]:
model_dbow = Doc2Vec.load("dbow/dbow.model")

https://towardsdatascience.com/multi-class-text-classification-with-doc2vec-logistic-regression-9da9947b43f4

In [8]:
%%time
cores = multiprocessing.cpu_count() #12

dmm_kwargs = dict(
    vector_size=200, epochs=10, min_count=2,
    sample=0, workers=cores, negative=5, hs=0,
    alpha=0.05, min_alpha=0.0001, window=5
)

dmc_kwargs = dict(
    vector_size=200, epochs=10, min_count=2,
    sample=0, workers=cores, negative=5, hs=0,
    alpha=0.05, min_alpha=0.0001, window=3
)
#Distributed Memory (mean)
model_dmm = Doc2Vec(dm=1, dm_mean=1, **dmm_kwargs)
    
# Distributed Memory(Concatenation)
model_dmc = Doc2Vec(dm=1, dm_concat=1, **dmc_kwargs)

model_dmm.build_vocab(full_tagged)
model_dmc.build_vocab(full_tagged)

Wall time: 26min 13s


In [16]:
%%time
model_dmm.train(full_tagged, total_examples=len(full_tagged), epochs=model_dmm.epochs)

Wall time: 2h 30min 7s


In [19]:
model_dmm.save("dmm/dmm.model")

In [17]:
%%time
model_dmc.train(full_tagged, total_examples=len(full_tagged), epochs=model_dmc.epochs)

Wall time: 1h 41min 21s


In [20]:
model_dmc.save("dmc/dmc.model")

# Sentiment Classification with DBOW, DMM (Mean), DMC (Concatenation)

Given a document, our Doc2Vec models output a vector representation of the document. How useful is a particular model? In case of sentiment classification, we want the ouput vector to reflect the sentiment in the input document. So, in vector space, positive documents should be distant from negative documents.



In [18]:
def get_learned_vectors(model,corpus):
    """
    A function that extracts document vectors from a TRAINED Doc2Vec model
    
    model: Trained Doc2Vec model 
    """
    vecs = [model.docvecs['all_'+str(ind)] for ind, doc in corpus.iteritems()]
    
    return vecs

In [8]:
%%time
model_dbow = Doc2Vec.load("dbow.model")
model_dm_avg = Doc2Vec.load("dm_avg.model")
model_dm_cat = Doc2Vec.load("dm_cat.model")

Wall time: 27.1 s


# DBOW Unigram

In [25]:
%%time
train_vecs_dbow = get_learned_vectors(model_dbow, x_train)
validation_vecs_dbow = get_learned_vectors(model_dbow, x_validation)

clf = LogisticRegression(solver="liblinear")
clf.fit(train_vecs_dbow, y_train)

y_pred = clf.predict_proba(validation_vecs_dbow)

logloss_dbow = log_loss(y_validation, y_pred)
acc= clf.score(validation_vecs_dbow, y_validation)
print("Validation Logloss:", logloss_dbow, "\nValidation Accuracy:", acc)

Validation Logloss: 0.28415165036804385 
Validation Accuracy: 0.881981203413633
Wall time: 1min 19s


# DMM Unigram

In [22]:
%%time
train_vecs_dmm = get_learned_vectors(model_dmm, x_train)
validation_vecs_dmm = get_learned_vectors(model_dmm, x_validation)

clf = LogisticRegression(solver="liblinear")
clf.fit(train_vecs_dmm, y_train)

y_pred = clf.predict_proba(validation_vecs_dmm)

logloss_dmm = log_loss(y_validation, y_pred)
acc_dmm = clf.score(validation_vecs_dmm, y_validation)
print("Validation Logloss:", logloss_dmm, "\nValidation Accuracy:", acc_dmm)

Validation Logloss: 0.2901750885047062 
Validation Accuracy: 0.8821756508588096
Wall time: 4min 29s


# DMC Unigram

In [23]:
%%time
train_vecs_dmc = get_learned_vectors(model_dmc, x_train)
validation_vecs_dmc = get_learned_vectors(model_dmc, x_validation)

clf = LogisticRegression(solver="liblinear")
clf.fit(train_vecs_dmc, y_train)

y_pred = clf.predict_proba(validation_vecs_dmc)

logloss_dmc = log_loss(y_validation, y_pred)
acc_dmc = clf.score(validation_vecs_dmc, y_validation)
print("Validation Logloss:", logloss_dmc, "\nValidation Accuracy:", acc_dmc)

Validation Logloss: 0.6930191077491353 
Validation Accuracy: 0.5046343307767095
Wall time: 2min 28s


### Le and Mikolov notes that combining a paragraph vector from Distributed Bag of Words (DBOW) and Distributed Memory (DM) improves performance. We will follow, pairing the models together for evaluation. Here, we concatenate the paragraph vectors obtained from each model with the help of a thin wrapper class included in a gensim test module.

In [32]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
dbow_dmm = ConcatenatedDoc2Vec([model_dbow, model_dmm])
dbow_dmc = ConcatenatedDoc2Vec([model_dbow, model_dmc])

## DBOW + DMM

In [33]:
train_vecs_dbow_dmm = get_learned_vectors(dbow_dmm,x_train)
validation_vecs_dbow_dmm = get_learned_vectors(dbow_dmm, x_validation)

clf = LogisticRegression(solver="liblinear")
clf.fit(train_vecs_dbow_dmm,y_train)

y_pred = clf.predict_proba(validation_vecs_dbow_dmm)
logloss_dbowdmm = log_loss(y_validation,y_pred)
acc_dbowdmm = clf.score(validation_vecs_dbow_dmm, y_validation)
print("Validation Logloss:", logloss_dbowdmm, "\nValidation Accuracy:", acc_dbowdmm)

Validation Logloss: 0.21934084701063036 
Validation Accuracy: 0.9125418602138922


## DBOW + DMC

In [35]:
train_vecs_dbow_dmc = get_learned_vectors(dbow_dmc,x_train)
validation_vecs_dbow_dmc = get_learned_vectors(dbow_dmc, x_validation)

clf = LogisticRegression(solver="liblinear")
clf.fit(train_vecs_dbow_dmc,y_train)

y_pred = clf.predict_proba(validation_vecs_dbow_dmc)
logloss_dbowdmc = log_loss(y_validation,y_pred)
acc_dbowdmc = clf.score(validation_vecs_dbow_dmc, y_validation)
print("Validation Logloss:", logloss_dbowdmc, "\nValidation Accuracy:", acc_dbowdmc)

Validation Logloss: 0.28430528204900746 
Validation Accuracy: 0.881916387598574


#### As we can see, 

In [98]:
model_dm_avg.wv.most_similar("wonderful")

[('pleasant', 0.7412006258964539),
 ('fantastic', 0.718704342842102),
 ('phenomenal', 0.6945322751998901),
 ('awesome', 0.6865039467811584),
 ('superb', 0.6821604371070862),
 ('sloppy', 0.678415060043335),
 ('incredible', 0.6747259497642517),
 ('speedy', 0.6609603762626648),
 ('awful', 0.6602469682693481),
 ('amazing', 0.6575909852981567)]

In [125]:
model_dm_avg.wv.most_similar("good")

[('tasty', 0.7437097430229187),
 ('decent', 0.6630314588546753),
 ('cool', 0.6535404920578003),
 ('solid', 0.570975124835968),
 ('inconsistent', 0.5689284801483154),
 ('bad', 0.549915075302124),
 ('cheap', 0.5468442440032959),
 ('yummy', 0.5432661771774292),
 ('scary', 0.5387668013572693),
 ('great', 0.5234598517417908)]

In [130]:
model_dm_avg.wv.most_similar("dinner")

[('brunch', 0.7673641443252563),
 ('lunch', 0.7425373792648315),
 ('supper', 0.6696999669075012),
 ('starters', 0.6494625210762024),
 ('searching', 0.6484907865524292),
 ('meeting', 0.6315172910690308),
 ('breakfast', 0.6313809156417847),
 ('graduation', 0.6271716356277466),
 ('dessert', 0.6249339580535889),
 ('annual', 0.6146324872970581)]

# Phrase Modeling
Another thing that can be implemented with Gensim library is phrase detection. It is similar to n-gram, but instead of getting all the n-gram by sliding the window, it detects frequently-used phrases and sticks them together.

$$\frac{{count(A B)}-{count_{min}}} {{count(A)} \times {count(B)}} \times \text{N} \gt \text{threshhold} $$

where:

count(A) is the number of times token A appears in the corpus <br/>
count(B) is the number of times token B appears in the corpus <br/>
count(A B) is the number of times the tokens A B appear in the corpus in order <br/>
N is the total size of the corpus vocabulary <br/>
count_{min} is a user-defined parameter to ensure that accepted phrases occur a minimum number of times <br/>
threshold is a user-defined parameter to control how strong of a relationship between two tokens the model requires before accepting them as a phrase (default threshold used in Gensim's Phrases function is 10.0)

In [7]:
from gensim.models.phrases import Phrases
from gensim.models.phrases import Phraser

In [13]:
tok_train = [r.split() for r in x_train]

In [21]:
%%time
phrases = Phrases(tok_train)
bigram = Phraser(phrases)

Wall time: 9min 39s


In [22]:
def labelize_reviews_bg(reviews,label):
    result = []
    prefix = label
    for i, j in zip(reviews.index,reviews):
        result.append(TaggedDocument(bigram[j.split()],[prefix + '_%s' % i ]))
    return result

In [23]:
%%time
full_tagged_bg= pd.concat([x_train,x_validation,x_test])
full_taggedw2v_bg = labelize_reviews_bg(full_tagged_bg, 'all')

Wall time: 42min 27s


In [24]:
shuffle(full_taggedw2v_bg)

In [20]:
len(full_taggedw2v_bg)

3085663

# DBOW Bigram

In [21]:
%%time
#plain DBOW bigram

model_dbow_bg = Doc2Vec(dm=0, vector_size=150, negative=5, min_count=2, workers=cores, alpha=0.05,sample=0)
model_dbow_bg.build_vocab([x for x in tqdm(full_taggedw2v_bg)])

100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2578879.76it/s]


Wall time: 4min 5s


In [24]:
%%time
for epoch in range(30):
    model_dbow_bg.train([x for x in tqdm(full_taggedw2v_bg)], total_examples=len(full_taggedw2v_bg), epochs=1)
    model_dbow_bg.alpha -= 0.002
    model_dbow_bg.min_alpha = model_dbow_bg.alpha

100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2898672.12it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:00<00:00, 3141000.97it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:00<00:00, 3176412.14it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:00<00:00, 3131359.32it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 3038423.07it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:00<00:00, 3143782.02it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2777146.03it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 3027322.34it/s]
100%|███████████████████████████████████

Wall time: 2h 12min 37s


In [27]:
model_dbow_bg.save("dbow_bg.model")

In [28]:
#model_dbow = Doc2Vec.load("dbow_bg.model")
train_vecs_dbow_bg = get_vectors(model_dbow_bg,x_train)
validation_vecs_dbow_bg = get_vectors(model_dbow_bg, x_validation)

clf = LogisticRegression(solver="liblinear")
clf.fit(train_vecs_dbow_bg,y_train)
#clf.score(validation_vecs_dbow,y_validation)
y_pred = clf.predict_proba(validation_vecs_dbow_bg)
logloss_dbow_bg = log_loss(y_validation,y_pred)
logloss_dbow_bg

0.2091432079255557

In [29]:
%%time
cores = multiprocessing.cpu_count() #12
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

#Distributed Memory (mean) BIGRAM
model_dmm_bg = Doc2Vec(dm=1, dm_mean=1, vector_size=150, window=8, negative=5, hs=0, min_count=2, workers=cores, alpha=0.05, min_alpha=0.0001,epochs=30, sample=0)
    
#Distributed Memory (Concatenation) BIGRAM
model_dmc_bg = Doc2Vec(dm=1,dm_concat=1, vector_size=150, window=8, negative=5, hs=0, min_count=2, workers=cores, alpha=0.05, min_alpha=0.0001,epochs=30, sample=0)

#model_dm_cat1 = Doc2Vec(dm=1,dm_concat=1, vector_size=150, window=8, negative=5, hs=0, min_count=2, workers=cores, alpha=0.05, min_alpha=0.0001,epochs=30, sample=0)

model_dmm_bg.build_vocab([x for x in tqdm(full_taggedw2v_bg)])
model_dmc_bg.build_vocab([x for x in tqdm(full_taggedw2v_bg)])

100%|████████████████████████████████████████████████████████████████████| 3085663/3085663 [00:09<00:00, 338270.15it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 1849920.26it/s]


Wall time: 30min 10s


# DMM BIGRAM

In [32]:
%%time

for epoch in range(30):
    model_dmm_bg.train([x for x in tqdm(full_taggedw2v_bg)], total_examples=len(full_taggedw2v_bg), epochs=1)
    model_dmm_bg.alpha -= 0.002
    model_dmm_bg.min_alpha = model_dmm_bg.alpha

100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2733320.86it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2835982.76it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2795023.60it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2823996.35it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2614992.80it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2815304.00it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2835841.08it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2918933.28it/s]
100%|███████████████████████████████████

Wall time: 3h 26min 17s


In [36]:
model_dmm_bg.save("dbow_dmm_bg.model")

In [8]:
model_dbow_bg = Doc2Vec.load("dbow_bg.model")

In [9]:
model_dmm_bg = Doc2Vec.load("dbow_dmm_bg.model")

In [11]:
train_vecs_dmm_bg = get_vectors(model_dmm_bg,x_train)
validation_vecs_dmm_bg = get_vectors(model_dmm_bg, x_validation)

clf = LogisticRegression(solver="liblinear")
clf.fit(train_vecs_dmm_bg,y_train)
#clf.score(validation_vecs_dbow,y_validation)
y_pred = clf.predict_proba(validation_vecs_dmm_bg)
logloss_dmm_bg = log_loss(y_validation,y_pred)
logloss_dmm_bg

0.625711287309628

# DMC BIGRAM

In [25]:
%%time
cores = multiprocessing.cpu_count() #12

#Distributed Memory (Concatenation) BIGRAM
model_dmc_bg = Doc2Vec(dm=1,dm_concat=1, vector_size=150, window=8, negative=5, hs=0, min_count=2, workers=cores, alpha=0.05, min_alpha=0.0001,epochs=30, sample=0)
model_dmc_bg.build_vocab([x for x in tqdm(full_taggedw2v_bg)])

100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2796892.45it/s]


Wall time: 26min 6s


In [26]:
%%time

for epoch in range(30):
    model_dmc_bg.train([x for x in tqdm(full_taggedw2v_bg)], total_examples=len(full_taggedw2v_bg), epochs=1)
    model_dmc_bg.alpha -= 0.002
    model_dmc_bg.min_alpha = model_dmc_bg.alpha

100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2551664.09it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2191942.19it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2244352.04it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2126928.11it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 1938741.82it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2460026.39it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2089500.38it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2709848.93it/s]
100%|███████████████████████████████████

Wall time: 4h 33min 32s


In [27]:
model_dmc_bg.save("dmc_bg.model")

In [28]:
train_vecs_dmc_bg = get_vectors(model_dmc_bg,x_train)
validation_vecs_dmc_bg = get_vectors(model_dmc_bg, x_validation)

clf = LogisticRegression(solver="liblinear")
clf.fit(train_vecs_dmc_bg,y_train)
#clf.score(validation_vecs_dbow,y_validation)
y_pred = clf.predict_proba(validation_vecs_dmc_bg)
logloss_dmc_bg = log_loss(y_validation,y_pred)
logloss_dmc_bg

0.693094161261186

In [10]:
model_dmc_bg = Doc2Vec.load("dmc_bg.model")

In [12]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
dbow_dmm_bg = ConcatenatedDoc2Vec([model_dbow_bg, model_dmm_bg])
dbow_dmc_bg = ConcatenatedDoc2Vec([model_dbow_bg, model_dmc_bg])

In [17]:
train_vecs_dbow_dmm_bg = get_vectors(dbow_dmm_bg,x_train)
validation_vecs_dbow_dmm_bg = get_vectors(dbow_dmm_bg, x_validation)

clf = LogisticRegression(solver="liblinear")
clf.fit(train_vecs_dbow_dmm_bg,y_train)
#clf.score(validation_vecs_dbow,y_validation)
y_pred = clf.predict_proba(validation_vecs_dbow_dmm_bg)
logloss_dbowdmm_bg = log_loss(y_validation,y_pred)
logloss_dbowdmm_bg

0.2069170050900326

In [18]:
train_vecs_dbow_dmc_bg = get_vectors(dbow_dmc_bg,x_train)
validation_vecs_dbow_dmc_bg = get_vectors(dbow_dmc_bg, x_validation)

clf = LogisticRegression(solver="liblinear")
clf.fit(train_vecs_dbow_dmc_bg,y_train)
#clf.score(validation_vecs_dbow,y_validation)
y_pred = clf.predict_proba(validation_vecs_dbow_dmc_bg)
logloss_dbowdmc_bg = log_loss(y_validation,y_pred)
logloss_dbowdmc_bg

0.2091432070267747

# Trigram

If we run the same phrase detection again on bigram detected corpus, now it will detect trigram phrases.

In [29]:
%%time
phrases_tri = Phrases(bigram[tok_train])
trigram = Phraser(phrases_tri)

Wall time: 23min 15s


In [30]:
def labelize_reviews_tg(reviews,label):
    result = []
    prefix = label
    for i, j in zip(reviews.index,reviews):
        result.append(TaggedDocument(trigram[bigram[j.split()]],[prefix + '_%s' % i ]))
    return result

In [31]:
%%time
full_tagged_tg= pd.concat([x_train,x_validation,x_test])
full_taggedw2v_tg = labelize_reviews_tg(full_tagged_tg, 'all')

Wall time: 1h 13min 29s


In [32]:
shuffle(full_taggedw2v_tg)

In [41]:
%%time
#plain DBOW trigram

model_dbow_tg = Doc2Vec(dm=0, vector_size=150, negative=5, min_count=2, workers=cores, alpha=0.05,sample=0)
model_dbow_tg.build_vocab([x for x in tqdm(full_taggedw2v_tg)])

100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2338471.73it/s]


Wall time: 41min 22s


In [42]:
%%time
for epoch in range(30):
    model_dbow_tg.train([x for x in tqdm(full_taggedw2v_tg)], total_examples=len(full_taggedw2v_tg), epochs=1)
    model_dbow_tg.alpha -= 0.002
    model_dbow_tg.min_alpha = model_dbow_tg.alpha

100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2237168.28it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2906597.94it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2897456.64it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2958828.47it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2945361.86it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2973237.54it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2408721.71it/s]
100%|████████████████████████████████████████████████████████████████████| 3085663/3085663 [00:03<00:00, 779209.18it/s]
100%|███████████████████████████████████

Wall time: 2h 39min 43s


In [44]:
model_dbow_tg.save("dbow_tg.model")

In [22]:
model_dbow_tg = Doc2Vec.load("dbow_tg.model")

In [17]:
#model_dbow = Doc2Vec.load("dbow_bg.model")
train_vecs_dbow_tg = get_vectors(model_dbow_tg,x_train)
validation_vecs_dbow_tg = get_vectors(model_dbow_tg, x_validation)

clf = LogisticRegression(solver="liblinear")
clf.fit(train_vecs_dbow_tg,y_train)
#clf.score(validation_vecs_dbow,y_validation)
y_pred = clf.predict_proba(validation_vecs_dbow_tg)
logloss_dbow_tg = log_loss(y_validation,y_pred)
logloss_dbow_tg

0.20806212875146537

In [33]:
%%time
cores = multiprocessing.cpu_count() #12
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"

#Distributed Memory (mean) BIGRAM
model_dmm_tg = Doc2Vec(dm=1, dm_mean=1, vector_size=150, window=8, negative=5, hs=0, min_count=2, workers=cores, alpha=0.05, min_alpha=0.0001,epochs=30, sample=0)
    
#Distributed Memory (Concatenation) BIGRAM
model_dmc_tg = Doc2Vec(dm=1,dm_concat=1, vector_size=150, window=8, negative=5, hs=0, min_count=2, workers=cores, alpha=0.05, min_alpha=0.0001,epochs=30, sample=0)

#model_dm_cat1 = Doc2Vec(dm=1,dm_concat=1, vector_size=150, window=8, negative=5, hs=0, min_count=2, workers=cores, alpha=0.05, min_alpha=0.0001,epochs=30, sample=0)

model_dmm_tg.build_vocab([x for x in tqdm(full_taggedw2v_tg)])
model_dmc_tg.build_vocab([x for x in tqdm(full_taggedw2v_tg)])

100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2673284.84it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2413416.43it/s]


Wall time: 35min 12s


In [38]:
%%time

for epoch in range(30):
    model_dmm_tg.train([x for x in tqdm(full_taggedw2v_tg)], total_examples=len(full_taggedw2v_tg), epochs=1)
    model_dmm_tg.alpha -= 0.002
    model_dmm_tg.min_alpha = model_dmm_tg.alpha

100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2678458.84it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2565954.22it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2664673.24it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:00<00:00, 3095948.87it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2407243.69it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:00<00:00, 3092953.11it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:01<00:00, 2981727.40it/s]
100%|███████████████████████████████████████████████████████████████████| 3085663/3085663 [00:00<00:00, 3143782.02it/s]
100%|███████████████████████████████████

Wall time: 3h 1min 48s


In [39]:
model_dmm_tg.save("dmm_tg.model")

In [19]:
model_dmm_tg = Doc2Vec.load("dmm_tg.model")

In [20]:
train_vecs_dmm_tg = get_vectors(model_dmm_tg,x_train)
validation_vecs_dmm_tg = get_vectors(model_dmm_tg, x_validation)

clf = LogisticRegression(solver="liblinear")
clf.fit(train_vecs_dmm_tg,y_train)
#clf.score(validation_vecs_dbow,y_validation)
y_pred = clf.predict_proba(validation_vecs_dmm_tg)
logloss_dmm_tg = log_loss(y_validation,y_pred)
logloss_dmm_tg

0.3275229428405004

In [23]:
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
dbow_dmm_tg = ConcatenatedDoc2Vec([model_dbow_tg, model_dmm_tg])
#dbow_dmc_tg = ConcatenatedDoc2Vec([model_dbow_tg, model_dmc_tg])

In [24]:
train_vecs_dbow_dmm_tg = get_vectors(dbow_dmm_tg,x_train)
validation_vecs_dbow_dmm_tg = get_vectors(dbow_dmm_tg, x_validation)

clf = LogisticRegression(solver="liblinear")
clf.fit(train_vecs_dbow_dmm_tg,y_train)
#clf.score(validation_vecs_dbow,y_validation)
y_pred = clf.predict_proba(validation_vecs_dbow_dmm_tg)
logloss_dbowdmm_tg = log_loss(y_validation,y_pred)
logloss_dbowdmm_tg

0.19275270351977944