# Pre-trained Dataset with Doc2Vec
### 1. Data import
### 2. Create a wordlist of all inputs
### 3. Load the model and the keyedVector
### 4. Compare the words of the embedding with the words of our text
### 5. Find the words that appear in our text but not in the embedding model
### 6. Test whether different punctuation marks are contained in the embedding model
### 7. Print the most frequent words to get insights
### 8. Infer vectors
### 9. Create dataframe, add score and save output

**1. Data import**


In [0]:
# Mount Google Drive at /content/drive so that files stored under
# "/content/drive/My Drive" can be read and written from this notebook.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Path of the review CSV that is loaded below.
# Alternative input file (currently disabled):
# CSVLInk = r'/content/drive/My Drive/test2_token_excludeSpecPuct.csv'
# NOTE(review): the active file name suggests tokenized, stop-word-filtered and
# stemmed reviews - confirm against the pre-processing notebook.
CSVLInk = r'/content/drive/My Drive/tokenTrue_remStpwrdsTrue_stemmTrue_lemmatizeFalse_nGramFalse_nGram_length2.csv'


In [0]:
import pandas as pd
import io
import time

# Read the review data; the columns "Review" and "Reviewer_Score" are used.
hotelData = pd.read_csv(CSVLInk) 
# Turn every "Review" cell into a list of words: strip the characters ' ] [
# from both ends of the string, then split on the separator "', '".
# NOTE(review): a cell containing "[]" becomes [''] (one empty-string token),
# and leading/trailing quotes or brackets that belong to a word are stripped too.
review = hotelData["Review"].apply(lambda row: row.strip("']['").split("', '"))
# Target values, kept to be attached to the inferred vectors later.
score = hotelData["Reviewer_Score"]

**2. Create a wordlist of all inputs**

In [0]:
# Source: https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings/comments
from tqdm import tqdm
tqdm.pandas()

def build_vocab(sentences, verbose=True):
    """Count how often each word occurs across all sentences.

    :param sentences: iterable of sentences, each an iterable of words
    :param verbose: if true, show a tqdm progress bar while counting
    :return: dictionary mapping each word to its number of occurrences
    """
    from collections import Counter

    # Only build the progress bar when it is actually going to be shown.
    progress = tqdm(sentences) if verbose else sentences
    counts = Counter()
    for sentence in progress:
        counts.update(sentence)
    # Return a plain dict so that looking up an unknown word still raises
    # KeyError, exactly as callers of the dict-based version expect.
    return dict(counts)

In [0]:
# Word -> occurrence count over all parsed reviews.
vocab = build_vocab(review)

100%|██████████| 515738/515738 [00:02<00:00, 256631.88it/s]


**3. Load the model and the keyedVector**

In [0]:
# Path of the saved Doc2Vec model that is loaded below.
# NOTE(review): the directory name suggests an English-Wikipedia DBOW model - confirm.
modelLNK = r'/content/drive/My Drive/enwiki_dbow/doc2vec.bin'

In [0]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.test.utils import get_tmpfile
from gensim.models import KeyedVectors
import gensim.models as g
import time

# Load the saved Doc2Vec model and report how long loading took.
firstTime = time.time()
model = g.Doc2Vec.load(modelLNK)

# Word vectors of the model; used below for the vocabulary lookups.
embeddings_index = model.wv

elapsed_seconds = round(time.time() - firstTime, 4)
print("--- %s seconds ---" % elapsed_seconds)

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


--- 57.7371 seconds ---




**4. Compare the words of the embedding with the words of our text**

In [0]:
# source https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings/comments
import operator 

def check_coverage(vocab, embeddings_index, verbose=True):
    """Report how much of the vocabulary is covered by the embedding.

    Prints the share of distinct words and the share of all word
    occurrences for which ``embeddings_index`` provides an entry.

    :param vocab: dictionary mapping each word to its occurrence count
    :param embeddings_index: object supporting ``embeddings_index[word]``
        that raises KeyError for words it does not know
    :param verbose: if true, show a tqdm progress bar while checking
    :return: list of ``(word, count)`` tuples for the words without an
        embedding, sorted by count in descending order
    """
    oov = {}
    known_words = 0   # distinct words that have an embedding
    known_count = 0   # occurrences of words that have an embedding
    oov_count = 0     # occurrences of words that have none
    words = tqdm(vocab) if verbose else vocab
    for word in words:
        try:
            embeddings_index[word]
        except KeyError:
            # Only a missing key means "out of vocabulary"; any other error
            # (or a keyboard interrupt) is allowed to propagate.
            oov[word] = vocab[word]
            oov_count += vocab[word]
        else:
            known_words += 1
            known_count += vocab[word]

    # Guard both ratios so an empty vocabulary reports 0% instead of raising
    # ZeroDivisionError.
    total_words = len(vocab)
    total_count = known_count + oov_count
    word_ratio = known_words / total_words if total_words else 0.0
    text_ratio = known_count / total_count if total_count else 0.0

    print("\n")
    print('Found embeddings for {:.2%} of vocab'.format(word_ratio))
    print('Found embeddings for  {:.2%} of all text'.format(text_ratio))

    # Stable ascending sort followed by a reversal; kept in this form (rather
    # than reverse=True) so that words with equal counts keep the same
    # relative order as before.
    sorted_x = sorted(oov.items(), key=operator.itemgetter(1))[::-1]

    return sorted_x

In [0]:
# (word, count) pairs of review words without an entry in the embedding,
# highest count first.
oov = check_coverage(vocab,embeddings_index)

100%|██████████| 81291/81291 [00:00<00:00, 330542.77it/s]



Found embeddings for 54.03% of vocab
Found embeddings for  99.39% of all text





**5. Find the words that appear in our text but not in the embedding model**

In [0]:
# Show the ten out-of-vocabulary words with the highest counts.
oov[:10]

[('wouldn', 3350),
 ('helpfull', 2309),
 ('confortable', 1218),
 ('hadn', 1163),
 ('wasnt', 1141),
 ('spotlessly', 1059),
 ('couldnt', 814),
 ('30am', 803),
 ('30pm', 554),
 ('coffe', 536)]

**6. Test whether different punctuation marks are contained in the embedding model**

In [0]:
import string

# Check whether punctuation marks occur as tokens in the review vocabulary.
# The membership test is against `vocab` (word counts of our text), so the
# printed label names the text vocabulary and not the embedding.
punctuationList = string.punctuation
# Add typographic quotes and the em dash, which string.punctuation lacks.
punctuationList = punctuationList+"’" + "”" + "“"+"—"
for punctuation in punctuationList:
    print(punctuation, " in the text vocabulary: ",punctuation in vocab)

!  in the embedding:  False
"  in the embedding:  False
#  in the embedding:  False
$  in the embedding:  False
%  in the embedding:  False
&  in the embedding:  False
'  in the embedding:  False
(  in the embedding:  False
)  in the embedding:  False
*  in the embedding:  False
+  in the embedding:  False
,  in the embedding:  False
-  in the embedding:  False
.  in the embedding:  False
/  in the embedding:  False
:  in the embedding:  False
;  in the embedding:  False
<  in the embedding:  False
=  in the embedding:  False
>  in the embedding:  False
?  in the embedding:  False
@  in the embedding:  False
[  in the embedding:  False
\  in the embedding:  False
]  in the embedding:  False
^  in the embedding:  False
_  in the embedding:  False
`  in the embedding:  False
{  in the embedding:  False
|  in the embedding:  False
}  in the embedding:  False
~  in the embedding:  False
’  in the embedding:  False
”  in the embedding:  False
“  in the embedding:  False
—  in the embedding:

In [0]:
import string

# ASCII punctuation plus typographic quotes and the em dash.
punctuationList = string.punctuation + "’”“—"
# Report for every mark whether the embedding contains it as an entry.
for punctuation in punctuationList:
    is_known = punctuation in embeddings_index
    print(punctuation, " in the embedding: ", is_known)

!  in the embedding:  True
"  in the embedding:  False
#  in the embedding:  True
$  in the embedding:  True
%  in the embedding:  True
&  in the embedding:  True
'  in the embedding:  True
(  in the embedding:  False
)  in the embedding:  False
*  in the embedding:  True
+  in the embedding:  True
,  in the embedding:  True
-  in the embedding:  True
.  in the embedding:  True
/  in the embedding:  True
:  in the embedding:  True
;  in the embedding:  True
<  in the embedding:  True
=  in the embedding:  True
>  in the embedding:  True
?  in the embedding:  True
@  in the embedding:  True
[  in the embedding:  False
\  in the embedding:  True
]  in the embedding:  False
^  in the embedding:  True
_  in the embedding:  True
`  in the embedding:  True
{  in the embedding:  False
|  in the embedding:  True
}  in the embedding:  False
~  in the embedding:  True
’  in the embedding:  False
”  in the embedding:  False
“  in the embedding:  False
—  in the embedding:  False


**7. Print the most frequent words to get insights**

In [0]:
# Print the first 20 entries of the embedding's index.
# NOTE(review): presumably the index is ordered by descending frequency - confirm.
# Slicing (instead of indexing positions 0..19) avoids an IndexError when the
# model holds fewer than 20 entries.
for entity in embeddings_index.index2entity[:20]:
    print(entity)

the
,
.
of
and
in
a
to
was
''
``
is
for
-rrb-
-lrb-
as
on
with
by
he


**8. Infer vectors**

In [0]:
firstTime = time.time()

# Inference hyper-parameters (source: paper)
start_alpha = 0.01
min_alpha = 0.0001
infer_epoch = 750

# Infer one document vector per review, in the original review order.
vectors = [
    model.infer_vector(text, alpha=start_alpha, min_alpha=min_alpha, epochs=infer_epoch)
    for text in review.tolist()
]

# NOTE(review): the original line carried the annotation "2.36245275";
# its meaning is not documented.
elapsed_seconds = round(time.time() - firstTime, 4)
print("--- %s seconds ---" % elapsed_seconds)

--- 44.1035 seconds ---


**9. Create dataframe, add score and save output**

In [0]:
# One row per inferred vector, with the matching review score attached as the
# "Reviewer_Score" column (aligned on the row index).
df = pd.DataFrame(vectors).assign(Reviewer_Score=score)

In [0]:
# Write the vectors and scores to Drive; index=False leaves the row index out of the file.
df.to_csv("/content/drive/My Drive/Pretrained.csv", index=False)