In [1]:
# Read from CSV file.

import pandas as pd

df = pd.read_csv('./clean_text.csv').sample(frac=1).reset_index()

print(len(df.index))
df['clean_text'].head()

9090


0                       take risk take fall want worth
1    president trump demand steel barrier along bor...
2                                           well weird
3    thought family friend colleague passing loss c...
4    share update inspiring day thought leader indu...
Name: clean_text, dtype: object

In [2]:
# Create TF-IDF (1 text).

import numpy as np

tfidf = df['clean_text'][0:1].apply(lambda text: pd.value_counts(text.split(" "))).sum(axis=0).reset_index()
tfidf.columns = ['words', 'tf']
for i, word in enumerate(tfidf['words']):
    tfidf.loc[i, 'idf'] = np.log(df.shape[0] / len(df[df['clean_text'].str.contains(word)]))
tfidf['tf-idf'] = tfidf['tf'] * tfidf['idf']

tfidf.head()

Unnamed: 0,words,tf,idf,tf-idf
0,take,2,3.569753,7.139505
1,want,1,3.961639,3.961639
2,risk,1,5.936876,5.936876
3,worth,1,5.713733,5.713733
4,fall,1,5.22311,5.22311


In [3]:
# Create TF-IDF (all texts).

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(df['clean_text'])
tfidf = pd.DataFrame(matrix.toarray())
tfidf.columns = vectorizer.get_feature_names()

tfidf.head()

Unnamed: 0,abide,ability,able,aboard,abortion,absence,absolute,absolutely,abstain,abundance,...,yesterday,yet,yield,york,young,yoy,zero,zip,zone,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Train word2vec model.

from gensim.scripts.glove2word2vec import glove2word2vec

glove_input_file = 'glove.6B.100d.txt'
glove_output_file = 'glove.6B.100d.word2vec'

glove2word2vec(glove_input_file, glove_output_file)

(400000, 100)

In [5]:
# Load word2vec model.

from gensim.models import KeyedVectors

model = KeyedVectors.load_word2vec_format(glove_output_file, binary=False)

In [6]:
# Create word embeddings.

embeddings = {}
for word in vectorizer.get_feature_names():
    try:
        embeddings[word] = model[word]
    except Exception:
        print('No mapping for', word)

embeddings

No mapping for instructress
No mapping for nonproduction
No mapping for signless
No mapping for wishless


{'abide': array([ 0.060709 , -0.56275  ,  0.019162 ,  0.14465  , -0.52792  ,
         0.29845  , -0.72699  ,  0.84775  ,  0.084771 ,  0.37012  ,
        -0.15887  , -0.057563 ,  1.3134   , -0.11969  , -1.0545   ,
        -0.1652   ,  0.34165  ,  0.63698  , -0.89455  , -0.42239  ,
         0.28508  , -0.79749  ,  0.17691  , -0.04654  , -0.071773 ,
         0.1187   , -0.70294  , -0.89851  ,  0.31437  , -0.33514  ,
         0.26814  ,  0.35679  ,  0.019971 ,  0.29902  , -0.13943  ,
         0.55649  ,  0.78055  , -1.0524   , -0.78299  , -0.23598  ,
        -0.28508  ,  0.85992  ,  0.45294  ,  0.29103  , -0.55289  ,
        -0.45005  , -0.34672  , -0.18736  , -0.71243  , -1.0294   ,
         0.68814  ,  0.35451  ,  0.35617  , -0.0019833,  0.059631 ,
        -0.0029684,  1.1579   , -0.11614  ,  0.77923  , -0.31925  ,
        -0.021383 , -0.88937  , -1.1872   , -0.5677   ,  0.19601  ,
         0.42696  , -0.44178  ,  1.1775   , -0.34295  , -0.59526  ,
         0.12521  , -0.15675  ,  0.1405