In [1]:
# Read from CSV file.

import pandas as pd

df = pd.read_csv('./clean_text.csv').sample(frac=1).reset_index()

print(len(df.index))
df['clean_text'].head()

9110


0                     hacker designed kill people know
1               story sticker general moment came life
2    major trump administration tell federal appeal...
3                         lee genius superhero creator
4                     life perfect thankful everything
Name: clean_text, dtype: object

In [2]:
# Create TF-IDF (1 text).

import numpy as np

tfidf = df['clean_text'][0:1].apply(lambda text: pd.value_counts(text.split(" "))).sum(axis=0).reset_index()
tfidf.columns = ['words', 'tf']
for i, word in enumerate(tfidf['words']):
    tfidf.loc[i, 'idf'] = np.log(df.shape[0] / len(df[df['clean_text'].str.contains(word)]))
tfidf['tf-idf'] = tfidf['tf'] * tfidf['idf']

tfidf.head()

Unnamed: 0,words,tf,idf,tf-idf
0,hacker,1,5.288487,5.288487
1,people,1,2.98373,2.98373
2,know,1,3.308986,3.308986
3,designed,1,5.651392,5.651392
4,kill,1,4.868633,4.868633


In [3]:
# Create TF-IDF (all texts).

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
matrix = vectorizer.fit_transform(df['clean_text'])
tfidf = pd.DataFrame(matrix.toarray())
tfidf.columns = vectorizer.get_feature_names()

tfidf.head()

Unnamed: 0,abide,ability,able,aboard,absence,absolute,absolutely,abstain,abstract,abundance,...,yesterday,yet,yield,york,young,younger,youth,zero,zone,zoo
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Train word2vec model.

# from gensim.scripts.glove2word2vec import glove2word2vec

# glove_input_file = 'glove.twitter.27B.100d.txt'
# glove_output_file = 'glove.twitter.27B.100d.word2vec'

# glove2word2vec(glove_input_file, glove_output_file)

In [5]:
# Load word2vec model.

from gensim.models import KeyedVectors

glove_output_file = 'glove.twitter.27B.100d.word2vec'
model = KeyedVectors.load_word2vec_format(glove_output_file, binary=False)

In [6]:
# Create word embeddings.

embeddings = {}
for word in vectorizer.get_feature_names():
    try:
        embeddings[word] = model[word]
    except Exception:
        print('No mapping for', word)

embeddings

No mapping for instructress
No mapping for nonproduction
No mapping for pleasantness
No mapping for propellant
No mapping for signless
No mapping for sledder
No mapping for suborbital
No mapping for wishless


{'abide': array([ 0.46581  ,  0.42211  ,  0.22422  ,  0.82108  , -0.43116  ,
         0.18205  , -0.16291  , -1.0801   , -0.29351  ,  0.65358  ,
        -0.39729  , -0.73855  , -1.2513   , -0.27769  , -0.025439 ,
        -0.42061  , -0.094182 ,  1.019    ,  0.56854  , -0.45576  ,
         0.45006  ,  0.72047  , -0.27799  , -0.27897  , -0.5108   ,
         0.93302  ,  0.64565  ,  0.53983  , -0.0059393,  0.47068  ,
         0.0036054,  0.73621  ,  0.15819  ,  0.32443  , -0.29647  ,
         0.24127  , -0.26377  ,  0.2916   ,  0.013446 ,  0.25593  ,
         0.4242   , -0.15234  ,  0.8882   ,  0.11734  ,  0.768    ,
        -0.78761  ,  0.12849  , -0.57026  ,  0.4993   , -0.23081  ,
         0.083178 , -0.45304  , -0.29042  ,  0.41229  , -1.5814   ,
        -1.1877   ,  0.2704   ,  0.41373  , -0.68143  ,  0.8882   ,
         0.23967  ,  0.72708  , -1.1151   , -0.35221  , -0.23394  ,
         0.70419  , -0.30911  ,  0.32363  , -0.82294  ,  0.24568  ,
        -0.69415  , -0.43644  , -0.9884