***Feature Extraction:***
  
  1. Bag of Words
  2. TFIDF vectorizer
  3. Word2Vec

**In this section we will talk about feature extraction examples of text data.**

In [6]:
# required imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# First lines from novel 'Tale of two cities' by Charles Dickens
# Source: The Project Gutenberg
tale_of_cities = ['it was the best of times',
                  'it was the worst of times',
                  'it was the age of wisdom',
                  'it was the age of foolishness',
                  'it was the epoch of belief'  
                  ]

# Convert a collection of text documents to a matrix of number of features
vectorizer = CountVectorizer()
features = vectorizer.fit_transform(tale_of_cities)

In [3]:
# Printing the identified Unique words along with their indices 
print("Vocabulary: ", vectorizer.vocabulary_)

Vocabulary:  {'it': 5, 'was': 9, 'the': 7, 'best': 2, 'of': 6, 'times': 8, 'worst': 11, 'age': 0, 'wisdom': 10, 'foolishness': 4, 'epoch': 3, 'belief': 1}


In [4]:
print('The feature names: ', vectorizer.get_feature_names())
print('Number of features: ', len(vectorizer.get_feature_names()))
print('The vectorized text (Encoded): ')
print(features.toarray())

The feature names:  ['age', 'belief', 'best', 'epoch', 'foolishness', 'it', 'of', 'the', 'times', 'was', 'wisdom', 'worst']
Number of features:  12
The vectorized text (Encoded): 
[[0 0 1 0 0 1 1 1 1 1 0 0]
 [0 0 0 0 0 1 1 1 1 1 0 1]
 [1 0 0 0 0 1 1 1 0 1 1 0]
 [1 0 0 0 1 1 1 1 0 1 0 0]
 [0 1 0 1 0 1 1 1 0 1 0 0]]


In [8]:
tale_of_cities = ['it was the best of times',
                  'it was the worst of times',
                  'it was the age of wisdom',
                  'it was the age of foolishness',
                  'it was the epoch of belief'  
                  ]

In [9]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_features = tfidf_vectorizer.fit_transform(tale_of_cities)
print(tfidf_vectorizer.get_feature_names())
print(tfidf_features.shape)

['age', 'belief', 'best', 'epoch', 'foolishness', 'it', 'of', 'the', 'times', 'was', 'wisdom', 'worst']
(5, 12)


In [10]:
print('The feature names: ', tfidf_vectorizer.get_feature_names())
print('Number of features: ', len(tfidf_vectorizer.get_feature_names()))
print('The vectorized text (Encoded): ')
print(tfidf_features.toarray())

The feature names:  ['age', 'belief', 'best', 'epoch', 'foolishness', 'it', 'of', 'the', 'times', 'was', 'wisdom', 'worst']
Number of features:  12
The vectorized text (Encoded): 
[[0.         0.         0.62510433 0.         0.         0.29786556
  0.29786556 0.29786556 0.50433024 0.29786556 0.         0.        ]
 [0.         0.         0.         0.         0.         0.29786556
  0.29786556 0.29786556 0.50433024 0.29786556 0.         0.62510433]
 [0.50433024 0.         0.         0.         0.         0.29786556
  0.29786556 0.29786556 0.         0.29786556 0.62510433 0.        ]
 [0.50433024 0.         0.         0.         0.62510433 0.29786556
  0.29786556 0.29786556 0.         0.29786556 0.         0.        ]
 [0.         0.5863888  0.         0.5863888  0.         0.27941741
  0.27941741 0.27941741 0.         0.27941741 0.         0.        ]]


**Word2Vec embeddings**

In [20]:
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec

# common texts data from gensim library.
common_texts

[['human', 'interface', 'computer'],
 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 ['eps', 'user', 'interface', 'system'],
 ['system', 'human', 'system', 'eps'],
 ['user', 'response', 'time'],
 ['trees'],
 ['graph', 'trees'],
 ['graph', 'minors', 'trees'],
 ['graph', 'minors', 'survey']]

In [25]:
# Word2Vec model 
model = Word2Vec(common_texts, size=100, window=5, min_count=1)

# Vector for a specific word, for ex: human 
model.wv["human"]

array([ 3.43388971e-03, -3.61608784e-03,  4.48620087e-03, -4.94819973e-03,
       -8.77382117e-04,  2.89989356e-03,  1.67383300e-03,  3.65505158e-03,
        1.05970865e-03, -1.77398499e-03, -3.17351150e-05, -1.00901467e-03,
       -4.70360555e-03,  2.45962106e-03, -8.47141142e-04,  4.22295090e-03,
       -3.32458131e-03, -2.70062871e-03,  3.02404957e-03, -2.85309227e-03,
        4.08883300e-03, -2.22729053e-03, -1.20713876e-03,  2.73951446e-03,
        7.23199802e-04, -2.36160099e-03, -4.77821007e-03, -1.00123005e-04,
       -1.68618335e-05,  2.86461227e-03,  2.54028972e-04, -2.92142574e-03,
       -9.71496222e-04, -3.78758716e-03, -9.86306812e-04, -1.51610945e-03,
       -1.18273031e-03, -2.92744080e-04,  9.44134837e-04, -2.54136883e-03,
       -1.26092217e-03,  1.43130543e-03, -3.20353522e-03, -4.49500978e-03,
        3.64622171e-03,  2.72988564e-05, -1.49934110e-03,  4.62230807e-03,
        4.85059991e-03,  4.40129777e-04,  3.59853124e-03, -3.15364194e-03,
       -1.91330467e-03, -

In [26]:
# Calculate the most similar words to human
model.wv.most_similar("human")

  if np.issubdtype(vec.dtype, np.int):


[('response', 0.08395902812480927),
 ('graph', 0.07479733973741531),
 ('time', 0.06893133372068405),
 ('survey', 0.04686649888753891),
 ('system', 0.031810905784368515),
 ('computer', 0.020856063812971115),
 ('minors', -0.02121073007583618),
 ('eps', -0.06385811418294907),
 ('user', -0.07140786200761795),
 ('interface', -0.07448561489582062)]

In [19]:
from gensim.models import Word2Vec
import spacy
nlp = spacy.load('en_core_web_sm')

# define training data
corpus = ['it was the best of times',
            'it was the worst of times',
            'it was the age of wisdom',
            'it was the age of foolishness',
            'it was the epoch of belief']

sentences = list() 
for sen in corpus:
  x = nlp(sen)
  y = list()
  for token in x:
    y.append(token.text)
  sentences.append(y)

print(sentences)

# train model
model = Word2Vec(sentences, min_count=1)
# summarize the loaded model
print(model)
# summarize vocabulary
words = list(model.wv.vocab)
print(words)

# access vector for one word
print('Vector for a word in one of the sentences')
print(model['wisdom'])
# save model
model.save('model.bin')
# load model
new_model = Word2Vec.load('model.bin')
print(new_model)

[['it', 'was', 'the', 'best', 'of', 'times'], ['it', 'was', 'the', 'worst', 'of', 'times'], ['it', 'was', 'the', 'age', 'of', 'wisdom'], ['it', 'was', 'the', 'age', 'of', 'foolishness'], ['it', 'was', 'the', 'epoch', 'of', 'belief']]
Word2Vec(vocab=12, size=100, alpha=0.025)
['it', 'was', 'the', 'best', 'of', 'times', 'worst', 'age', 'wisdom', 'foolishness', 'epoch', 'belief']
vector for a word in one of the sentences
[ 9.1444037e-04  7.7333819e-04  5.2312732e-04  1.0214958e-05
 -4.7556385e-03 -2.6045598e-03 -1.1960843e-03 -3.4238556e-03
 -1.9484337e-03 -3.6921909e-03 -4.5418143e-03  2.5264039e-03
 -4.0701227e-03 -2.7972648e-03  4.1562091e-03 -8.6244848e-04
  4.1778055e-03 -2.9510350e-04 -4.5203120e-03 -2.7840114e-03
  1.0026008e-03  1.8192318e-03  1.6008198e-04  4.8879418e-03
  2.7620923e-04  3.5321589e-03 -4.2924713e-03 -4.3855542e-03
 -4.5332210e-03  3.5528119e-03 -3.8095668e-03 -6.3364644e-04
 -1.8129537e-03  4.3134266e-03  4.3559084e-03  2.7797751e-03
  4.5660986e-03  3.6586011e-0

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


  """Entry point for launching an IPython kernel.


In [16]:
words = list(model.wv.vocab)
print(words)

['it', 'was', 'the', 'best', 'of', 'times', 'worst', 'age', 'wisdom', 'foolishness', 'epoch', 'belief']
