# Alternative Notebook while other notebook runs the model

In [35]:
# %%capture
print("Versions")
# Data science
import pandas as pd
print(f"Pandas: {pd.__version__}")
import numpy as np
print(f"Numpy: {np.__version__}")
# Deep Learning 
import tensorflow as tf
print(f"Tensorflow: {tf.__version__}")
from tensorflow import keras
print(f"Keras: {keras.__version__}")
import sklearn
print(f"Sklearn: {sklearn.__version__}")
# Visualization 
import seaborn as sns
import matplotlib.pyplot as plt
sns.set(style="ticks", color_codes=True)
# Regular Expressions
import re
print(f're: {re.__version__}')
# Preprocessing
import spacy
print(f'spaCy: {spacy.__version__}')
import nltk
print(f'nltk: {nltk.__version__}')
from spellchecker import SpellChecker
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import casual_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
# Word Embeddings
import gensim
print(f'gensim: {gensim.__version__}')
# Emoji analysis and language detection
from spacymoji import Emoji
import spacy_cld
# Cool progress bars
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()  # Enable tracking of execution progress
# For loading and saving objects
import pickle

Versions
Pandas: 1.0.1
Numpy: 1.18.1
Tensorflow: 2.0.0
Keras: 2.2.4-tf
Sklearn: 0.22.1
re: 2.2.1
spaCy: 2.1.8
nltk: 3.4.5
gensim: 3.8.0


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

  from pandas import Panel


In [7]:
!ls

Embeddings.ipynb                     processed_X_train.pkl
Natural Language Understanding.ipynb ten_clust.csv
[1m[36mdata[m[m                                 tests.py
[1m[36mmatt_chatbot[m[m                         words_rank.csv
processed_X_test.pkl                 words_rank_postprocessed.csv


In [None]:
# Loading in the data

In [8]:
processed_X_train = pd.read_pickle('processed_X_train.pkl')
processed_X_test = pd.read_pickle('processed_X_test.pkl')

In [9]:
processed_X_train.head()

38343    [ever, since, late, watch, o, update, get, not...
26943    [work, apple, support, get, result, paid, deve...
18403    [many, stupid, bug, new, update, ready, throw,...
19883    [take, 7, plus, charger, get, ready, work, tim...
87560    [log, itunes, account, lap-fucking-top, god, g...
Name: inbound_text, dtype: object

In [10]:
processed_X_test.head()

39227    [possibile, save, photo, dropbox, photo, app, ...
96901    [can, not, seem, make, genius, bar, appt, onli...
32619    [swear, ..., 3rd, io, update, make, iphone, ba...
71706    [phone, tell, wifi, password, network, wrong, ...
478      [wire, thing, update, 6, gig, available, 10, k...
Name: inbound_text, dtype: object

# Doc2Vec Embedding

Word2Vec uses Continuous Bag of Words, which creates a sliding window around each word to predict it from context (surrouding words), and the Skip Gram model.

https://medium.com/wisio/a-gentle-introduction-to-doc2vec-db3e8c0cce5e

In [11]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [17]:
data = processed_X_train.iloc[0]

In [19]:
tagged_data = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(data)]

In [20]:
tagged_data

[TaggedDocument(words=['ever'], tags=['0']),
 TaggedDocument(words=['since'], tags=['1']),
 TaggedDocument(words=['late'], tags=['2']),
 TaggedDocument(words=['watch'], tags=['3']),
 TaggedDocument(words=['o'], tags=['4']),
 TaggedDocument(words=['update'], tags=['5']),
 TaggedDocument(words=['get'], tags=['6']),
 TaggedDocument(words=['notification'], tags=['7']),
 TaggedDocument(words=['mindfully'], tags=['8']),
 TaggedDocument(words=['breath'], tags=['9']),
 TaggedDocument(words=['get'], tags=['10']),
 TaggedDocument(words=['3'], tags=['11']),
 TaggedDocument(words=['5'], tags=['12']),
 TaggedDocument(words=['per'], tags=['13']),
 TaggedDocument(words=['day'], tags=['14']),
 TaggedDocument(words=['fix'], tags=['15'])]

Now that the tagging is finished, let's start training our model!

In [21]:
max_epochs = 100
vec_size = 20
alpha = 0.025

model = Doc2Vec(size=vec_size, 
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=1,
                dm =1)
  
model.build_vocab(tagged_data)

for epoch in range(max_epochs):
    print('iteration {0}'.format(epoch))
    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # Decrease the learning rate
    model.alpha -= 0.0002
    # Fix the learning rate, no decay
    model.min_alpha = model.alpha

model.save("d2v.model")
print("Model Saved")



iteration 0
iteration 1
iteration 2
iteration 3
iteration 4
iteration 5
iteration 6
iteration 7
iteration 8
iteration 9
iteration 10
iteration 11
iteration 12
iteration 13
iteration 14
iteration 15
iteration 16
iteration 17
iteration 18
iteration 19
iteration 20
iteration 21
iteration 22
iteration 23
iteration 24
iteration 25
iteration 26
iteration 27
iteration 28
iteration 29
iteration 30
iteration 31
iteration 32
iteration 33
iteration 34
iteration 35
iteration 36
iteration 37
iteration 38
iteration 39
iteration 40
iteration 41
iteration 42
iteration 43
iteration 44
iteration 45
iteration 46
iteration 47
iteration 48
iteration 49
iteration 50
iteration 51
iteration 52
iteration 53
iteration 54
iteration 55
iteration 56
iteration 57
iteration 58
iteration 59
iteration 60
iteration 61
iteration 62
iteration 63
iteration 64
iteration 65
iteration 66
iteration 67
iteration 68
iteration 69
iteration 70
iteration 71
iteration 72
iteration 73
iteration 74
iteration 75
iteration 76
iteration

In [None]:
model= Doc2Vec.load("d2v.model")
#to find the vector of a document which is not in training data
test_data = word_tokenize("I love chatbots".lower())
v1 = model.infer_vector(test_data)
print("V1_infer", v1)

# to find most similar doc using tags
similar_doc = model.docvecs.most_similar('1')
print(similar_doc)

In [31]:
# to find vector of doc in training data using tags or in other words, printing the vector of document at index 1 in training data
model.docvecs['1']

array([ 0.00853298, -0.02657316,  0.02473178, -0.00826399, -0.01995665,
        0.01941   , -0.00422948,  0.02943846, -0.00559517,  0.0094566 ,
       -0.03073188,  0.01469939, -0.0368831 ,  0.013537  ,  0.01832891,
        0.02808696,  0.01232831, -0.02331029, -0.02963574,  0.02320767],
      dtype=float32)

# Continue this tomorrow with Govind

In [38]:
!ls objects

labels_grand.pkl      processed_X_train.pkl
processed_X_test.pkl  wcss-kmeans.pkl


In [39]:
with open('objects/wcss-kmeans.pkl', 'rb') as handle:
    wcss_grand = pickle.load(handle)
with open('objects/labels_grand.pkl','rb') as handle:
    labels_grand = pickle.load(handle)

In [37]:
wcss_grand

{'X_train_cv_wcss': [530645.972276302,
  515872.56591591635,
  506263.25096308935,
  499273.0325938495,
  494721.5635948987,
  490252.59452755644,
  485571.38124904723,
  483391.18722595525,
  480814.251332943,
  478320.6774778482],
 'X_train_tfidf_wcss': [51187.78134353667,
  50629.84828632547,
  50215.28652104986,
  49864.89596652504,
  49569.48339937552,
  49331.49470316796,
  49092.011408560604,
  48905.286083337814,
  48707.45932094955,
  48543.9464381612]}

In [40]:
labels_grand

{'X_train_cv_labels': [array([3, 0, 8, ..., 0, 7, 4], dtype=int32),
  array([ 0, 15, 13, ..., 15,  1, 12], dtype=int32),
  array([26, 16, 25, ..., 14,  2, 13], dtype=int32),
  array([ 3,  8, 19, ...,  8, 32,  2], dtype=int32),
  array([ 3,  8, 19, ...,  8, 32, 28], dtype=int32),
  array([35, 11,  2, ..., 46, 36, 59], dtype=int32),
  array([35, 11,  2, ..., 46, 36, 59], dtype=int32),
  array([15, 11,  2, ..., 46, 36, 59], dtype=int32),
  array([ 3, 54, 16, ..., 60, 67, 46], dtype=int32),
  array([ 3, 54, 90, ..., 60, 67, 46], dtype=int32)],
 'X_train_tfidf_labels': [array([8, 4, 3, ..., 4, 3, 1], dtype=int32),
  array([ 8, 14,  1, ...,  4,  1, 17], dtype=int32),
  array([ 4, 14, 21, ..., 28, 21, 11], dtype=int32),
  array([26, 21, 29, ..., 28, 20, 38], dtype=int32),
  array([49, 14, 25, ..., 15, 48, 11], dtype=int32),
  array([37, 36, 54, ...,  5, 50, 38], dtype=int32),
  array([37, 36, 54, ..., 67, 43, 38], dtype=int32),
  array([ 0,  7, 77, ..., 22, 72, 19], dtype=int32),
  array([80,

# DBSCAN
Density-based spatial clustering of applications with noise.

1. Choose a point at random
2. If that point is connected to greater than n points within d distance, it is a core sample. If not, it is classed as noise, and another point is chosen
3. Assign it, and all points within d distance as belonging to cluster x
4. Check all points just marked 0 if any are core cluseters, repeat step 3-4

In [None]:
scaler = StandardScaler()
scaler.fit()
X = scaler.transform()

In [None]:
for e in [0.05, 0.1, 0.3, 0.5, 0.75, 1, 3, 5]:
    
    # Instantiate DBSAN object
    my_dbscan = DBSCAN(eps = e, min_samples = 5)
    my_dbscan.fit(X)
    
    # Count up number of noise points
    n_noise = np.sum(my_dbscan.labels_ < 0)
    
    # Plot out the points colored by label
    plt.figure()
    plt.scatter(c = my_dbscan.labels_)
    plt.title(f'Clusters with eps={e}, {n_noise} noise points')
    plt.show()

# LDA Topic Modelling

https://medium.com/nanonets/topic-modeling-with-lsa-psla-lda-and-lda2vec-555ff65b0b05

# LDA2VEC 
A newer, deep-learning based approach