# Target Visualization - T-SNE and Doc2Vec
Source: https://www.kaggle.com/arthurtok/target-visualization-t-sne-and-doc2vec/notebook

This kernel explore different methods to visualize Near miss/ Non near miss event as the target with tSNE visualization method with two approaches: 

(1) Using terms frequencies (CountVectorizer) or Term Frequency inverse document frequencies => high dimensional space => Truncated SVD method to linear reduce dimensions => tSNE two dimensions visualization  on Laten Semantic Analysis (LSA) feature space 

(2) Word Embeddings method using Doc2VEc => tSNE two dimensions visualization 

In [1]:

# Importing the relevant libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import TfidfVectorizer

from string import punctuation

import re
from functools import reduce

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.models import ColumnDataSource
from bokeh.plotting import figure, show, output_notebook, reset_output
from bokeh.palettes import d3
import bokeh.models as bmo
from bokeh.io import save, output_file

# init_notebook_mode(connected = True)
# color = sns.color_palette("Set2")
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

pd.options.mode.chained_assignment = None
pd.options.display.max_columns = 999
pd.options.display.max_rows = 999

In [2]:
from pathlib import Path

# import data from the group_all_labelled.csv file 
df = pd.read_csv(
    '../../events/group_all_labelled.csv',
    usecols=['event_id','filename', 'group', 'sentence_text','event_text', 'Near Miss Event'])

df['label'] = df['Near Miss Event'].astype(int)
df.head()

Unnamed: 0,event_id,filename,group,sentence_text,event_text,Near Miss Event,label
0,a080918_e9_1443_annual_09_13904956_0,a080918_e9_1443_annual_09_13904956.json,0,following the completion of the hole and loggi...,following the completion of the hole and loggi...,False,0
1,a080918_e9_1443_annual_09_13904956_15,a080918_e9_1443_annual_09_13904956.json,0,photos of core c: yaringa e9_1443_annual_09.do...,mineral drillholes data 2. lithology summary a...,False,0
2,a080918_e9_1443_annual_09_13904956_18,a080918_e9_1443_annual_09_13904956.json,0,introduction the company has identified the on...,several suitable target areas were identified ...,False,0
3,a080918_e9_1443_annual_09_13904956_21,a080918_e9_1443_annual_09_13904956.json,0,parts of the adjacent coolcalalaya rift are al...,the gascoyne platform is a diamond shaped area...,False,0
4,a080918_e9_1443_annual_09_13904956_34,a080918_e9_1443_annual_09_13904956.json,0,"a recent detailed analysis of drilling, seismi...",bromine levels in the halite are high (up to 3...,True,1


In [3]:
#NLP packages 
import string
import spacy
from spacy import displacy



# 1. Data preprocessing

# NLP 

Apply standard NLP steps to process the event text from the input file, including:

* Removing stop words 

* Tokenization

* Lemmatization

In [4]:
# Use spacy packages 
nlp = spacy.load("en_core_web_lg")
        
# Creating our tokenizer function
# https://towardsdatascience.com/building-a-topic-modeling-pipeline-with-spacy-and-gensim-c5dc03ffc619
def lemmatizer(doc):
    # This takes in a doc of tokens from the NER and lemmatizes them. 
    # Pronouns (like "I" and "you" get lemmatized to '-PRON-', so I'm removing those.
    doc = [token.lemma_.lower().strip() for token in doc if token.lemma_ != '-PRON-']
    doc = u' '.join(doc)
    return nlp.make_doc(doc)
    
def remove_stopwords(doc):
    # This will remove stopwords and punctuation.
    # Use token.text to return strings, which we'll need for Gensim.
    doc = [token.text for token in doc if token.is_stop != True and token.is_punct != True]
    return doc

# create our language pipeline
nlp = spacy.load("en_core_web_lg")
nlp.add_pipe(lemmatizer,name='lemmatizer')  # lemmatizer
nlp.add_pipe(remove_stopwords, name="stopwords", last=True)  # remove stopwords + punctuation, return textt

In [5]:
# Apply Spacy functions
df['tokens'] = list(nlp.pipe(df.event_text.values, batch_size=100))  # adjust
df['clean_text'] = df['tokens'].map(lambda tokens: " ".join(tokens))

In [6]:
df.head()

Unnamed: 0,event_id,filename,group,sentence_text,event_text,Near Miss Event,label,tokens,clean_text
0,a080918_e9_1443_annual_09_13904956_0,a080918_e9_1443_annual_09_13904956.json,0,following the completion of the hole and loggi...,following the completion of the hole and loggi...,False,0,"[follow, completion, hole, logging, core, lack...",follow completion hole logging core lack evapo...
1,a080918_e9_1443_annual_09_13904956_15,a080918_e9_1443_annual_09_13904956.json,0,photos of core c: yaringa e9_1443_annual_09.do...,mineral drillholes data 2. lithology summary a...,False,0,"[mineral, drillholes, data, 2, lithology, summ...",mineral drillholes data 2 lithology summary ap...
2,a080918_e9_1443_annual_09_13904956_18,a080918_e9_1443_annual_09_13904956.json,0,introduction the company has identified the on...,several suitable target areas were identified ...,False,0,"[suitable, target, area, identify, area, apply...",suitable target area identify area apply explo...
3,a080918_e9_1443_annual_09_13904956_21,a080918_e9_1443_annual_09_13904956.json,0,parts of the adjacent coolcalalaya rift are al...,the gascoyne platform is a diamond shaped area...,False,0,"[gascoyne, platform, diamond, shape, area, cov...","gascoyne platform diamond shape area cover 86,..."
4,a080918_e9_1443_annual_09_13904956_34,a080918_e9_1443_annual_09_13904956.json,0,"a recent detailed analysis of drilling, seismi...",bromine levels in the halite are high (up to 3...,True,1,"[bromine, level, halite, high, 330ppm, suggest...",bromine level halite high 330ppm suggest preci...


# 2. T-SNE applied to Latent Semantic (LSA) space


To start off we look at the sparse representation of text documents via the Term frequency Inverse document frequency method. What this does is create a matrix representation that upweights locally prevalent but globally rare terms - therefore accounting for the occurence bias when using just term frequencies

In [7]:
tf_idf_vec = TfidfVectorizer(min_df=3,
                             max_features = 60_000, #100_000,
                             analyzer="word",
                             ngram_range=(1,3), # (1,6)
                             stop_words="english")

# fit and transform on all events
tf_idf = tf_idf_vec.fit_transform(list(df['clean_text']))



In [8]:
tf_idf

<1671x12511 sparse matrix of type '<class 'numpy.float64'>'
	with 117946 stored elements in Compressed Sparse Row format>

In [9]:
# Applying the Singular value decomposition to lower dimensionality to from 12511 to 50
from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=50, random_state=2018)
svd_tfidf = svd.fit_transform(tf_idf)
print("Dimensionality of LSA space: {}".format(svd_tfidf.shape))
svd_tfidf

Dimensionality of LSA space: (1671, 50)


array([[ 0.1302212 ,  0.0029069 ,  0.00478156, ..., -0.00636913,
        -0.02309202, -0.05838043],
       [ 0.13456788,  0.02247563, -0.1093503 , ...,  0.04276429,
        -0.00907207,  0.04387731],
       [ 0.15944579,  0.00995639, -0.12532973, ...,  0.03330208,
         0.00998658,  0.03126847],
       ...,
       [ 0.12427468,  0.02045139, -0.00677984, ...,  0.04631342,
        -0.03826369,  0.06202897],
       [ 0.11995654,  0.01635003, -0.1626503 , ...,  0.02464767,
        -0.03877046,  0.0249457 ],
       [ 0.21709182, -0.12456673,  0.01016909, ...,  0.00949173,
        -0.00593389, -0.00579167]])

In [10]:
from sklearn.manifold import TSNE

# Importing multicore version of TSNE
#from MulticoreTSNE import MulticoreTSNE as TSNE

In [11]:
tsne_model = TSNE(n_jobs=4,
                  perplexity = 20, #try different perplexity parameters
                  early_exaggeration=4, # Trying out exaggeration trick
                  n_components=2,
                  verbose=1,
                  random_state=2018,
                  n_iter=500)

tsne_tfidf = tsne_model.fit_transform(svd_tfidf)

# Putting the tsne information into a dataframe
tsne_tfidf_df = pd.DataFrame(data=tsne_tfidf, columns=["x", "y"])

# add X values to full df
for col in ['event_id','filename', 'sentence_text', 'event_text', 'label' ]:
    tsne_tfidf_df[col] = df[col].values

# add X values to subsetted df (i.e. for only one GROUP)
# for col in ['filename', 'sentence_text', 'event_text', 'Label', 'group']:
#     tsne_tfidf_df[col] = X[X.loc[X.group == GROUP, col].values

[t-SNE] Computing 61 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.018s...
[t-SNE] Computed neighbors for 1671 samples in 0.122s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 0.121407
[t-SNE] KL divergence after 250 iterations with early exaggeration: 19.850721
[t-SNE] KL divergence after 500 iterations: 1.490429


In [12]:
tsne_tfidf_df

Unnamed: 0,x,y,event_id,filename,sentence_text,event_text,label
0,-0.641562,20.649637,a080918_e9_1443_annual_09_13904956_0,a080918_e9_1443_annual_09_13904956.json,following the completion of the hole and loggi...,following the completion of the hole and loggi...,0
1,-12.595753,10.188814,a080918_e9_1443_annual_09_13904956_15,a080918_e9_1443_annual_09_13904956.json,photos of core c: yaringa e9_1443_annual_09.do...,mineral drillholes data 2. lithology summary a...,0
2,-12.906935,10.144623,a080918_e9_1443_annual_09_13904956_18,a080918_e9_1443_annual_09_13904956.json,introduction the company has identified the on...,several suitable target areas were identified ...,0
3,-20.562954,11.792826,a080918_e9_1443_annual_09_13904956_21,a080918_e9_1443_annual_09_13904956.json,parts of the adjacent coolcalalaya rift are al...,the gascoyne platform is a diamond shaped area...,0
4,-11.966096,10.312630,a080918_e9_1443_annual_09_13904956_34,a080918_e9_1443_annual_09_13904956.json,"a recent detailed analysis of drilling, seismi...",bromine levels in the halite are high (up to 3...,1
...,...,...,...,...,...,...,...
1666,47.653603,-14.015414,a075860_daltons e45-2186 & 2187 annual tech re...,a075860_daltons e45-2186 & 2187 annual tech re...,"hole rbdn002 from 6 18 metres, 12 metres @ 0.3...","the wadi prospect, 100 metres south of the kin...",0
1667,46.533997,0.684765,a075860_daltons e45-2186 & 2187 annual tech re...,a075860_daltons e45-2186 & 2187 annual tech re...,mineralisation in rddn029 is hosted by metased...,diamond drilling was focussed on testing down ...,1
1668,47.700855,0.719766,a075860_daltons e45-2186 & 2187 annual tech re...,a075860_daltons e45-2186 & 2187 annual tech re...,falconbridge completed a program of regional s...,no significant assays were received from this ...,0
1669,-10.917240,36.835827,a075860_daltons e45-2186 & 2187 annual tech re...,a075860_daltons e45-2186 & 2187 annual tech re...,"references ferguson, k m and ruddock, i, 2001 ...",the data has been lodged with doir airborne ge...,0


In [13]:
output_notebook()

# colormap = np.array(["#6d8dca", "#d07d3c"])

# we need a list of length 7 becasue charlie labelled group 6 instead of 5 lol
colormap = np.array(["darkblue", "red", "purple", "green", "orange", "yellow", "yellow"])

# palette = d3["Category10"][len(tsne_tfidf_df["asset_name"].unique())]
source = ColumnDataSource(data = dict(x = tsne_tfidf_df["x"], 
                                      y = tsne_tfidf_df["y"],
                                      color = colormap[tsne_tfidf_df["label"]],
                                      sentence_text = tsne_tfidf_df["sentence_text"],
                                      event_text = tsne_tfidf_df["event_text"],
                                      filename = tsne_tfidf_df["filename"],
                                      event_id = tsne_tfidf_df["event_id"],
                                      Label = tsne_tfidf_df["label"]))
TOOLTIPS = [
    ("event_id","@event_id"),
    ("filename", "@filename"),
    ("event_text", "@event_text"),
    ("Label","@Label"), 
    ("group", "@group")
]


plot_tfidf = bp.figure(plot_width = 800, plot_height = 700, tooltips=TOOLTIPS,
                       title = "T-SNE applied to Tfidf_SVD space")

plot_tfidf.scatter(x = "x", 
                   y = "y", 
                   color="color",
                   legend = "Label",
                   source = source,
                   alpha = 0.7,
                   radius = 0.4)

show(plot_tfidf)

In [14]:
output_notebook()

# colormap = np.array(["#6d8dca", "#d07d3c"])

# we need a list of length 7 becasue charlie labelled group 6 instead of 5 lol
colormap = np.array(["darkblue", "red", "purple", "green", "orange", "yellow", "yellow"])

# palette = d3["Category10"][len(tsne_tfidf_df["asset_name"].unique())]
source = ColumnDataSource(data = dict(x = tsne_tfidf_df["x"], 
                                      y = tsne_tfidf_df["y"],
                                      color = colormap[tsne_tfidf_df["label"]],
                                      sentence_text = tsne_tfidf_df["sentence_text"],
                                      event_text = tsne_tfidf_df["event_text"],
                                      event_id = tsne_tfidf_df["event_id"],
                                      filename = tsne_tfidf_df["filename"],
                                      Label = tsne_tfidf_df["label"]))
TOOLTIPS = [
     ("event_id","@event_id"),
    ("filename", "@filename"),
#    ("sentence_text", "@sentence_text"),  # show centre sentence of text chunk
    ("event_text", "@event_text"), # show full text chunk
    ("Label","@Label"),
   
]


plot_tfidf = bp.figure(plot_width = 800, plot_height = 700, tooltips=TOOLTIPS,
                       title = "T-SNE applied to Tfidf_SVD space")

plot_tfidf.scatter(x = "x", 
                   y = "y", 
                   color="color",
                   legend = "label",
                   source = source,
                   alpha = 0.7,
                   radius = 0.35)  # adjust scatter point size

show(plot_tfidf)

# 3. T-SNE applied on Doc2Vec embedding


In [15]:
# train model for Doc2Vec embedding
from gensim.models.doc2vec import Doc2Vec, TaggedDocument


def get_doc2vec(model, df):
    vecs = []
    for idx in df.index:
        try:
            vec = model.docvecs[idx]
        except:
            vec = np.nan
        vecs.append(vec)
    return vecs

model_path = 'doc2vec.model'

# Storing the question texts in a list
event_texts = df.clean_text.tolist()

# Creating a list of terms and a list of labels to go with it
documents = [TaggedDocument(doc, tags=[str(i)]) for i, doc in enumerate(event_texts)]
max_epochs = 100

    #Note: dm defines the training algorithm.
    # If dm=1 means ‘distributed memory’ (PV-DM) and dm =0 means ‘distributed bag of words’ (PV-DBOW).
model = Doc2Vec(vector_size=50, alpha=0.025, min_alpha=0.00025, min_count=2, dm=1, epochs=max_epochs, workers=4)
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)

In [16]:
# Creating and fitting the tsne model to the document embeddings
tsne_model = TSNE(n_jobs=4,
                perplexity=50, #try to tweak perplexity to get better visualization
                  early_exaggeration=4,
                  n_components=2,
                  verbose=1,
                  random_state=2018,
                  n_iter=300)

# fit alll
#tsne_d2v = tsne_model.fit_transform(model.docvecs.vectors_docs)

tsne_d2v = tsne_model.fit_transform(model.docvecs.vectors_docs)

# Putting the tsne information into sq
tsne_d2v_df = pd.DataFrame(data=tsne_d2v, columns=["x", "y"])

# add X values to full df
for col in ['event_id','filename', 'sentence_text', 'event_text', 'label']:
    tsne_d2v_df[col] = df[col].values

[t-SNE] Computing 151 nearest neighbors...
[t-SNE] Indexed 1671 samples in 0.012s...
[t-SNE] Computed neighbors for 1671 samples in 0.120s...
[t-SNE] Computed conditional probabilities for sample 1000 / 1671
[t-SNE] Computed conditional probabilities for sample 1671 / 1671
[t-SNE] Mean sigma: 1.909270
[t-SNE] KL divergence after 250 iterations with early exaggeration: 18.202614
[t-SNE] KL divergence after 300 iterations: 2.202189


In [17]:
output_notebook()

# colormap = np.array(["#6d8dca", "#d07d3c"])
colormap = np.array(["darkblue", "red", "purple", "green", "orange", "yellow", "yellow"])

# palette = d3["Category10"][len(tsne_tfidf_df["asset_name"].unique())]
source = ColumnDataSource(data = dict(x = tsne_d2v_df["x"], 
                                      y = tsne_d2v_df["y"],
                                      color = colormap[tsne_d2v_df["label"]],
                                      event_text = tsne_d2v_df["event_text"],
                                      sentence_text = tsne_d2v_df['sentence_text'],
                                      event_id = tsne_d2v_df["event_id"],
                                      filename = tsne_d2v_df["filename"],
                                      Label = tsne_d2v_df["label"]))

TOOLTIPS = [
     ("event_id","@event_id"),
    ("filename", "@filename"),
    ("sentence_text", "@sentence_text"),
    ("event_text", "@event_text"),
    ("Label","@Label"),
   
]

plot_d2v = bp.figure(plot_width = 800, plot_height = 700, tooltips=TOOLTIPS,
                       title = "T-SNE applied to Doc2vec document embeddings")

plot_d2v.scatter(x = "x", 
                   y = "y", 
                   color="color",
                   legend = "label",
                   source = source,
                   alpha = 0.7,
                   radius = 0.15)

show(plot_d2v)

Takeaways from the plots

In both plots, there are a lot of overlapping between the Near Miss and Non Near Miss datapoints even with different perplexity for tSNE model. 

