In [1]:
import pandas as pd
import numpy as np
from collections import Counter

In [2]:
from time import time

In [3]:
df = pd.read_csv('../data/clean/Solr_dsi_final_v1_words.csv')

In [4]:
df = df.dropna(subset =['content'])

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 87458 entries, 0 to 87457
Data columns (total 13 columns):
Unnamed: 0    87458 non-null int64
_version_     87458 non-null int64
cache         952 non-null object
segment       87458 non-null int64
digest        87458 non-null object
tstamp        87458 non-null object
url           87458 non-null object
anchor        20496 non-null object
content       87458 non-null object
id            87458 non-null object
title         86449 non-null object
boost         87458 non-null float64
words         87458 non-null object
dtypes: float64(1), int64(3), object(9)
memory usage: 9.3+ MB


In [6]:
dsi_stop_words = ['the', 'blog', 'i', 'in', 'new', 'use', 'a', 'how', 'it', 'like', 'need', 'sign', 'for', 
                  'rss', 'videos', 'view', 'using', 'interview', 'follow', 'read', 'make', 'video',
                  'post', 'comment', 'comments', 'subscribe', 'things', 'just', 'add', 'wise', 'know', 'upcoming', 
                  'people', 'practitioner', 'used', 'developers', 'events', 'companies', 'better', 'terms', 'time',
                  'customer', 'conference', 'tags', 'contact', 'years', 'type', 'value', 'march', 'best', 'does',
                 'live', 'cases', 'way', 'privacy']

In [7]:
from sklearn.datasets import fetch_20newsgroups

from nltk import word_tokenize
from nltk import download
from nltk.corpus import stopwords

In [8]:
stop_words = stopwords.words('english')

In [13]:
import re
def preprocess(text):
    s = text.split()
    text_s = [re.sub('[^a-zA-Z]+', '', t) for t in s]
    text = ' '.join(text_s)
    text = text.lower()
    #print len(text), text
    doc = word_tokenize(text)
    doc = [word for word in doc if word not in stop_words]
    doc = [word for word in doc if word.isalpha()]
    return doc

In [10]:
#samples = df[:100]
texts = df['content']

In [11]:
y = np.zeros(len(texts))

In [14]:
corpus = [preprocess(text) for text in texts]

In [49]:
import json
with open('dsi_corpus.json', 'w') as fp:
    json.dump(corpus, fp)

In [15]:
def filter_docs(corpus, texts, labels, condition_on_doc):
    """
    Filter corpus, texts and labels given the function condition_on_doc which takes
    a doc.
    The document doc is kept if condition_on_doc(doc) is true.
    """
    number_of_docs = len(corpus)

    if texts is not None:
        texts = [text for (text, doc) in zip(texts, corpus)
                 if condition_on_doc(doc)]

    labels = [i for (i, doc) in zip(labels, corpus) if condition_on_doc(doc)]
    corpus = [doc for doc in corpus if condition_on_doc(doc)]

    print("{} docs removed".format(number_of_docs - len(corpus)))

    return (corpus, texts, labels)

In [16]:
corpus, texts, y = filter_docs(corpus, texts, y, lambda doc: (len(doc) != 0))

32 docs removed


In [24]:
sims = {'dsi': {}}

In [None]:
#First method: Latent Semantic Indexing

In [None]:
import numpy as np
from gensim import corpora
from gensim.models import TfidfModel
from gensim.models import LsiModel
from gensim.similarities import MatrixSimilarity

t0 =time()
dictionary = corpora.Dictionary(corpus)
corpus_gensim = [dictionary.doc2bow(doc) for doc in corpus]
tfidf = TfidfModel(corpus_gensim)
corpus_tfidf = tfidf[corpus_gensim]
lsi = LsiModel(corpus_tfidf, id2word=dictionary, num_topics=200)
lsi_index = MatrixSimilarity(lsi[corpus_tfidf])
sims['dsi']['LSI'] = np.array([lsi_index[lsi[corpus_tfidf[i]]]
                                for i in range(len(corpus))])
time() - t0

In [None]:
#Second method: centroid of the word vectors

In [17]:
from gensim.models import Word2Vec, KeyedVectors
word2vec_model = KeyedVectors.load_word2vec_format('../Words_Embedding/dsi_model_format_v1.bin', binary=True)
#word2vec_model = Word2Vec.load_word2vec_format(filename, binary=True)

In [104]:
word2vec_model['data']

array([  4.24171844e-08,   1.74701971e-04,   4.05572109e-08,
         1.07802842e-08,   4.00463586e-11,   1.76642905e-04,
         9.10765213e-12,   2.68017786e-09,   2.64071787e-06,
         4.00578459e-11,   1.65965361e-07,   9.10565026e-12,
         1.06042979e-08,   6.74993601e-07,   6.33692210e-10,
         6.41068698e-10,   4.00463655e-11,   1.70887273e-04,
         5.82776050e-10,   1.72484505e-07,   1.50097570e-19,
         4.05563476e-08,   1.07213936e-08,   4.00463829e-11,
         2.57859956e-09,   5.82860982e-10,   6.70172251e-10,
         1.52632103e-19,   4.10202006e-08,   1.04303579e-08,
         4.00578459e-11,   1.74705638e-04,   2.33138686e-09,
         2.63669220e-09,   1.49253859e-19,   6.33721353e-10,
         2.70032297e-06,   6.29827801e-10,   2.66573918e-09,
         1.50947964e-19,   6.82400241e-07,   6.75081083e-07,
         4.02852196e-11,   1.05628715e-05,   9.10720197e-12,
         1.73740787e-04,   4.36820046e-05,   4.00578459e-11,
         4.39195028e-05,

In [19]:
word2vec_model.init_sims(replace=True)

  self.syn0[i, :] /= sqrt((self.syn0[i, :] ** 2).sum(-1))


In [20]:
def document_vector(word2vec_model, doc):
    # remove out-of-vocabulary words
    doc = [word for word in doc if word in word2vec_model.vocab]
    return np.mean(word2vec_model[doc], axis=0)

In [21]:
def has_vector_representation(word2vec_model, doc):
    """check if at least one word of the document is in the
    word2vec dictionary"""
    return not all(word not in word2vec_model.vocab for word in doc)

In [22]:
t0 = time()
corpus, texts, y = filter_docs(corpus, texts, y,
                               lambda doc: has_vector_representation(word2vec_model, doc))

5071 docs removed


In [25]:
from sklearn.metrics.pairwise import cosine_similarity
sims['dsi']['centroid'] = cosine_similarity(np.array([document_vector(word2vec_model, doc)
                                                       for doc in corpus]))
time() - t0

384.0005340576172

In [29]:
len(sims['dsi']['centroid'])

82355

In [None]:
#Third method: Word Mover's Distance (WMD)

In [None]:
from sklearn.metrics.pairwise import pairwise_distances

A = np.array([[i] for i in range(len(snippets))])

def f(x, y):
    return word2vec_model.wmdistance(snippets[int(x)], snippets[int(y)])

X_wmd_distance_snippets = pairwise_distances(A, metric=f, n_jobs=-1)

In [30]:
def most_similar(i, X_sims, topn=None):
    """return the indices of the topn most similar documents with document i
    given the similarity matrix X_sims"""

    r = np.argsort(X_sims[i])[::-1]
    if r is None:
        return r
    else:
        return r[:topn]

In [44]:
ret = most_similar(15200, sims['dsi']['centroid'], 20)

In [41]:
k = 15200
print df.iloc[k]['title']

Data Science 101: Deep Learning - Theory and Applications - insideBIGDATA


In [45]:
for j in ret:
    print df.iloc[j]['title']

Data Science 101: Deep Learning - Theory and Applications - insideBIGDATA
Quantum Computing
How You Can Use Big Data in Your Small Business | SmartData Collective
client reference | The Big Data Hub
About | IoTivity
Data Surveillance and Prediction - Wizsoft
Telemarketing - Call Centers - Wizsoft
Kumar Vishwas Poem on Pagli Ladki » iShayari
government | FlowingData
Undergraduate Study - School of Operations Research and Information Engineering - Cornell Engineerin
Cisco: White Papers & Case Studies by Cisco - Brand Republic
Legal Information - Wizsoft
Sign in -- Sage
How I Cut The Cable (And How You Can Too!)
AlterWind Log Analyzers comparison. Reports for search engine optimization, web site promotion, web 
Origin | Just another WordPress.com weblog
Maximum Flow implementation on Spark GraphX and Raspberry Pi Spark cluster demo - Data Science & Bus
horizon graph | FlowingData
Comments on: insideBIGDATA Guide to the Retail Industry: Sponsored by Dell and Intel
Database Solutions for Ri

In [51]:
for j in ret:
    print df.iloc[j]['content']
    print '--------------'

Data Science 101: Deep Learning - Theory and Applications - insideBIGDATA About Advertise Contact Search: News Companies Topics Big Data Cloud Analytics Data Science Data Storage Hadoop Infrastructure Machine Learning HPC Visualization White Papers Industry Segments Academic Energy Entertainment Financial Government Healthcare Life Sciences Manufacturing Media Retail Travel Special Sections inside Hadoop inside SPARK Data Science 101 Interviews Use Cases Field Reports Visualization of the Week Big Data Humor Book Review Resources Events Events Calendar Industry Perspectives Jobs Board Job Postings Podcast Research / Reports Video Special Reports Sign up for our newsletter and get the latest big data news and analysis. Email Address Home » Topics » Data Science » Data Science 101: Deep Learning – Theory and Applications Data Science 101: Deep Learning – Theory and Applications December 22, 2015 by Daniel Gutierrez Leave a Comment Deep Learning is a hot topic in statistical learning and 