***Experiments with  NLP***

In [1]:
#function to hide cells
from IPython.display import HTML
import random

def hide_toggle(for_next=False):
    this_cell = """$('div.cell.code_cell.rendered.selected')"""
    next_cell = this_cell + '.next()'

    toggle_text = 'Toggle show/hide'  # text shown on toggle link
    target_cell = this_cell  # target cell to control with toggle
    js_hide_current = ''  # bit of JS to permanently hide code in current cell (only when toggling next cell)

    if for_next:
        target_cell = next_cell
        toggle_text += ' next cell'
        js_hide_current = this_cell + '.find("div.input").hide();'

    js_f_name = 'code_toggle_{}'.format(str(random.randint(1,2**64)))

    html = """
        <script>
            function {f_name}() {{
                {cell_selector}.find('div.input').toggle();
            }}

            {js_hide_current}
        </script>

        <a href="javascript:{f_name}()">{toggle_text}</a>
    """.format(
        f_name=js_f_name,
        cell_selector=target_cell,
        js_hide_current=js_hide_current, 
        toggle_text=toggle_text
    )

    return HTML(html)

hide_toggle()

***Importing:***

In [2]:
import warnings

warnings.filterwarnings('ignore')

hide_toggle()

In [3]:
import numpy as np
import pandas as pd
import gzip
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from collections import Counter
import string

import nltk
from nltk import *
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist
import re

from gensim import models, corpora

#import polyglot
#from polyglot.text import Text as T
nltk.download()
hide_toggle()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


***Uploading necesary data:   
in this case it's Amazon users reviews***

In [4]:
path = 'qa_Appliances.json.gz'


def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

df = getDF('qa_Appliances.json.gz')

#data_answers = df['answer']
data_answers = df['answer'].apply(lambda x: x.lower())
#data_answers = str(data_answers)
data_answers

hide_toggle()

In [5]:
data_answers.head(5)

0     i replaced my old one with this without a hitch.
1    this may help insinkerator model badger-1: bad...
2    plumbing connections will vary with different ...
3    it does not come with a power cord. it does co...
4    check if you dropped something inside.usually ...
Name: answer, dtype: object

***Preprocessing:***

In [6]:
#cleaning and processing

#cleaning from punctuation
def replace_punctuation(x):  # From S.Lott's solution
    for c in string.punctuation:
        x=x.replace(c,"")
    return x

#removing numbers
def clean_numbers(x):
    x = str(x)
    re.sub(r'\d+', '', x)
    return x

hide_toggle()

***Word frequency (top N words):***

In [7]:

#top N words     всего текста
def top_N_words(N):
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf.fit(data_answers)
    X = tfidf.transform(data_answers)
    idx = np.ravel(X.sum(axis=0).argsort(axis=1))[::-1][:N]
    top_n_words = np.array(tfidf.get_feature_names())[idx].tolist()
    return top_n_words


hide_toggle()

In [8]:
top_N_words(10)

['yes',
 'model',
 'fit',
 'don',
 'sure',
 'does',
 'know',
 'just',
 'work',
 'filter']

***Sentiment analysis:***

In [9]:
#sentiment analysis
#for english version we'll use vader sentiment analyzer
#TODO: make russian language classifier

def emotions(data_answers):
    data_answers = str(data_answers)
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
    sid = SentimentIntensityAnalyzer()
    sentences = tokenize.sent_tokenize(data_answers)
    scores = dict([('pos', 0), ('neu', 0), ('neg', 0), ('compound', 0)])
    for sentence in sentences:
        ss = sid.polarity_scores(sentence)
        for k in sorted(ss):
            scores[k] += ss[k]

    return scores 


hide_toggle()

In [10]:
emotions(data_answers)

{'compound': 6.4398,
 'neg': 0.21400000000000002,
 'neu': 20.912999999999997,
 'pos': 2.8729999999999998}

***Topic modelling:***      (тест)

***first option***

In [11]:
#LDA gensim version-2
import re
from gensim import models, corpora
from nltk import word_tokenize
from nltk.corpus import stopwords
 
NUM_TOPICS = 10
STOPWORDS = stopwords.words('english')
texts = str(data_answers)
#texts = text.split()
    
def clean_text(texts):
    tokenized_text = word_tokenize(texts.lower())
    cleaned_text = [t for t in tokenized_text if t not in STOPWORDS and re.match('[a-zA-Z\-][a-zA-Z\-]{2,}', t)]
    return cleaned_text
 
# For gensim we need to tokenize the data and filter out stopwords
tokenized_data = []
for text in texts:
    tokenized_data.append(clean_text(texts))
 
 
# Build a Dictionary - association word to numeric id
dictionary = corpora.Dictionary(tokenized_data)
 
# Transform the collection of texts to a numerical form
corpus = [dictionary.doc2bow(text) for text in tokenized_data]
 
# Have a look at how the 20th document looks like: [(word_id, count), ...]
print(corpus[20])
# [(12, 3), (14, 1), (21, 1), (25, 5), (30, 2), (31, 5), (33, 1), (42, 1), (43, 2),  ...
 
# Build the LDA model
lda_model = models.LdaModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)
 
# Build the LSI model
lsi_model = models.LsiModel(corpus=corpus, num_topics=NUM_TOPICS, id2word=dictionary)

print("LDA Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lda_model.print_topic(idx, 10))
 
print("=" * 20)
 
print("LSI Model:")
 
for idx in range(NUM_TOPICS):
    # Print the first 10 most representative topics
    print("Topic #%s:" % idx, lsi_model.print_topic(idx, 10))
 
print("=" * 20)

hide_toggle()

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 4), (16, 2), (17, 2), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 2), (42, 1), (43, 1), (44, 1), (45, 3), (46, 1), (47, 1), (48, 1), (49, 1), (50, 7), (51, 1), (52, 1), (53, 2), (54, 1), (55, 1), (56, 3), (57, 1), (58, 1), (59, 1), (60, 2), (61, 1), (62, 4), (63, 1), (64, 1), (65, 1), (66, 1), (67, 1), (68, 1), (69, 1), (70, 1), (71, 2), (72, 1), (73, 1), (74, 2), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 2), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 1), (88, 2), (89, 1), (90, 2), (91, 1), (92, 1), (93, 1), (94, 3), (95, 1), (96, 1), (97, 1), (98, 3), (99, 2), (100, 3), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 2), (109, 1), (110, 1),

***second option***

In [12]:
#LDA scikit-learn version-2

from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
 
NUM_TOPICS = 10
 
vectorizer = CountVectorizer(min_df=5, max_df=0.9, 
                             stop_words='english', lowercase=True, 
                             token_pattern='[a-zA-Z\-][a-zA-Z\-]{2,}')
data_vectorized = vectorizer.fit_transform(data_answers)
 
# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
lda_Z = lda_model.fit_transform(data_vectorized)
print(lda_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS)
nmf_Z = nmf_model.fit_transform(data_vectorized)
print(nmf_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 
# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS)
lsi_Z = lsi_model.fit_transform(data_vectorized)
print(lsi_Z.shape)  # (NO_DOCUMENTS, NO_TOPICS)
 

#Let's see how the first document in the corpus looks like in different topic spaces
print(lda_Z[0])
print(nmf_Z[0])
print(lsi_Z[0])


def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])

print("LDA Model:")
print_topics(lda_model, vectorizer)
print("=" * 20)
 
print("NMF Model:")
print_topics(nmf_model, vectorizer)
print("=" * 20)
 
print("LSI Model:")
print_topics(lsi_model, vectorizer)
print("=" * 20)

hide_toggle()

(9011, 10)
(9011, 10)
(9011, 10)
[0.03333333 0.03333333 0.69996635 0.03334243 0.03333913 0.03333965
 0.03333333 0.03333333 0.03334576 0.03333333]
[0.         0.         0.00236966 0.00025967 0.00204367 0.01964022
 0.         0.         0.         0.        ]
[ 0.08661742  0.05041148 -0.01188362 -0.0360447  -0.01982981 -0.04693822
 -0.07996014 -0.00176064 -0.05488262 -0.04362621]
LDA Model:
Topic 0:
[('filter', 746.9667512373255), ('filters', 265.7308945843168), ('believe', 135.03317548291577), ('control', 121.47156533835798), ('samsung', 117.23239881946252), ('according', 110.69609439553089), ('box', 105.1831179646594), ('said', 91.98184090177467), ('compatible', 80.62077922967401), ('motor', 79.27429147440812)]
Topic 1:
[('ice', 531.2417007609596), ('water', 283.26783216732696), ('refrigerator', 261.095153926034), ('fridge', 233.91848745826795), ('kenmore', 229.43442997523476), ('inside', 211.36366129671143), ('freezer', 204.3085074332961), ('make', 202.01610440396672), ('work', 191.5

***LDA visualizing:***

In [13]:
import pyLDAvis.sklearn
 
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda_model, data_vectorized, vectorizer, mds='tsne')
panel

***lda2vec***