# Topic Modelling- News articles related to Toyota

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os
%matplotlib inline

In [2]:
directory = os.getcwd()
file = 'news_toyota.pkl'
path = directory + file

In [4]:
news = pd.read_pickle(directory+file)

In [5]:
news.count()

crawled     100
language    100
text        100
title       100
dtype: int64

In [6]:
news.head()

Unnamed: 0,crawled,language,text,title
0,2018-02-02T04:24:51.072+02:00,english,QR Code Link to This Post All maintenance rece...,Dependable truck 03 Toyota Tacoma Double Cab $...
1,2018-02-02T04:27:15.000+02:00,english,0 \nNEW YORK: Automakers reported mixed US car...,US car sales mixed in January; trucks stay strong
2,2018-02-02T04:34:00.008+02:00,english,transmission: automatic 2005 Toyota Camry LE...,2005 TOYOTA CAMRY LE 167300 MILEAGE $2450 (TAL...
3,2018-02-02T04:36:42.006+02:00,english,favorite this post Brand New Toyota Avalon Flo...,Brand New Toyota Avalon Floor Mats (New Britai...
4,2018-02-02T04:38:24.018+02:00,english,more ads by this user QR Code Link to This Pos...,2016 Lexus ES 350 (Coliseum Lexus of Oakland) ...


In [7]:
news.describe(include='all')

Unnamed: 0,crawled,language,text,title
count,100,100,100,100
unique,100,1,96,77
top,2018-02-02T08:42:27.000+02:00,english,"Asian shares lower as investors mull earnings,...","Asian shares lower as investors mull earnings,..."
freq,1,100,4,21


In [9]:
#of english articles
len(news[news['language']=='english'])

100

In [10]:
#some english text
news[news['language']=='english']['text']

0     QR Code Link to This Post All maintenance rece...
1     0 \nNEW YORK: Automakers reported mixed US car...
2     transmission: automatic   2005 Toyota Camry LE...
3     favorite this post Brand New Toyota Avalon Flo...
4     more ads by this user QR Code Link to This Pos...
                            ...                        
95    Solid US jobs figures leave downbeat markets u...
96    Hybrids – Vehicles, Battery & Hydraulic Techno...
97      Toyota Recalls Certain Prius and Lexus RX, N...
98      Toyota Spotlights Emerging and Award-Winning...
99      New Toyota Off-Road Rigs, All-New Avalon and...
Name: text, Length: 100, dtype: object

In [11]:
#filter non english text
news=news[news['language']=='english']

In [12]:
len(news)

100

In [13]:
#create a new column wit clean text (remove special characters)
news['text_clean'] = news['text'].map(lambda x: re.sub('[^a-zA-Z0-9 @ . , : - _]', '', str(x)))

In [14]:
pd.set_option('display.max_colwidth', 100)
news[['text', 'text_clean']].head(5)

Unnamed: 0,text,text_clean
0,"QR Code Link to This Post All maintenance receipts available, one owner truck. Cash sale. No tra...","QR Code Link to This Post All maintenance receipts available, one owner truck. Cash sale. No tra..."
1,"0 \nNEW YORK: Automakers reported mixed US car sales in January, with strong demand for SUVs and...","0 NEW YORK: Automakers reported mixed US car sales in January, with strong demand for SUVs and p..."
2,transmission: automatic 2005 Toyota Camry LE 4 door 4 cyl AUTOMATIC VERY CLEAN INSIDE CLOTH IN...,transmission: automatic 2005 Toyota Camry LE 4 door 4 cyl AUTOMATIC VERY CLEAN INSIDE CLOTH IN...
3,favorite this post Brand New Toyota Avalon Floor Mats - $115 (New Britain) hide this posting unh...,favorite this post Brand New Toyota Avalon Floor Mats 115 New Britain hide this posting unhide ...
4,more ads by this user QR Code Link to This Post Black w/Piano Black w/Perforated NuLuxe Seat Tri...,more ads by this user QR Code Link to This Post Black wPiano Black wPerforated NuLuxe Seat Trim....


In [15]:
news['length']=news['text_clean'].apply(len)

In [16]:
news[['text','text_clean','length']].head()

Unnamed: 0,text,text_clean,length
0,"QR Code Link to This Post All maintenance receipts available, one owner truck. Cash sale. No tra...","QR Code Link to This Post All maintenance receipts available, one owner truck. Cash sale. No tra...",113
1,"0 \nNEW YORK: Automakers reported mixed US car sales in January, with strong demand for SUVs and...","0 NEW YORK: Automakers reported mixed US car sales in January, with strong demand for SUVs and p...",2598
2,transmission: automatic 2005 Toyota Camry LE 4 door 4 cyl AUTOMATIC VERY CLEAN INSIDE CLOTH IN...,transmission: automatic 2005 Toyota Camry LE 4 door 4 cyl AUTOMATIC VERY CLEAN INSIDE CLOTH IN...,213
3,favorite this post Brand New Toyota Avalon Floor Mats - $115 (New Britain) hide this posting unh...,favorite this post Brand New Toyota Avalon Floor Mats 115 New Britain hide this posting unhide ...,498
4,more ads by this user QR Code Link to This Post Black w/Piano Black w/Perforated NuLuxe Seat Tri...,more ads by this user QR Code Link to This Post Black wPiano Black wPerforated NuLuxe Seat Trim....,556


In [17]:
news['length'].describe()

count      100.000000
mean      2300.490000
std       1980.981756
min        113.000000
25%        678.000000
50%       2265.000000
75%       2988.500000
max      11508.000000
Name: length, dtype: float64

News articles are mostly 2300 characters long- these are the news articles without special characters

# Clean the articles further

Remove stopwords, punctuations and lemmatize the text for further analysis

In [18]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
lemma = WordNetLemmatizer()

In [19]:
#function to normalize the text
def clean_text(text):
    no_punc_text=[i for i in text if i not in string.punctuation] #remove punctuations
    no_punc_text=''.join(no_punc_text)
    no_stop_text=[i for i in no_punc_text.split() if i.lower() not in stopwords.words('english')] #remove stopwords
    #no_stop_text=' '.join(no_punc_text)
    normalized =  ' '.join(lemma.lemmatize(word) for word in no_stop_text)#lemmatize
    return normalized.split()

In [20]:
#apply function to news to test the funtion
news_clean = clean_text(news.text_clean)

In [21]:
news_clean

['QR',
 'Code',
 'Link',
 'Post',
 'maintenance',
 'receipt',
 'available,',
 'one',
 'owner',
 'truck.',
 'Cash',
 'sale.',
 'trades.',
 '64774780130',
 'NEW',
 'YORK:',
 'Automakers',
 'reported',
 'mixed',
 'US',
 'car',
 'sale',
 'January,',
 'strong',
 'demand',
 'SUVs',
 'pickup',
 'truck',
 'continuing',
 'provide',
 'cushion',
 'declining',
 'overall',
 'auto',
 'market.',
 'Ford',
 'Fiat',
 'Chrysler',
 'reported',
 'decline',
 'yearoveryear',
 'sales,',
 'General',
 'Motors',
 'scored',
 'modest',
 'increase',
 'Toyota',
 'saw',
 'substantial',
 'jump.',
 'US',
 'car',
 'sale',
 'fell',
 'last',
 'year',
 'first',
 'time',
 'since',
 'financial',
 'crisis',
 'projected',
 'decline',
 '2018.',
 'Still,',
 'analyst',
 'industry',
 'executive',
 'expect',
 'US',
 'sale',
 'year',
 'come',
 'solid',
 '16',
 'million',
 'vehicle',
 'amid',
 'low',
 'unemployment',
 'strong',
 'consumer',
 'confidence.',
 'US',
 'economic',
 'factor',
 'healthy',
 'seeing',
 'effect',
 'auto',
 'in

The output of the function is a set of tokens which can be used for further analysis

# Process data for LDA models

Convert tokens obtained in the previous step to bag of words

In [22]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
bow= CountVectorizer(analyzer=clean_text).fit(news.text_clean) #creatign a bag of words using the function to clean the text

In [24]:
dataBOW=bow.transform(news.text_clean)

In [25]:
dataBOW.shape

(100, 5555)

In [26]:
from sklearn.feature_extraction.text import TfidfTransformer

In [27]:
tfidf=TfidfTransformer().fit(dataBOW) #converting bag of words to tf-idf

In [28]:
dataText_tfidf=tfidf.transform(dataBOW)

In [29]:
print(dataText_tfidf.shape)

(100, 5555)


In [39]:
#check some feature names
bow.get_feature_names()[5507]

'world'

# Topic Modelling

Using LDA to generate topics

In [40]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.model_selection import GridSearchCV
from pprint import pprint

# Plotting tools
import pyLDAvis
import pyLDAvis.sklearn

  from collections import Mapping


Using grid search to obtain optimal parameters for the LDA model

In [41]:
search_params = {'n_components': [5,7,10], 'learning_decay': [.5, .7, .9]}

In [42]:
lda = LatentDirichletAllocation() #initialize lda model with default params

In [43]:
model_bow = GridSearchCV(lda, param_grid=search_params) #grid search to search for optimal param

__Find the best model using grid search on bag of words__

In [44]:
model_bow.fit(dataBOW) #fit lda model on bag of words 



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='batch',
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                   

In [45]:
#best lda model using bag of words
best_lda_model_bow = model_bow.best_estimator_

In [46]:
#best lda model's params and scores
print("Best Model's Params: ", model_bow.best_params_)
print("Best Log Likelihood Score: ", model_bow.best_score_)
print("Model Perplexity: ", best_lda_model_bow.perplexity(dataBOW))

Best Model's Params:  {'learning_decay': 0.5, 'n_components': 5}
Best Log Likelihood Score:  -80783.40502091534
Model Perplexity:  1877.6482419780646


__Find the best model using grid search on tf-idf matrix__

In [47]:
model_tfidf = GridSearchCV(lda, param_grid=search_params) #grid search to search for optimal param
model_tfidf.fit(dataText_tfidf) 



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=LatentDirichletAllocation(batch_size=128,
                                                 doc_topic_prior=None,
                                                 evaluate_every=-1,
                                                 learning_decay=0.7,
                                                 learning_method='batch',
                                                 learning_offset=10.0,
                                                 max_doc_update_iter=100,
                                                 max_iter=10,
                                                 mean_change_tol=0.001,
                                                 n_components=10, n_jobs=None,
                                                 perp_tol=0.1,
                                                 random_state=None,
                                                 topic_word_prior=None,
                                   

In [48]:
#best lda model using bag of words
best_lda_model_tfidf = model_tfidf.best_estimator_

__Comparing the best models form bag of words and tf-idf__

In [49]:
#best lda model's params and scores- Bag of words and tfidf
print("Bag of Words: Best Model's Params: ", model_bow.best_params_)
print("Bag of Words: Best Log Likelihood Score: ", model_bow.best_score_)
print("Bag of Words: Model Perplexity: ", best_lda_model_bow.perplexity(dataBOW))

print("TFIDF: Best Model's Params: ", model_tfidf.best_params_)
print("TFIDF: Best Log Likelihood Score: ", model_tfidf.best_score_)
print("TFIDF: Model Perplexity: ", best_lda_model_tfidf.perplexity(dataBOW))

Bag of Words: Best Model's Params:  {'learning_decay': 0.5, 'n_components': 5}
Bag of Words: Best Log Likelihood Score:  -80783.40502091534
Bag of Words: Model Perplexity:  1877.6482419780646
TFIDF: Best Model's Params:  {'learning_decay': 0.9, 'n_components': 5}
TFIDF: Best Log Likelihood Score:  -4208.25849088492
TFIDF: Model Perplexity:  10856.79594974383


From the above metrics, it looks like the model with bag of words has performed better (lower perplexity and log likelihood scores)

# Build LDA models using the best params identified above

__Bag of words model__

In [51]:
lda_bow=best_lda_model_bow.transform(dataBOW)

Identify dominant topic across documents

In [52]:
#create a dataframe to identify the dominat topic across documents
topicnames = ["Topic" + str(i) for i in range(best_lda_model_bow.n_components)] #create column names
docnames = ["Doc" + str(i) for i in range(len(news))]#create indices
df_document_topic = pd.DataFrame(np.round(lda_bow, 2), columns=topicnames, index=docnames)#obtain scores for each topic from the lda model
dominant_topic = np.argmax(df_document_topic.values, axis=1) #identify the dominat score/ topic
df_document_topic['dominant_topic'] = dominant_topic
df_document_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,dominant_topic
Doc0,0.95,0.01,0.01,0.01,0.01,0
Doc1,1.00,0.00,0.00,0.00,0.00,0
Doc2,0.97,0.01,0.01,0.01,0.01,0
Doc3,0.00,0.00,0.99,0.00,0.00,2
Doc4,0.14,0.00,0.00,0.85,0.00,3
...,...,...,...,...,...,...
Doc95,0.00,0.00,1.00,0.00,0.00,2
Doc96,0.99,0.00,0.00,0.00,0.00,0
Doc97,0.99,0.00,0.00,0.00,0.00,0
Doc98,0.99,0.00,0.00,0.00,0.00,0


In [53]:
#identify the most frequently occuring topic
df_topic_distribution = df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
df_topic_distribution.columns = ['Topic Num', 'Num Documents']
df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,0,35
1,4,27
2,2,16
3,3,13
4,1,9


Keywords in a topic

In [54]:
#dataframe of scores for each word in a topic
df_topic_keywords = pd.DataFrame(best_lda_model_bow.components_)
df_topic_keywords.columns = bow.get_feature_names()
df_topic_keywords.index = topicnames

#function to get top few words in a topic
def show_topics(vectorizer=bow, lda_model=best_lda_model_bow, n_words=20):
    keywords = np.array(bow.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

#dataframe with topmost keywords in a topic
topic_keywords = show_topics(vectorizer=bow, lda_model=best_lda_model_bow, n_words=15)
df_topic_keywords = pd.DataFrame(topic_keywords)
df_topic_keywords.columns = ['Word '+str(i) for i in range(df_topic_keywords.shape[1])]
df_topic_keywords.index = ['Topic '+str(i) for i in range(df_topic_keywords.shape[0])]
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,Toyota,sale,vehicle,percent,year,car,January,2018,industry,Lexus,last,month,said,per,one
Topic 1,unit,Ford,market,Toyota,Fords,car,new,January,Japan,margin,DS,share,said,2017,vehicle
Topic 2,percent,US,fell,market,stock,lower,yield,close,investor,future,price,contact,benchmark,04,Toyota
Topic 3,percent,car,Toyota,1,losing,ball,market,2018,player,point,release,declining,also,Index,2019
Topic 4,US,percent,earnings,yield,per,index,cent,share,benchmark,lower,01,Friday,latest,report,said


__Interpretation__
1. Topic 0: Sale of Lexus in January 2018
2. Topic 1: Shares and Margin
3. Topic 2: Stocks fell
4. Topic 3: Declining Market
5. Topic 4: Earnings report

__TF-IDF model__

In [55]:
lda_tfidf=best_lda_model_tfidf.transform(dataText_tfidf) 

Identify dominant topic across documents

In [56]:
#create a dataframe to identify the dominat topic across documents
tfidf_topicnames = ["Topic" + str(i) for i in range(best_lda_model_tfidf.n_components)] #create column names
tfidf_docnames = ["Doc" + str(i) for i in range(len(news))]#create indices
tfidf_df_document_topic = pd.DataFrame(np.round(lda_tfidf, 2), columns=topicnames, index=docnames)#obtain scores for each topic from the lda model
tfidf_dominant_topic = np.argmax(tfidf_df_document_topic.values, axis=1) #identify the dominat score/ topic
tfidf_df_document_topic['dominant_topic'] = tfidf_dominant_topic
tfidf_df_document_topic

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,dominant_topic
Doc0,0.82,0.04,0.04,0.04,0.04,0
Doc1,0.02,0.93,0.02,0.02,0.02,1
Doc2,0.03,0.03,0.87,0.03,0.03,2
Doc3,0.89,0.03,0.03,0.03,0.03,0
Doc4,0.02,0.02,0.02,0.90,0.02,3
...,...,...,...,...,...,...
Doc95,0.01,0.95,0.01,0.01,0.01,1
Doc96,0.02,0.02,0.02,0.90,0.02,3
Doc97,0.02,0.08,0.02,0.02,0.85,4
Doc98,0.02,0.03,0.02,0.02,0.90,4


In [57]:
#identify the most frequently occuring topic
tfidf_df_topic_distribution = tfidf_df_document_topic['dominant_topic'].value_counts().reset_index(name="Num Documents")
tfidf_df_topic_distribution.columns = ['Topic Num', 'Num Documents']
tfidf_df_topic_distribution

Unnamed: 0,Topic Num,Num Documents
0,1,38
1,4,21
2,3,17
3,0,13
4,2,11


Keywords in a topic

In [58]:
#dataframe of scores for each word in a topic
tfidf_df_topic_keywords = pd.DataFrame(best_lda_model_bow.components_)
tfidf_df_topic_keywords.columns = bow.get_feature_names()
tfidf_df_topic_keywords.index = topicnames

#function to get top few words in a topic
def show_topics(vectorizer=bow, lda_model=best_lda_model_tfidf, n_words=20):
    keywords = np.array(bow.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

#dataframe with topmost keywords in a topic
tfidf_topic_keywords = show_topics(vectorizer=bow, lda_model=best_lda_model_tfidf, n_words=15)
tfidf_df_topic_keywords = pd.DataFrame(tfidf_topic_keywords)
tfidf_df_topic_keywords.columns = ['Word '+str(i) for i in range(tfidf_df_topic_keywords.shape[1])]
tfidf_df_topic_keywords.index = ['Topic '+str(i) for i in range(tfidf_df_topic_keywords.shape[0])]
tfidf_df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,unit,market,Toyota,vehicle,registration,followed,sale,truck,January,Link,Code,QR,Esso,Post,Tacoma
Topic 1,percent,US,yield,earnings,index,cent,per,lower,benchmark,share,01,Friday,rose,report,latest
Topic 2,contact,Canada,Toyota,post,car,unsolicited,id,vehicle,offer,Camry,transmission,service,info,Post,Car
Topic 3,air,bag,Lexus,Toyota,car,March,owner,customer,could,vehicle,model,stop,electrical,new,recall
Topic 4,contact,offer,unsolicited,post,id,service,Toyota,car,QR,Link,Code,Post,vehicle,export,4


__Interpretation of topics__
1. Topic 0: Sale of vehicles
2. Topic 1: Earnings
3. Topic 2: Toyota Camry in Canada
4. Topic 3: Airbag recall
5. Topic 4: Not clearly interpretable

# Final Model-Interpretation of topics

In [59]:
#topics from bag of words model
news['bow_dominant_topic']=df_document_topic.dominant_topic.tolist()

#topics from tf-idf model
news['tfidf_dominant_topic']=tfidf_df_document_topic.dominant_topic.tolist()

__Bag of Words Model__

In [66]:
df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,Toyota,sale,vehicle,percent,year,car,January,2018,industry,Lexus,last,month,said,per,one
Topic 1,unit,Ford,market,Toyota,Fords,car,new,January,Japan,margin,DS,share,said,2017,vehicle
Topic 2,percent,US,fell,market,stock,lower,yield,close,investor,future,price,contact,benchmark,04,Toyota
Topic 3,percent,car,Toyota,1,losing,ball,market,2018,player,point,release,declining,also,Index,2019
Topic 4,US,percent,earnings,yield,per,index,cent,share,benchmark,lower,01,Friday,latest,report,said


__Visualize the topic model obtained from bag of words__

In [61]:
pyLDAvis.enable_notebook()
bow_vis = pyLDAvis.sklearn.prepare(best_lda_model_bow, dataBOW, bow, mds='tsne')
bow_vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


The topics from bag of words can be interpreted as follows
1. Sale of Lexus in January 2018
2. Shares and Margin
3. Stocks fell
4. Declining Market
5. Earnings report

__TF_IDF Model__

In [68]:
tfidf_df_topic_keywords

Unnamed: 0,Word 0,Word 1,Word 2,Word 3,Word 4,Word 5,Word 6,Word 7,Word 8,Word 9,Word 10,Word 11,Word 12,Word 13,Word 14
Topic 0,unit,market,Toyota,vehicle,registration,followed,sale,truck,January,Link,Code,QR,Esso,Post,Tacoma
Topic 1,percent,US,yield,earnings,index,cent,per,lower,benchmark,share,01,Friday,rose,report,latest
Topic 2,contact,Canada,Toyota,post,car,unsolicited,id,vehicle,offer,Camry,transmission,service,info,Post,Car
Topic 3,air,bag,Lexus,Toyota,car,March,owner,customer,could,vehicle,model,stop,electrical,new,recall
Topic 4,contact,offer,unsolicited,post,id,service,Toyota,car,QR,Link,Code,Post,vehicle,export,4


__Visualize the topic model obtained from tf-idf__

In [62]:
pyLDAvis.enable_notebook()
bow_vis_tfidf = pyLDAvis.sklearn.prepare(best_lda_model_tfidf, dataText_tfidf, bow, mds='tsne')
bow_vis_tfidf

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


The topics from the tf-idf model can be interpreted as follows
1. Sale of vehicles
2. Earnings
3. Toyota Camry in Canada
4. Airbag recall
5. Not clearly interpretable

Here, bag of words performed better over TF-IDF- both scores and interpretability of the topics was better with with bag of words model.