In [None]:
# Import the following libraries
import pandas as pd
import numpy as np

# NLTK libraries
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import nltk

# Visualization libraries
import plotly.graph_objects as go
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from PIL import Image # for world cloud image

# Spacy for preprocessing
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_sm')


# To change date to datetime
from datetime import datetime
import re 

from collections import Counter
import string
import scipy.sparse

# Gensim libraries
from gensim import corpora
from gensim.models.ldamulticore import LdaMulticore
import pyLDAvis.gensim_models
from gensim.models import CoherenceModel
from gensim import matutils

# To show all the columns
pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 300)

# to pickle dataframe
import pickle

# Avoid warnings
import warnings
warnings.filterwarnings("ignore")

# Enable logging for gensim - optional but important
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [None]:
##Following code worked for me and I'm using Google Colaboratory.

!pip install pyLDAvis

In [None]:
df = pd.read_csv('/content/gdrive/MyDrive/car_5_brands.csv')

In [None]:
!pip install -U pandas-profiling

In [None]:
df.shape

In [None]:
## Reset the index here ##
df.reset_index(inplace=True, drop=True)

In [None]:
# Drop the unammed column and Author _name column
df.drop(['car_year'],axis=1,inplace=True)
# CHeck the data info
df.info()

In [None]:
df.head()

In [None]:
# Check for nun values
df.isnull().sum()

In [None]:
df.head()

In [None]:
# spliting the Vehicle_title into year, make and model column
df['year'] = df.Vehicle_Title.str.split(' ').apply(lambda x:x[0])
df['make'] = df.Vehicle_Title.str.split(' ').apply(lambda x:x[1])
df['model'] = df.Vehicle_Title.str.split(' ').apply(lambda x:x[2])
df.head()

In [None]:
brand_review_count = brand.groupby('make').count()['review'].reset_index()

In [None]:
## Exploratory Data Analysis ##

# To see the percentage of each brands review in the dataset
df_review_pct = df['brand_name'].value_counts(normalize = True).round(2) * 100 
df_review_pct

In [None]:
# Using plotly to create Barchat
bar_go = go.Bar(x = df_review_count['brand_name'], y = df_review_count['review'], name='Review count')
fig = go.Figure(
    data=[bar_go],
    layout=go.Layout(width=1000, height=600, title='Brand Review Count', xaxis_title= 'Brand Name', yaxis_title='Review count'))
fig.show()


In [None]:
# the count of each brand according to their rating
grouped_brand = df.groupby([df.brand_name, df.Rating]).size().reset_index().rename(columns = {0: 'counts'})
grouped_brand

In [None]:
# Remove the numbers from the review
df['review'] = df['review'].apply(lambda x: re.sub(r'[^A-Za-z\s]', '', x))

# Convert the reviews to lowercase
df['review'] = df['review'].map(lambda x: x.lower())
df.review

In [None]:
 ##Join the review.
long_string = ','.join(list(df.review.values))


In [None]:
## Wordcloud ##
long_string = ','.join(list(df.review.values))
# Import the image of a car to have it as mask
##car_mask = np.array(Image.open("audi_cloud.png"))

# Create a WordCloud object
wordcloud = WordCloud(background_color="white", max_words=100, contour_width=3, contour_color='steelblue')

# Generate a word cloud
wordcloud.generate(long_string)

plt.figure(figsize= (20,7))
plt.imshow(wordcloud, interpolation = 'bilinear')
plt.axis('off')
plt.title("Common 100 words in reviews", pad = 14, weight = 'bold')

plt.show()

In [None]:
wordcloud.to_file("Car_reviw.png")

In [None]:
import nltk
nltk.download('stopwords')
  

In [None]:
# Setting up the stop-words
stop_words = stopwords.words('english')
# stop_words.extend(['])
stop_words.extend(['porsche,' 'mercede','comfortsport', 'mercedes','mercedes-benz', 'honda','toyota','audi', 'benz','bentley','lexus',
                  'nissan','volvo','drive','nt','like','vehicle','infiniti','good','miles','corvette','come','edmund','lotus','diego','snake',
                 'porsche', 'cayman','bought','year','minute','chicago','car','home', 'work','think','suv','people','edmunds',
                  'cabriolet','lexuss','japan','husband','baby','range', 'rover','cadillac','cadillacs','michelin','texas','second',
                   'awsome','one','now', 'take', 'give', 'new','levinson','road','love','sedan','wife','sport','bang','tank',
                   'truck','lemon','imho','pathfinder','infinity','convertible','allroad','conv','bike','ski','grocery','mclass'
                  ,'hardtop','club','hubby','child','zoom','test','etc','brain','ashamed','carmax','alpina','rocketship','great','germany',
                  'autobahn','mercedez'])

In [None]:
def lematized_review(text): # text
    rev_text = nlp(text)
    # Extract lematized words in lower case format if not digits, not punctuation, not stopword, and length not less than 2
    rev_text = ([token.lemma_.lower() for token in rev_text if not token.is_stop and token.text not in stop_words and not token.is_punct and len(token.text) > 3])
    return rev_text

In [None]:
%%time
# Applying the function on the reviews 

df['review'] = df['review'].apply(lematized_review)

In [None]:
#  Let's pickle it for later use
clean_brand_review = df['review']


In [None]:
%%time
# Create Dictionary
id2word_1 = corpora.Dictionary(clean_brand_review)

# Create Corpus: Term Document Frequency
corpus_1 = [id2word_1.doc2bow(review) for review in clean_brand_review]

 # Build LDA model
ldamodel = LdaMulticore(corpus= corpus_1, num_topics =8, id2word=id2word_1,chunksize=2000, passes=50,per_word_topics=True)

In [None]:
from pprint import pprint

pprint(ldamodel.show_topics(formatted=False))

In [None]:
# Compute Perplexity

#It's a measure of how good the model is. The lower the better. Perplexity is a negative value
print('\nPerplexity: ', ldamodel.log_perplexity(corpus_1))  

# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=ldamodel, texts=clean_brand_review, dictionary=id2word_1, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\n Basic Ldamodel Coherence Score: ', coherence_lda)

Perplexity: -7.703112835107014

Basic Ldamodel Coherence Score: 0.5423936154526896 Notes

perplexity is a measurement of how well a probability distribution or probability model predicts a sample. It may be used to compare probability models. A low perplexity indicates the probability distribution is good at predicting the sample.

The coherence score is used in assessing the quality of the learned topics, the closer to 1 the better

In [None]:
import os
from gensim.models.wrappers import LdaMallet
from gensim.models.wrappers.ldamallet import malletmodel2ldamodel
os.environ['MALLET_HOME'] = '/content/gdrive/MyDrive'

In [None]:
%%time
#  point the path to the mallet path on my computer
mallet_path = '/content/'#insert the path

# Instantiate
ldamallet = LdaMallet(mallet_path,corpus=corpus_1, num_topics=10, id2word=id2word_1)

In [None]:
# Show Topics
from pprint import pprint
pprint(ldamallet.show_topics(formatted=False))

# Compute Coherence Score
coherence_model_ldamallet = CoherenceModel(model=ldamallet, texts=clean_brand_review, dictionary=id2word_1, coherence='c_v')
coherence_ldamallet = coherence_model_ldamallet.get_coherence()
print('\n Mallet Coherence Score: ', coherence_ldamallet)

To find the optimum number of topics 

In [None]:
# Defining a function to get coherence score
def my_coherence_vals(dictionary, corpus, texts, limit, start, step):
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = LdaMallet(mallet_path, corpus=corpus_1, num_topics=num_topics, id2word=id2word_1)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [None]:
# To get the coherence values
model_list, coherence_values = my_coherence_vals(dictionary=id2word_1, corpus=corpus_1, 
                                                 texts=clean_brand_review, start=2, limit=26, step=6)

In [None]:
# Show graph for the coherence value scores vs number of topics
limit=26; start=2; step=6;
topics = range(start, limit, step)
plt.plot(topics, coherence_values)
plt.title("Coherence value score with the number of topics")
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()


In [None]:
# Print the coherence scores
for best, cv in zip(topics, coherence_values):
    print("Topic ", best, " has Coherence Value of", round(cv, 4))

In [None]:
# printing the best topics
optimal_model = model_list[1]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

In [None]:
# printing the best topics
optimal_model = model_list[1]
model_topics = optimal_model.show_topics(formatted=False)
pprint(optimal_model.print_topics(num_words=10))

Document Topic in each review 

In [None]:
# Define the sentence topics
def sentence_topics(ldamodel=ldamodel, corpus=corpus_1, texts=clean_brand_review):
    # Init output
    topics_df = pd.DataFrame()

    # Looping through the documents to find the main topics
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row[0], key=lambda x: (x[1]), reverse=True)
        
        # look for the Dominant topic, % contribution and Keywords 
        for j, (topic_num, prop_topic) in enumerate(row):
            
            # Diplay the dominant topics
            if j == 0:  
                dom = ldamodel.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in dom])
                topics_df = topics_df.append(pd.Series([int(topic_num), round(prop_topic,2)*100, topic_keywords]), ignore_index=True)
            else:
                break
    topics_df.columns = ['Dominant_Review_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Concatenate the text and the topics_df
    contents = pd.Series(texts)
    topics_df = pd.concat([topics_df, contents], axis=1)
    return(topics_df)


df_topic_sents_keywords = sentence_topics(ldamodel=ldamodel, corpus=corpus_1, texts=clean_brand_review)

# Format
dominant_review_topic = df_topic_sents_keywords.reset_index()
dominant_review_topic.columns = ['Review_No', 'Dominant_Review_Topic', 'Percent_contr_per_topic', 'Review_Keywords', 'Original review']

# Show
dominant_review_topic.head(10)

Here I will investigate the percentage of most document in each topic

In [1]:
# The Dataframe
sent_topics_df = pd.DataFrame()

topics_out = df_topic_sents_keywords.groupby('Dominant_Review_Topic')

for i, j in topics_out:
    sent_topics_df = pd.concat([sent_topics_df,j.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], axis=0)

    
sent_topics_df.reset_index(drop=True, inplace=True)

# Format
sent_topics_df.columns = ['Topic_Num', "Percent_contr_per_topic", "Review_Keywords", "Original review"]

# Display the 8 topics
sent_topics_df.head(8)

NameError: name 'pd' is not defined

In [None]:
# saving the review of each brand in a DataFrame
audi_df = ([sent for sent in df.loc[df['make'] == 'Audi', 'review']])
bmw_df = ([sent for sent in df.loc[df['make'] == 'BMW', 'review']])
mercedes_df = ([sent for sent in df.loc[df['make'] == 'Mercedes-Benz', 'review']])
lexus_df = ([sent for sent in df.loc[df['make'] == 'Lexus', 'review']])
inifiniti_df = ([sent for sent in df.loc[df['make'] == 'INFINITI', 'review']])

In [None]:
%%time

# Defining a function to get the topics and visualize them 
def each_brand(text):

    # Create Dictionary
    id2word_2 = corpora.Dictionary(text)

    # Create Corpus: Term Document Frequency
    corpus_2 = [id2word_2.doc2bow(review) for review in text]
    
    # Here I decided to reduce the number of topics to only six for each brand
    model = LdaMulticore(corpus=corpus_2, num_topics = 6, id2word=id2word_2,chunksize=2000, passes=80,per_word_topics=True)
    
    LDAvis_prepared = pyLDAvis.gensim.prepare(model, corpus=corpus_2, dictionary=id2word_2,sort_topics=False)

    return LDAvis_prepared

In [None]:
# Fitting the LDA model on each brand to visual the topics
Audi_lda = each_brand(audi_df)
lexus_lda = each_brand(lexus_df)
bmw_lda = each_brand(bmw_df)
mercedes_lda = each_brand(mercedes_df)
inifiniti_lda = each_brand(inifiniti_df)