# Mobile Phone Reviews Analysis - Text Mining

## Importing data 

In [1]:
import pandas as pd
import os
import datetime
import nltk 
import re
import numpy as np
import gensim
import pickle
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

#Note: You will need to install the packages below to use them
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer



In [2]:
#Just write your directory here
#data_dir = 'C:/Users/owzhe/Downloads/14-million-cell-phone-reviews/'
data_dir = 'C:/Users/Eugene/Desktop/Modules/Text Mining and Language Processing (IS450)/Project/Potential datasets/14-million-cell-phone-reviews/'
reviews_files = os.listdir(data_dir)

reviews_df = pd.DataFrame()

for file in reviews_files:
    current = pd.read_csv(data_dir + file, encoding = "ISO-8859-1")
    reviews_df = reviews_df.append(current)
    

In [3]:
print(len(reviews_df))
reviews_df.head()

1415133


Unnamed: 0,phone_url,date,lang,country,source,domain,score,score_max,extract,author,product
0,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Verizon Wireless,verizonwireless.com,10.0,10.0,As a diehard Samsung fan who has had every Sam...,CarolAnn35,Samsung Galaxy S8
1,/cellphones/samsung-galaxy-s8/,4/28/2017,en,us,Phone Arena,phonearena.com,10.0,10.0,Love the phone. the phone is sleek and smooth ...,james0923,Samsung Galaxy S8
2,/cellphones/samsung-galaxy-s8/,5/4/2017,en,us,Amazon,amazon.com,6.0,10.0,Adequate feel. Nice heft. Processor's still sl...,R. Craig,"Samsung Galaxy S8 (64GB) G950U 5.8"" 4G LTE Unl..."
3,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Samsung,samsung.com,9.2,10.0,Never disappointed. One of the reasons I've be...,Buster2020,Samsung Galaxy S8 64GB (AT&T)
4,/cellphones/samsung-galaxy-s8/,5/11/2017,en,us,Verizon Wireless,verizonwireless.com,4.0,10.0,I've now found that i'm in a group of people t...,S Ate Mine,Samsung Galaxy S8


## EDA 

In [4]:
# Taking only the reviews that are in english
reviews_en_df = reviews_df[reviews_df['lang'] == 'en']
print(len(reviews_en_df))
reviews_en_df.head()

554746


Unnamed: 0,phone_url,date,lang,country,source,domain,score,score_max,extract,author,product
0,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Verizon Wireless,verizonwireless.com,10.0,10.0,As a diehard Samsung fan who has had every Sam...,CarolAnn35,Samsung Galaxy S8
1,/cellphones/samsung-galaxy-s8/,4/28/2017,en,us,Phone Arena,phonearena.com,10.0,10.0,Love the phone. the phone is sleek and smooth ...,james0923,Samsung Galaxy S8
2,/cellphones/samsung-galaxy-s8/,5/4/2017,en,us,Amazon,amazon.com,6.0,10.0,Adequate feel. Nice heft. Processor's still sl...,R. Craig,"Samsung Galaxy S8 (64GB) G950U 5.8"" 4G LTE Unl..."
3,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Samsung,samsung.com,9.2,10.0,Never disappointed. One of the reasons I've be...,Buster2020,Samsung Galaxy S8 64GB (AT&T)
4,/cellphones/samsung-galaxy-s8/,5/11/2017,en,us,Verizon Wireless,verizonwireless.com,4.0,10.0,I've now found that i'm in a group of people t...,S Ate Mine,Samsung Galaxy S8


In [5]:
print(len(reviews_en_df['phone_url'].unique()))
print(reviews_en_df['phone_url'].unique())

4533
['/cellphones/samsung-galaxy-s8/'
 '/cellphones/samsung-galaxy-s6-edgeplus/'
 '/cellphones/samsung-galaxy-s8-plus/' ... '/cellphones/ericsson-pf-768/'
 '/cellphones/motorola-m3288/' '/cellphones/maxon-mx-3204/']


In [6]:
# Creating the function to return the phone model from the URL
def phone_model(url):
    phone_type = ' '.join(url[12:-1].split('-')).title()
    return phone_type

In [7]:
# Applying the phone_model function on the url 
reviews_en_df['phone_model'] = reviews_en_df['phone_url'].apply(phone_model)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [8]:
print(len(reviews_en_df['phone_model'].unique()))

4533


In [9]:
# Creating another copy of the reviews_df
reviews_copy_df = reviews_en_df.copy()

In [10]:
# Found out that the dates are abit off - majority are in month/day/year but there is around 20k that is day/month/year
def check_date(row_date):
    month, day, year = row_date.split('/')
    isValid = True
    try:
        datetime.datetime(int(year), int(month), int(day))
    except:
        isValid = False
    
    return isValid

In [11]:
# Creating a year column
def review_year(row_date):
    month, day, year = row_date.split('/')

    return year

In [12]:
reviews_copy_df['date_check'] = reviews_copy_df['date'].apply(check_date)
reviews_copy_df['year'] = reviews_copy_df['date'].apply(review_year)
reviews_copy_df.head()

Unnamed: 0,phone_url,date,lang,country,source,domain,score,score_max,extract,author,product,phone_model,date_check,year
0,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Verizon Wireless,verizonwireless.com,10.0,10.0,As a diehard Samsung fan who has had every Sam...,CarolAnn35,Samsung Galaxy S8,Samsung Galaxy S8,True,2017
1,/cellphones/samsung-galaxy-s8/,4/28/2017,en,us,Phone Arena,phonearena.com,10.0,10.0,Love the phone. the phone is sleek and smooth ...,james0923,Samsung Galaxy S8,Samsung Galaxy S8,True,2017
2,/cellphones/samsung-galaxy-s8/,5/4/2017,en,us,Amazon,amazon.com,6.0,10.0,Adequate feel. Nice heft. Processor's still sl...,R. Craig,"Samsung Galaxy S8 (64GB) G950U 5.8"" 4G LTE Unl...",Samsung Galaxy S8,True,2017
3,/cellphones/samsung-galaxy-s8/,5/2/2017,en,us,Samsung,samsung.com,9.2,10.0,Never disappointed. One of the reasons I've be...,Buster2020,Samsung Galaxy S8 64GB (AT&T),Samsung Galaxy S8,True,2017
4,/cellphones/samsung-galaxy-s8/,5/11/2017,en,us,Verizon Wireless,verizonwireless.com,4.0,10.0,I've now found that i'm in a group of people t...,S Ate Mine,Samsung Galaxy S8,Samsung Galaxy S8,True,2017


In [13]:
# Checking the number of wrong and correct dates 
wrong_dates = reviews_copy_df[reviews_copy_df['date_check'] == False]
print('No. of wrong dates:', len(wrong_dates))

correct_dates = reviews_copy_df[reviews_copy_df['date_check'] == True]
print('No. of correct dates:', len(correct_dates))

No. of wrong dates: 23448
No. of correct dates: 531298


In [14]:
# Created a function to clean the date for the incorrect date format
def clean_date(date):
    month, day, year = date.split('/')
    return '/'.join([day, month, year])

#     for index, row in reviews_df.iterrows():
#         month, day, year = row['date'].split('/')
#         if row['date_check'] == False:
#             reviews_df.loc[index, 'date'] = '/'.join([day, month, year])
#             row['date_check'] = True
#     return reviews_df

In [15]:
# Clean up the date format of the dataframe with wrong dates
wrong_dates['date'] = wrong_dates['date'].apply(clean_date)
wrong_dates

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,phone_url,date,lang,country,source,domain,score,score_max,extract,author,product,phone_model,date_check,year
1,/cellphones/samsung-s7262-duos-galaxy-ace/,11/17/2015,en,in,Zopper,zopper.com,10.0,10.0,Decent Functions and Easy to Operate Pros:- Th...,Expert Review,Samsung Galaxy Star Pro S7262 Black,Samsung S7262 Duos Galaxy Ace,False,2015
2,/cellphones/samsung-s7262-duos-galaxy-ace/,10/29/2015,en,in,Amazon,amazon.in,4.0,10.0,Not Good Phone such price. Hang too much and v...,Amazon Customer,Samsung Galaxy Star Pro GT-S7262 (Midnight Black),Samsung S7262 Duos Galaxy Ace,False,2015
3,/cellphones/samsung-s7262-duos-galaxy-ace/,10/29/2015,en,in,Amazon,amazon.in,6.0,10.0,not bad for features,Amazon Customer,Samsung Galaxy Star Pro GT-S7262 (Midnight Black),Samsung S7262 Duos Galaxy Ace,False,2015
4,/cellphones/samsung-s7262-duos-galaxy-ace/,10/29/2015,en,in,Amazon,amazon.in,10.0,10.0,Excellent product,NHK,Samsung Galaxy Star Pro GT-S7262 (Midnight Black),Samsung S7262 Duos Galaxy Ace,False,2015
5,/cellphones/samsung-s7262-duos-galaxy-ace/,10/27/2015,en,in,Amazon,amazon.in,8.0,10.0,Good in reasonable price,Rupali,Samsung Galaxy Star Pro GT-S7262 (Midnight Black),Samsung S7262 Duos Galaxy Ace,False,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97303,/cellphones/motorola-razr-432872/,5/13/2006,en,gb,Ciao,ciao.co.uk,8.0,10.0,I bought this phone originally because I decid...,tblake7,Motorola RAZR,Motorola Razr 432872,False,2006
97304,/cellphones/motorola-razr-432872/,3/18/2006,en,gb,Ciao,ciao.co.uk,8.0,10.0,"After making the switch from an old V60 TDMA, ...",sting_of_the_scorpion,Motorola RAZR,Motorola Razr 432872,False,2006
97906,/cellphones/alcatel-ot-918d/,1/30/2015,en,us,Amazon,amazon.com,10.0,10.0,Excelente producto,norbys moreno,Alcatel One Touch 918S Mix Unlocked GSM Phone ...,Alcatel Ot 918D,False,2015
97910,/cellphones/alcatel-ot-918d/,11/25/2013,en,us,Amazon,amazon.com,4.0,10.0,"Older Android version, too many junky apps and...","Amazon Customer ""Jarod""",Alcatel One Touch 918S Mix Unlocked GSM Phone ...,Alcatel Ot 918D,False,2013


In [16]:
# Returning the cleaned dataframe use for analysis 
cleaned_df = correct_dates.append(wrong_dates, ignore_index=True)
cleaned_df = cleaned_df[cleaned_df['year'].astype(int) >= 2013]
cleaned_df.drop(columns=['phone_url', 'date_check', 'year'], inplace=True)

In [17]:
# Checking null rows
null_columns = cleaned_df.columns[cleaned_df.isna().any()]
cleaned_df[null_columns].isna().sum()

score         158
score_max     158
extract      3448
author        305
dtype: int64

In [18]:
cleaned_df.dropna(subset=['extract'], inplace=True)
len(cleaned_df)

400063

In [19]:
#Removing pre-2007 reviews
# date_list = cleaned_df.loc[:,'date'].tolist()
# year_list = []
# for dates in date_list:
#     year = dates[-4:]
#     if (int(year)>=2007):
#         year_list.append(year)
#     else:
#         year_list.append("drop")
# cleaned_df.insert(0, 'year', year_list)
# cleaned_df.drop( cleaned_df[ cleaned_df['year'] == "drop" ].index , inplace=True)
# cleaned_df.drop(['year'], axis=1, inplace = True)
# len(cleaned_df)

## Storing/Opening pickle file for the cleaned dataframe 

In [20]:
import pickle 

###@@@@@@@@@ THIS IS TO ***SAVE*** THE CLEANED DATAFRAME FROM ABOVE @@@@@@@@@###
#pickle.dump(cleaned_df, open("cleaned_reviews_df.pkl", "wb"))

###@@@@@@@@@ THIS IS TO ***OPEN*** THE CLEANED DATAFRAME FROM THE STORED PICKLE FILE @@@@@@@@@###
### CHECKPOINT - can just load the pickle file and start running your analysis
#cleaned_df = pickle.load(open("cleaned_reviews_df.pkl", "rb"))

In [21]:
cleaned_df.head()

Unnamed: 0,date,lang,country,source,domain,score,score_max,extract,author,product,phone_model
0,5/2/2017,en,us,Verizon Wireless,verizonwireless.com,10.0,10.0,As a diehard Samsung fan who has had every Sam...,CarolAnn35,Samsung Galaxy S8,Samsung Galaxy S8
1,4/28/2017,en,us,Phone Arena,phonearena.com,10.0,10.0,Love the phone. the phone is sleek and smooth ...,james0923,Samsung Galaxy S8,Samsung Galaxy S8
2,5/4/2017,en,us,Amazon,amazon.com,6.0,10.0,Adequate feel. Nice heft. Processor's still sl...,R. Craig,"Samsung Galaxy S8 (64GB) G950U 5.8"" 4G LTE Unl...",Samsung Galaxy S8
3,5/2/2017,en,us,Samsung,samsung.com,9.2,10.0,Never disappointed. One of the reasons I've be...,Buster2020,Samsung Galaxy S8 64GB (AT&T),Samsung Galaxy S8
4,5/11/2017,en,us,Verizon Wireless,verizonwireless.com,4.0,10.0,I've now found that i'm in a group of people t...,S Ate Mine,Samsung Galaxy S8,Samsung Galaxy S8


In [22]:
## Create an index series to find back the actual extract later on
import numpy as np
import pandas as pd

data_len = len(cleaned_df)
index_series = np.arange(data_len)

cleaned_df['index'] = index_series

cleaned_df

Unnamed: 0,date,lang,country,source,domain,score,score_max,extract,author,product,phone_model,index
0,5/2/2017,en,us,Verizon Wireless,verizonwireless.com,10.0,10.0,As a diehard Samsung fan who has had every Sam...,CarolAnn35,Samsung Galaxy S8,Samsung Galaxy S8,0
1,4/28/2017,en,us,Phone Arena,phonearena.com,10.0,10.0,Love the phone. the phone is sleek and smooth ...,james0923,Samsung Galaxy S8,Samsung Galaxy S8,1
2,5/4/2017,en,us,Amazon,amazon.com,6.0,10.0,Adequate feel. Nice heft. Processor's still sl...,R. Craig,"Samsung Galaxy S8 (64GB) G950U 5.8"" 4G LTE Unl...",Samsung Galaxy S8,2
3,5/2/2017,en,us,Samsung,samsung.com,9.2,10.0,Never disappointed. One of the reasons I've be...,Buster2020,Samsung Galaxy S8 64GB (AT&T),Samsung Galaxy S8,3
4,5/11/2017,en,us,Verizon Wireless,verizonwireless.com,4.0,10.0,I've now found that i'm in a group of people t...,S Ate Mine,Samsung Galaxy S8,Samsung Galaxy S8,4
...,...,...,...,...,...,...,...,...,...,...,...,...
554608,3/18/2014,en,us,Amazon,amazon.com,8.0,10.0,I love this phone! The only problem I have is ...,"Brandi ""Brandi""",Motorola Droid RAZR 4G LTE Android Smartphone ...,Motorola Razr 432872,400058
554609,1/21/2014,en,us,Amazon,amazon.com,10.0,10.0,I love the 4G internet. I like all of the appl...,Amazon Customer,Motorola Droid RAZR 4G LTE Android Smartphone ...,Motorola Razr 432872,400059
554610,8/30/2013,en,us,Amazon,amazon.com,10.0,10.0,This phone works great. Is in good condition. ...,roshanda,Motorola Droid RAZR 4G LTE Android Smartphone ...,Motorola Razr 432872,400060
554743,1/30/2015,en,us,Amazon,amazon.com,10.0,10.0,Excelente producto,norbys moreno,Alcatel One Touch 918S Mix Unlocked GSM Phone ...,Alcatel Ot 918D,400061


In [23]:
sentence_list = []
index_list = []
scores = cleaned_df['score'].tolist()
index_score_list = []

extract_list = list(cleaned_df['extract'])

for num in index_series:
    sentences = extract_list[num]
    sentence_break = re.split(r'[.?!]', sentences)
    for sentence in sentence_break:
        if len(sentence) > 0:
            sentence_list.append(sentence.strip())
            index_list.append(num)
            index_score_list.append(scores[num])

In [24]:
sentences_df = pd.DataFrame()
sentences_df['sentence'] = sentence_list
sentences_df['index'] = index_list

sentences_df.dropna(inplace=True)

sentences_df.shape

(1025939, 2)

#  #1 LDA Gensim Model - All Text 

In [None]:
import nltk 
import re

def clean_sentence(sentence_list):
    stop_list = nltk.corpus.stopwords.words('english')
    tokenizer = nltk.tokenize.word_tokenize

    sentence_list  = [tokenizer(sentence.lower()) for sentence in sentence_list]
    sentence_list = [[w for w in sentence if re.search('^[a-z]+$',w)] for sentence in sentence_list]
    sentence_list = [[w for w in sentence if w not in stop_list] for sentence in sentence_list]

    return sentence_list

In [None]:
sentence_list = sentences_df['sentence'].values.tolist()
sentences_df['bag of words'] = clean_sentence(sentence_list)

In [None]:
sentences_words = sentences_df['bag of words'].values.tolist()

In [None]:
import gensim

# Using gensim to create a dictionary object of all the words in all extracts
sentence_dict = gensim.corpora.Dictionary(sentences_words)

# Returning the word vector for each extract from the gensim dict of words
sentence_vecs = [sentence_dict.doc2bow(words) for words in sentences_words]

In [None]:
# Finding the optimal number of topics
from gensim.models import CoherenceModel

iterations = 100
passes = 1
gensim_all_texts_list = []

for num_topic in range(3, 10):
    model_val = []
    gensim_all_texts = gensim.models.ldamodel.LdaModel(corpus=sentence_vecs, id2word=sentence_dict, num_topics=num_topic, iterations = iterations, passes = passes)
    coh_model = CoherenceModel(model=gensim_all_texts, texts=sentences_words, dictionary=sentence_dict, coherence='c_v')
    model_val.append(gensim_all_texts)
    model_val.append(coh_model.get_coherence())
    model_val.append(num_topic)
    gensim_all_texts_list.append(model_val)
    print("Topic " + str(num_topic) + " Score: " + str(coh_model.get_coherence()))

In [None]:
import matplotlib.pyplot as plt

coh_val = [coh for model, coh, topic in gensim_all_texts_list]

x = range(3, 10)
plt.plot(x, coh_val)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Gensim LDA Model - topics

for model in gensim_all_texts_list:
    print("Model #{} Coherence Score: {}".format(model[2], model[1]))
    topics = model.show_topics(formatted=False)
    for topic, word_list in topics:
        topic_num = topic + 1
        result_list = []
        for word, word_prob in word_list:
            result_list.append(word)
        print("Topic {}: {}".format(topic_num, ', '.join(result_list)))
    print('----------------------------------------------------------------------------')

In [None]:
import pickle 

###@@@@@@@@@ THIS IS TO ***SAVE*** THE GENSIM MODELS FROM ABOVE @@@@@@@@@###
#pickle.dump(gensim_all_texts_list, open("gensim_all_texts.pkl", "wb"))

###@@@@@@@@@ THIS IS TO ***OPEN*** THE GENSIM MODELS FROM THE STORED PICKLE FILE @@@@@@@@@###
### CHECKPOINT - can just load the pickle file and start running your analysis
# gensim_all_texts_list = pickle.load(open("gensim_all_texts.pkl", "rb"))

# #1 LDA Mallet Model - All Text

In [None]:
import gensim
import os 

os.environ.update({'MALLET_HOME':r'D:\\Softwares\\mallet-2.0.8'})
mallet_path = r'D:\\Softwares\\mallet-2.0.8\\bin\\mallet'

mallet_all_texts_list = []

for num_topic in range(3, 10):
    model_val = []
    mallet_all_texts = gensim.models.wrappers.LdaMallet(mallet_path, corpus=sentence_vecs, id2word=sentence_dict, num_topics=num_topic)
    mallet_coh_model = CoherenceModel(model=mallet_all_texts, texts=sentences_words, dictionary=sentence_dict, coherence='c_v')
    model_val.append(mallet_all_texts)
    model_val.append(mallet_coh_model.get_coherence())
    model_val.append(num_topic)
    mallet_all_texts_list.append(model_val)
    print("Topic " + str(num_topic) + " Score: " + str(mallet_coh_model.get_coherence()))

In [None]:
import matplotlib.pyplot as plt

mallet_coh_val = [coh for model, coh, topic in mallet_all_texts_list]

x = range(3, 10)
plt.plot(x, mallet_coh_val)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Mallet LDA Model - topics

for model in mallet_all_texts_list:
    print("Model #{} Coherence Score: {}".format(model[2], model[1]))
    topics = model.show_topics(formatted=False)
    for topic, word_list in topics:
        topic_num = topic + 1
        result_list = []
        for word, word_prob in word_list:
            result_list.append(word)
        print("Topic {}: {}".format(topic_num, ', '.join(result_list)))
    print('----------------------------------------------------------------------------')

In [None]:
import pickle 

###@@@@@@@@@ THIS IS TO ***SAVE*** THE MODELS FROM ABOVE @@@@@@@@@###
pickle.dump(mallet_all_texts_list, open("mallet_all_texts.pkl", "wb"))

###@@@@@@@@@ THIS IS TO ***OPEN*** THE MODELS FROM THE STORED PICKLE FILE @@@@@@@@@###
### CHECKPOINT - can just load the pickle file and start running your analysis
# mallet_all_texts_list = pickle.load(open("mallet_all_texts.pkl", "rb"))

# #2 LDA Gensim Model - Nouns Only

In [None]:
sentences_df = pd.DataFrame()
sentences_df['sentence'] = sentence_list
sentences_df['index'] = index_list
sentences_df.dropna(inplace=True)

sentences_df.head()

In [None]:
import nltk
nltk.download('averaged_perceptron_tagger')

In [None]:
import nltk

stop_list = nltk.corpus.stopwords.words('english')
tokenizer = nltk.tokenize.word_tokenize
extra_stop_words = ['phone', 'iphone', 'nokia', 'samsung', 'htc', 'lg', 'galaxy', 'ca', 'motorola', 'android', 'verizon', 'i', '%', 't', 's']

def get_nouns(sentence_row):

    noun_list = []
    sentence = nltk.pos_tag(tokenizer(sentence_row.lower()))
    for w, pos in sentence: 
        if pos == 'NN' and w not in extra_stop_words:
            noun_list.append(w)
    
    return noun_list

#     sentence_list = [tokenizer(sentence.lower()) for sentence in sentence_list]
#     sentence_tagging = [nltk.pos_tag(sentence) for sentence in sentence_list] 
#     sentence_nouns = [w for w in sentence if is_noun(w) for sentence in sentence_list]

In [None]:
sentences_df['sentence_tags'] = sentences_df['sentence'].apply(get_nouns)

In [None]:
sentences_df = sentences_df[sentences_df['sentence_tags'].str.len()>0]

In [None]:
sentences_df.head()

In [None]:
sentence_tags = sentences_df['sentence_tags'].values.tolist()

In [None]:
import gensim

# Using gensim to create a dictionary object of all the words in all extracts
sentence_dict = gensim.corpora.Dictionary(sentence_tags)

# Returning the word vector for each extract from the gensim dict of words
sentence_vecs = [sentence_dict.doc2bow(words) for words in sentence_tags]

In [None]:
# Finding the optimal number of topics
from gensim.models import CoherenceModel

gensim_noun_models = []

for num_topic in range(3, 10):
    model_val = []
    gensim_nouns = gensim.models.ldamodel.LdaModel(corpus=sentence_vecs, id2word=sentence_dict, num_topics=num_topic)
    coh_model = CoherenceModel(model=gensim_nouns, texts=sentence_tags, dictionary=sentence_dict, coherence='c_v')
    model_val.append(gensim_nouns)
    model_val.append(coh_model.get_coherence())
    model_val.append(num_topic)
    gensim_noun_models.append(model_val)
    print("Topic " + str(num_topic) + " Score: " + str(coh_model.get_coherence()))

In [None]:
import matplotlib.pyplot as plt

coh_val = [coh for model, coh, topic in gensim_noun_models]

x = range(3, 10)
plt.plot(x, coh_val)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Mallet LDA Model - topics
for model in gensim_noun_models:
    print("Model #{} Coherence Score: {}".format(model[2], model[1]))
    topics = model[0].show_topics(formatted=False)
    for topic, word_list in topics:
        topic_num = topic + 1
        result_list = []
        for word, word_prob in word_list:
            result_list.append(word)
        print("Topic {}: {}".format(topic_num, ', '.join(result_list)))
    print('----------------------------------------------------------------------------')

In [None]:
import pickle 

###@@@@@@@@@ THIS IS TO ***SAVE*** THE MODELS FROM ABOVE @@@@@@@@@###
pickle.dump(gensim_noun_models, open("gensim_noun_models.pkl", "wb"))

###@@@@@@@@@ THIS IS TO ***OPEN*** THE MODELS FROM THE STORED PICKLE FILE @@@@@@@@@###
### CHECKPOINT - can just load the pickle file and start running your analysis
# gensim_noun_models = pickle.load(open("gensim_noun_models.pkl", "rb"))

# #2 LDA Mallet Model - Nouns Only

In [None]:
import gensim
import os 

os.environ.update({'MALLET_HOME':r'D:\\Softwares\\mallet-2.0.8'})
mallet_path = r'D:\\Softwares\\mallet-2.0.8\\bin\\mallet'

mallet_noun_models = []

for num_topic in range(3, 10):
    model_val = []
    mallet_nouns_only = gensim.models.wrappers.LdaMallet(mallet_path, corpus=sentence_vecs, id2word=sentence_dict, num_topics=num_topic)
    mallet_coh_model = CoherenceModel(model=mallet_nouns_only, texts=sentence_tags, dictionary=sentence_dict, coherence='c_v')
    model_val.append(mallet_nouns_only)
    model_val.append(mallet_coh_model.get_coherence())
    model_val.append(num_topic)
    mallet_noun_models.append(model_val)
    print("Topic " + str(num_topic) + " Score: " + str(mallet_coh_model.get_coherence()))

In [None]:
import matplotlib.pyplot as plt

mallet_coh_val = [coh for model, coh, topic in mallet_noun_models]

x = range(3, 10)
plt.plot(x, mallet_coh_val)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Mallet LDA Model - topics
for model in mallet_noun_models:
    print("Model #{} Coherence Score: {}".format(model[2], model[1]))
    topics = model[0].show_topics(formatted=False)
    for topic, word_list in topics:
        topic_num = topic + 1
        result_list = []
        for word, word_prob in word_list:
            result_list.append(word)
        print("Topic {}: {}".format(topic_num, ', '.join(result_list)))
    print('----------------------------------------------------------------------------')

In [None]:
import pickle 

###@@@@@@@@@ THIS IS TO ***SAVE*** THE MODELS FROM ABOVE @@@@@@@@@###
pickle.dump(mallet_noun_models, open("mallet_noun_models.pkl", "wb"))

###@@@@@@@@@ THIS IS TO ***OPEN*** THE MODELS FROM THE STORED PICKLE FILE @@@@@@@@@###
### CHECKPOINT - can just load the pickle file and start running your analysis
# mallet_noun_models = pickle.load(open("mallet_noun_models.pkl", "rb"))

# #3 LDA Gensim Model - Nouns and Adjectives

In [None]:
sentences_df = pd.DataFrame()
sentences_df['sentence'] = sentence_list
sentences_df['index'] = index_list
sentences_df.dropna(inplace=True)

sentences_df.head()

In [None]:
import nltk

stop_list = nltk.corpus.stopwords.words('english')
tokenizer = nltk.tokenize.word_tokenize
extra_stop_words = ['phone', 'iphone', 'nokia', 'samsung', 'htc', 'lg', 'galaxy', 'ca', 'motorola', 'android', 'verizon', 'i', '%', 't', 's']
extra_stop_words += ['s4', 's5', 's6', 's7', 's3', 's8', 's9', 'm9', 'moto']

def get_nouns_adj(sentence_row):

    noun_list = []
    sentence = nltk.pos_tag(tokenizer(sentence_row.lower()))
    for w, pos in sentence: 
        if pos == 'NN' or pos == 'JJ':
            if w not in extra_stop_words:
                noun_list.append(w)
    
    return noun_list


In [None]:
sentences_df['sentence_tags'] = sentences_df['sentence'].apply(get_nouns_adj)

In [None]:
sentences_df.head()

In [None]:
sentences_df = sentences_df[sentences_df['sentence_tags'].str.len()>0]
sentences_df.head()

In [None]:
sentence_tags = sentences_df['sentence_tags'].values.tolist()

In [None]:
import gensim

# Using gensim to create a dictionary object of all the words in all extracts
sentence_dict = gensim.corpora.Dictionary(sentence_tags)

# Returning the word vector for each extract from the gensim dict of words
sentence_vecs = [sentence_dict.doc2bow(words) for words in sentence_tags]

In [None]:
# Finding the optimal number of topics
from gensim.models import CoherenceModel

gensim_noun_adj_models = []
coh_val = []
model_topics = []

for num_topic in range(3, 10):
    model_val = []
    gensim_nouns_adj = gensim.models.ldamodel.LdaModel(corpus=sentence_vecs, id2word=sentence_dict, num_topics=num_topic)
    coh_model = CoherenceModel(model=gensim_nouns_adj, texts=sentence_tags, dictionary=sentence_dict, coherence='c_v')
    model_val.append(gensim_nouns_adj)
    model_val.append(coh_model.get_coherence())
    model_val.append(num_topic)
    gensim_noun_adj_models.append(model_val)
    print("Topic " + str(num_topic) + " Score: " + str(coh_model.get_coherence()))

In [None]:
import matplotlib.pyplot as plt

gensim_coh_val = [coh for model, coh, topic in gensim_noun_adj_models]

x = range(3, 10)
plt.plot(x, gensim_coh_val)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Mallet LDA Model - topics
for model in gensim_noun_adj_models:
    print("Model #{} Coherence Score: {}".format(model[2], model[1]))
    topics = model[0].show_topics(formatted=False)
    for topic, word_list in topics:
        topic_num = topic + 1
        result_list = []
        for word, word_prob in word_list:
            result_list.append(word)
        print("Topic {}: {}".format(topic_num, ', '.join(result_list)))
    print('----------------------------------------------------------------------------')

In [None]:
import pickle 

###@@@@@@@@@ THIS IS TO ***SAVE*** THE MODELS FROM ABOVE @@@@@@@@@###
# pickle.dump(gensim_noun_adj_models, open("gensim_noun_adj_models.pkl", "wb"))

###@@@@@@@@@ THIS IS TO ***OPEN*** THE MODELS FROM THE STORED PICKLE FILE @@@@@@@@@###
### CHECKPOINT - can just load the pickle file and start running your analysis
gensim_noun_adj_models = pickle.load(open("gensim_noun_adj_models.pkl", "rb"))

# #3 LDA Mallet Model - Nouns and Adjectives

In [None]:
import gensim
import os 
from gensim.models import CoherenceModel

os.environ.update({'MALLET_HOME':r'D:\\Softwares\\mallet-2.0.8'})
mallet_path = r'D:\\Softwares\\mallet-2.0.8\\bin\\mallet'

mallet_noun_adj_models = []

for num_topic in range(3, 10):
    model_val = []
    mallet_noun_adj = gensim.models.wrappers.LdaMallet(mallet_path, corpus=sentence_vecs, id2word=sentence_dict, num_topics=num_topic)
    mallet_coh_model = CoherenceModel(model=mallet_noun_adj, texts=sentence_tags, dictionary=sentence_dict, coherence='c_v')
    model_val.append(mallet_noun_adj)
    model_val.append(mallet_coh_model.get_coherence())
    model_val.append(num_topic)
    mallet_noun_adj_models.append(model_val)
    print("Topic " + str(num_topic) + " Score: " + str(mallet_coh_model.get_coherence()))

In [None]:
import matplotlib.pyplot as plt

mallet_coh_val = [coh for model, coh, topic in mallet_noun_adj_models]

x = range(3, 10)
plt.plot(x, mallet_coh_val)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Mallet LDA Model - topics
for model in mallet_noun_adj_models:
    print("Model #{} Coherence Score: {}".format(model[2], model[1]))
    topics = model[0].show_topics(formatted=False)
    for topic, word_list in topics:
        topic_num = topic + 1
        result_list = []
        for word, word_prob in word_list:
            result_list.append(word)
        print("Topic {}: {}".format(topic_num, ', '.join(result_list)))
    print('----------------------------------------------------------------------------')

In [None]:
import pickle 

###@@@@@@@@@ THIS IS TO ***SAVE*** THE MODELS FROM ABOVE @@@@@@@@@###
pickle.dump(mallet_noun_adj_models, open("mallet_noun_adj_models.pkl", "wb"))

###@@@@@@@@@ THIS IS TO ***OPEN*** THE MODELS FROM THE STORED PICKLE FILE @@@@@@@@@###
### CHECKPOINT - can just load the pickle file and start running your analysis
# mallet_noun_adj_models = pickle.load(open("mallet_noun_adj_models.pkl", "rb"))

# Sentiment Analysis (Vader & TextBlob)

### Common methods between both SA Models


In [25]:
###@@@@@@@@@ THIS IS TO ***SAVE*** THE SENTENCES DF FROM ABOVE @@@@@@@@@###
#pickle.dump(sentences_df, open("sentences_df.pkl", "wb"))

###@@@@@@@@@ THIS IS TO ***OPEN*** THE SENTENCES DF FROM THE STORED PICKLE FILE @@@@@@@@@###
### CHECKPOINT - can just load the pickle file and start running your analysis
#sentences_df = pickle.load(open("sentences_df.pkl", "rb"))

In [26]:
#Takes in the sentence column from sentence_df, returns a list of polarity scores.
#Arg input 0 for Vader, 1 for TextBlob

def sa_score(name_df,arg_num):
    scores =[]
    
    if arg_num == 0:
        try:
            for sentence_value in name_df['sentence']:
                score = analyser.polarity_scores(sentence_value)
                scores.append(score)
        except Exception as e:
            print(str(e))
    else:        
        for sentence_value in name_df['sentence']:
            tb = TextBlob(sentence_value)
            scores.append(tb.sentiment.polarity)
    
    return scores
    

In [27]:
def normalised(df_values):

    x_2 = df_values #returns a numpy array
    min_max_scaler = preprocessing.MinMaxScaler()
    x_2_scaled = min_max_scaler.fit_transform(x_2)
    return pd.DataFrame(x_2_scaled)

In [28]:
def discrepancy_score(arg_list):
    return (len(arg_list)-sum(arg_list))/len(arg_list)

In [29]:
sentences_df_sa = sentences_df.copy(deep=True)
#Drop all columns except sentence & index
sentences_df_sa.drop(sentences_df_sa.columns.difference(['sentence','index']), 1, inplace=True)

# VADER

In [30]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyser = SentimentIntensityAnalyzer()

In [31]:
vader_scores = sa_score(sentences_df_sa,0)
vader_sentencesScores = pd.concat([sentences_df_sa,pd.DataFrame(vader_scores)],axis =1, sort = False)
vader_sentencesScores['index scores'] = index_score_list
vader_sentencesScores

Unnamed: 0,sentence,index,neg,neu,pos,compound,index scores
0,As a diehard Samsung fan who has had every Sam...,0,0.000,0.813,0.187,0.6486,10.0
1,I am amazed at some of the reviews and think p...,0,0.111,0.766,0.123,0.0772,10.0
2,The battery life is amazing,0,0.000,0.513,0.487,0.5859,10.0
3,Love the phone,1,0.000,0.323,0.677,0.6369,10.0
4,the phone is sleek and smooth and beautiful I ...,1,0.000,0.653,0.347,0.8412,10.0
...,...,...,...,...,...,...,...
1025934,Came right on timr,400060,0.000,1.000,0.000,0.0000,10.0
1025935,Excelente producto,400061,0.000,1.000,0.000,0.0000,10.0
1025936,"Older Android version, too many junky apps and...",400062,0.000,1.000,0.000,0.0000,4.0
1025937,I had one die and returned it to my cell provi...,400062,0.183,0.599,0.219,0.0516,4.0


In [32]:
vader_sentencesScores_sample = vader_sentencesScores.sample(n = 80, replace = False, random_state = 2)
vader_sentencesScores_sample.to_csv(r'vader_sentencesScores_sample.csv', index = False)

In [38]:
###@@@@@@@@@ THIS IS TO ***SAVE*** THE VADER DF FROM ABOVE @@@@@@@@@###
pickle.dump(vader_sentencesScores, open("VaderSentencesScores.pkl", "wb"))

###@@@@@@@@@ THIS IS TO ***OPEN*** THE VADER DF FROM THE STORED PICKLE FILE @@@@@@@@@###
### CHECKPOINT - can just load the pickle file and start running your analysis
#vader_sentencesScores = pickle.load(open("VaderSentencesScores.pkl", "rb"))

## TextBlob

In [39]:
textblob_sentences = sa_score(sentences_df_sa,1)
textblob_sentencesScores= pd.DataFrame(textblob_sentences)

In [40]:
textblob_sentencesScores.rename(columns={0: 'compound'}, inplace=True)
textblob_sentencesScores

Unnamed: 0,compound
0,0.300000
1,0.000000
2,0.600000
3,0.500000
4,0.470000
...,...
1025934,0.285714
1025935,0.000000
1025936,0.222222
1025937,0.300000


In [43]:
###@@@@@@@@@ THIS IS TO ***SAVE*** THE TextBlob DF FROM ABOVE @@@@@@@@@###
pickle.dump(textblob_sentencesScores, open("TextBlobSentencesScores.pkl", "wb"))

###@@@@@@@@@ THIS IS TO ***OPEN*** THE TextBlob DF FROM THE STORED PICKLE FILE @@@@@@@@@###
### CHECKPOINT - can just load the pickle file and start running your analysis
#textblob_sentencesScores = pickle.load(open("TextBlobSentencesScores.pkl", "rb"))

In [44]:
textblob_sentencesScores.head(10)

Unnamed: 0,compound
0,0.3
1,0.0
2,0.6
3,0.5
4,0.47
5,0.333333
6,0.6
7,-0.2
8,0.230556
9,0.136364


## Discrepancy between Extract Review Score and SA Sentence score

In [45]:
df_normalised_scores = pd.DataFrame()

In [47]:
normalised_scores_title_list=["Vader Compound","Vader Index Score","TextBlob Compound","TextBlob Index Score"]

df_normalised_scores["Vader Compound"] = vader_sentencesScores["compound"]
df_normalised_scores["TextBlob Compound"] = textblob_sentencesScores["compound"]
df_normalised_scores["Index Scores"] = vader_sentencesScores["index scores"]

In [48]:
df_normalised_scores.head()

Unnamed: 0,Vader Compound,TextBlob Compound,Index Scores
0,0.6486,0.3,10.0
1,0.0772,0.0,10.0
2,0.5859,0.6,10.0
3,0.6369,0.5,10.0
4,0.8412,0.47,10.0


In [49]:
df_normalised_scores=normalised(df_normalised_scores.values)

In [50]:
df_normalised_scores.rename(columns={0:'Vader Compound',1:'TextBlob Compound',2:'Index Scores'}, inplace=True)

In [51]:
df_normalised_scores.head()

Unnamed: 0,Vader Compound,TextBlob Compound,Index Scores
0,0.825723,0.65,1.0
1,0.539321,0.5,1.0
2,0.794296,0.8,1.0
3,0.819859,0.75,1.0
4,0.92226,0.735,1.0


In [52]:
vader_normalised_score = []
tb_normalised_score = []
comb_normalised_score = []
comb_normalised_col = []

for index, row in df_normalised_scores.iterrows():
    vader = row['Vader Compound']
    tb = row['TextBlob Compound']
    comb = (vader+tb)/2
    comb_normalised_col.append(comb)
    index =row['Index Scores']
    
    vader = round(vader,2)
    tb = round(tb,2)
    comb = round(comb,2)
    
    if (vader>=(2/3) and index>=(2/3)):
        vader_normalised_score.append(0)
    elif((vader<(2/3) and vader>=(1/3)) and (index<(2/3) and index>=(1/3))):
        vader_normalised_score.append(0)
    elif(vader<(1/3) and index<(1/3)):
        vader_normalised_score.append(0)
    else:vader_normalised_score.append(1)
        
    if (tb>=(2/3) and index>=(2/3)):
        tb_normalised_score.append(0)
    elif((tb<(2/3) and tb>=(1/3)) and (index<(2/3) and index>=(1/3))):
        tb_normalised_score.append(0)
    elif(tb<(1/3) and index<(1/3)):
        tb_normalised_score.append(0)
    else:tb_normalised_score.append(1)
        
    if (comb>=(2/3) and index>=(2/3)):
        comb_normalised_score.append(0)
    elif((comb<(2/3) and comb>=(1/3)) and (index<(2/3) and index>=(1/3))):
        comb_normalised_score.append(0)
    elif(comb<(1/3) and index<(1/3)):
        comb_normalised_score.append(0)
    else:comb_normalised_score.append(1)
    

In [53]:
#Vader Accuracy 
vader_accuracy = discrepancy_score(vader_normalised_score)
print(vader_accuracy)

0.534153590028257


In [54]:
#Textblob Accuracy 
tb_accuracy = discrepancy_score(tb_normalised_score)
print(tb_accuracy)

0.45430186395097566


In [55]:
#Comb Accuracy 
comb_accuracy = discrepancy_score(comb_normalised_score)
print(comb_accuracy)

0.4877902097493126


In [56]:
df_normalised_scores.insert(loc=2, column='Mean Compound', value=comb_normalised_col)

In [57]:
df_normalised_scores.insert(loc=4, column='Combined normalised score', value=comb_normalised_score)

In [59]:
df_normalised_scores["sentence"]=vader_sentencesScores["sentence"]

In [60]:
df_normalised_scores.head(100)
#Combined normalised score of 0 indicates an accurate sentiment

Unnamed: 0,Vader Compound,TextBlob Compound,Mean Compound,Index Scores,Combined normalised score,sentence
0,0.825723,0.650000,0.737862,1.000000,0,As a diehard Samsung fan who has had every Sam...
1,0.539321,0.500000,0.519661,1.000000,1,I am amazed at some of the reviews and think p...
2,0.794296,0.800000,0.797148,1.000000,0,The battery life is amazing
3,0.819859,0.750000,0.784929,1.000000,0,Love the phone
4,0.922260,0.735000,0.828630,1.000000,0,the phone is sleek and smooth and beautiful I ...
...,...,...,...,...,...,...
95,0.362538,0.500000,0.431269,0.111111,1,I do not like this phone
96,0.500627,0.500000,0.500313,0.111111,1,I had it in my pocket with something else and ...
97,0.500627,0.500000,0.500313,0.111111,1,Replaced it and it has another crack in it
98,0.500627,0.500000,0.500313,0.111111,1,They need to do something different


In [None]:
#df_normalised_scores.to_csv(r'df_normalised_scores.csv', index = False)

In [None]:
#df_normalised_scores=pd.read_csv('df_normalised_scores.csv')  

In [61]:
###@@@@@@@@@ THIS IS TO ***SAVE*** THE normalised DF FROM ABOVE @@@@@@@@@###
pickle.dump(df_normalised_scores, open("df_normalised_scores.pkl", "wb"))

###@@@@@@@@@ THIS IS TO ***OPEN*** THE normalised DF FROM THE STORED PICKLE FILE @@@@@@@@@###
### CHECKPOINT - can just load the pickle file and start running your analysis
#df_normalised_scores = pickle.load(open("df_normalised_scores.pkl", "rb"))#

In [62]:
sa_sample = df_normalised_scores.sample(n = 80, replace = False, random_state = 42)
sa_sample.to_csv(r'sa_sample.csv', index = False)

In [63]:
df_normalised_scores

Unnamed: 0,Vader Compound,TextBlob Compound,Mean Compound,Index Scores,Combined normalised score,sentence
0,0.825723,0.650000,0.737862,1.000000,0,As a diehard Samsung fan who has had every Sam...
1,0.539321,0.500000,0.519661,1.000000,1,I am amazed at some of the reviews and think p...
2,0.794296,0.800000,0.797148,1.000000,0,The battery life is amazing
3,0.819859,0.750000,0.784929,1.000000,0,Love the phone
4,0.922260,0.735000,0.828630,1.000000,0,the phone is sleek and smooth and beautiful I ...
...,...,...,...,...,...,...
1025934,0.500627,0.642857,0.571742,1.000000,1,Came right on timr
1025935,0.500627,0.500000,0.500313,1.000000,1,Excelente producto
1025936,0.500627,0.611111,0.555869,0.333333,0,"Older Android version, too many junky apps and..."
1025937,0.526490,0.650000,0.588245,0.333333,0,I had one die and returned it to my cell provi...


## Drop all Neutrals

In [88]:
vader_no_neutrals = df_normalised_scores.copy(deep = True)
vader_no_neutrals.head()

Unnamed: 0,Vader Compound,TextBlob Compound,Mean Compound,Index Scores,Combined normalised score,sentence
0,0.825723,0.65,0.737862,1.0,0,As a diehard Samsung fan who has had every Sam...
1,0.539321,0.5,0.519661,1.0,1,I am amazed at some of the reviews and think p...
2,0.794296,0.8,0.797148,1.0,0,The battery life is amazing
3,0.819859,0.75,0.784929,1.0,0,Love the phone
4,0.92226,0.735,0.82863,1.0,0,the phone is sleek and smooth and beautiful I ...


In [89]:
vader_no_neutrals.drop(["TextBlob Compound","Mean Compound","Index Scores","Combined normalised score"],axis = 1, inplace = True)

In [86]:
(vader_no_neutrals['Vader Compound'] > 2/3)

0           True
1          False
2           True
3           True
4           True
           ...  
1025934    False
1025935    False
1025936    False
1025937    False
1025938    False
Name: Vader Compound, Length: 1025939, dtype: bool

In [90]:
vader_no_neutrals= vader_no_neutrals[(vader_no_neutrals['Vader Compound'] >= 2/3) & (1/3 >= vader_no_neutrals['Vader Compound'])]

vader_no_neutrals.head(10)

Unnamed: 0,Vader Compound,sentence


In [None]:
tb_no_neutrals = 

In [None]:
combined_no_neutrals = 

# EVERYTHING BELOW ARE VERSION 1 CODE

# LDA Model

In [None]:
# Preprocessing of extracts into words

import nltk
import re
import gensim

def clean_extract(extract_data):
    stop_list = nltk.corpus.stopwords.words('english')
    stop_list += ['phone', 'iphone', 'nokia', 'samsung', 'htc', 'lg', 'galaxy', 'ca', 'motorola', 'android', 'verizon']
    tokenizer = nltk.tokenize.word_tokenize
    stemmer = nltk.stem.porter.PorterStemmer()

    extract_list  = [tokenizer(extract) for extract in extract_data]
    extract_list = [[w.lower() for w in extract] for extract in extract_list]
    extract_list = [[w for w in extract if re.search('^[a-z]+$',w)] for extract in extract_list]
    extract_list = [[w for w in extract if w not in stop_list] for extract in extract_list]
#     extract_list = [[stemmer.stem(w) for w in extract] for extract in extract_list]
    return extract_list

In [None]:
extract_list = cleaned_df['extract'].values.tolist()
extract_words = clean_extract(extract_list)

## Gensim LDA Model

In [None]:
import gensim

# Using gensim to create a dictionary object of all the words in all extracts
extract_dict = gensim.corpora.Dictionary(extract_words)

# Returning the word vector for each extract from the gensim dict of words
extract_vecs = [extract_dict.doc2bow(extract) for extract in extract_words]

In [None]:
# Finding the optimal number of topics
from gensim.models import CoherenceModel

model_list = []
coh_val = []
model_topics = []

for num_topic in range(3, 10):
    extract_lda_gensim = gensim.models.ldamodel.LdaModel(corpus=extract_vecs, id2word=extract_dict, num_topics=num_topic)
    coh_model = CoherenceModel(model=extract_lda_gensim, texts=extract_words, dictionary=extract_dict, coherence='c_v')
    model_topics.append(num_topic)
    model_list.append(extract_lda_gensim)
    coh_val.append(coh_model.get_coherence())
    print("Topic " + str(num_topic) + " Score: " + str(coh_model.get_coherence()))

In [None]:
import matplotlib.pyplot as plt

x = range(3, 10)
plt.plot(x, coh_val)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Gensim LDA Model - topics
val = 0
model_num = 3
for model in model_list:
    print("Model #{} Coherence Score: {}".format(model_num, coh_val[val]))
    topics = model.show_topics(formatted=False)
    for topic, word_list in topics:
        topic_num = topic + 1
        result_list = []
        for word, word_prob in word_list:
            result_list.append(word)
        print("Topic {}: {}".format(topic_num, ', '.join(result_list)))
    model_num += 1
    val += 1
    print('----------------------------------------------------------------------------')

In [None]:
import pickle 

###@@@@@@@@@ THIS IS TO ***SAVE*** THE GENSIM MODELS FROM ABOVE @@@@@@@@@###
pickle.dump(model_list, open("gensim_models.pkl", "wb"))

###@@@@@@@@@ THIS IS TO ***OPEN*** THE GENSIM MODELS FROM THE STORED PICKLE FILE @@@@@@@@@###
### CHECKPOINT - can just load the pickle file and start running your analysis
# gensim_models = pickle.load(open("gensim_models.pkl", "rb"))

## Mallet LDA Model

In [None]:
import gensim
import os 

os.environ.update({'MALLET_HOME':r'D:\\Softwares\\mallet-2.0.8'})
mallet_path = r'D:\\Softwares\\mallet-2.0.8\\bin\\mallet'

model_list_mallet = []
coh_val_mallet = []

for num_topic in range(3, 10):
    extract_lda_mallet = gensim.models.wrappers.LdaMallet(mallet_path, corpus=extract_vecs, id2word=extract_dict, num_topics=num_topic)
    coh_model_mallet = CoherenceModel(model=extract_lda_mallet, texts=extract_words, dictionary=extract_dict, coherence='c_v')
    model_list_mallet.append(extract_lda_mallet)
    coh_val_mallet.append(coh_model_mallet.get_coherence())
    print("Topic " + str(num_topic) + " Score: " + str(coh_model_mallet.get_coherence()))

In [None]:
import matplotlib.pyplot as plt

x = range(3, 10)
plt.plot(x, coh_val_mallet)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Mallet LDA Model - topics
val = 0
mallet_model_num = 3
for model in model_list_mallet:
    print("Model #{} Coherence Score: {}".format(mallet_model_num, coh_val_mallet[val]))
    topics = model.show_topics(formatted=False)
    for topic, word_list in topics:
        topic_num = topic + 1
        result_list = []
        for word, word_prob in word_list:
            result_list.append(word)
        print("Topic {}: {}".format(topic_num, ', '.join(result_list)))
    mallet_model_num += 1
    val += 1
    print('----------------------------------------------------------------------------')

In [None]:
import pickle 

###@@@@@@@@@ THIS IS TO ***SAVE*** THE MALLET MODELS FROM ABOVE @@@@@@@@@###
pickle.dump(model_list_mallet, open("mallet_models.pkl", "wb"))

###@@@@@@@@@ THIS IS TO ***OPEN*** THE GENSIM MODELS FROM THE STORED PICKLE FILE @@@@@@@@@###
### CHECKPOINT - can just load the pickle file and start running your analysis
# mallet_models = pickle.load(open("mallet_models.pkl", "rb"))

In [None]:
## Another Gensim run with more words taken out from the stop list 
import nltk
import re
import gensim

def clean_extract2(extract_data):
    stop_list = nltk.corpus.stopwords.words('english')
    stop_list += ['phone', 'iphone', 'nokia', 'samsung', 'htc', 'lg', 'galaxy', 'ca', '']
    tokenizer = nltk.tokenize.word_tokenize
    stemmer = nltk.stem.porter.PorterStemmer()

    extract_list  = [tokenizer(extract) for extract in extract_data]
    extract_list = [[w.lower() for w in extract] for extract in extract_list]
    extract_list = [[w for w in extract if re.search('^[a-z]+$',w)] for extract in extract_list]
    extract_list = [[w for w in extract if w not in stop_list] for extract in extract_list]
#     extract_list = [[stemmer.stem(w) for w in extract] for extract in extract_list]
    return extract_list

In [None]:
extract_list2 = cleaned_df['extract'].values.tolist()
extract_words2 = clean_extract2(extract_list2)

In [None]:
import gensim

# Using gensim to create a dictionary object of all the words in all extracts
extract_dict2 = gensim.corpora.Dictionary(extract_words2)

# Returning the word vector for each extract from the gensim dict of words
extract_vecs2 = [extract_dict2.doc2bow(extract) for extract in extract_words2]

In [None]:
# Finding the optimal number of topics
from gensim.models import CoherenceModel

model_list2 = []
coh_val2 = []
model_topics2 = []

for num_topic in range(3, 10):
    extract_lda_gensim2 = gensim.models.ldamodel.LdaModel(corpus=extract_vecs2, id2word=extract_dict2, num_topics=num_topic)
    coh_model2 = CoherenceModel(model=extract_lda_gensim2, texts=extract_words2, dictionary=extract_dict2, coherence='c_v')
    model_topics2.append(num_topic)
    model_list2.append(extract_lda_gensim2)
    coh_val2.append(coh_model2.get_coherence())
    print("Topic " + str(num_topic) + " Score: " + str(coh_model2.get_coherence()))

In [None]:
import matplotlib.pyplot as plt

x = range(3, 10)
plt.plot(x, coh_val2)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# 2nd Gensim LDA Model - topics
val2 = 0
model_num2 = 3
for model in model_list2:
    print("Model #{} Coherence Score: {}".format(model_num2, coh_val2[val2]))
    topics = model.show_topics(formatted=False)
    for topic, word_list in topics:
        topic_num = topic + 1
        result_list2 = []
        for word, word_prob in word_list:
            result_list2.append(word)
        print("Topic {}: {}".format(topic_num, ', '.join(result_list2)))
    model_num2 += 1
    val2 += 1
    print('----------------------------------------------------------------------------')

In [None]:
import pickle 

###@@@@@@@@@ THIS IS TO ***SAVE*** THE MALLET MODELS FROM ABOVE @@@@@@@@@###
pickle.dump(model_list2, open("gensim_models2.pkl", "wb"))

###@@@@@@@@@ THIS IS TO ***OPEN*** THE GENSIM MODELS FROM THE STORED PICKLE FILE @@@@@@@@@###
### CHECKPOINT - can just load the pickle file and start running your analysis
# mallet_models = pickle.load(open("mallet_models.pkl", "rb"))

In [None]:
# 2nd Mallet LDA Model
import gensim
import os 

os.environ.update({'MALLET_HOME':r'D:\\Softwares\\mallet-2.0.8'})
mallet_path = r'D:\\Softwares\\mallet-2.0.8\\bin\\mallet'

model_list_mallet2 = []
coh_val_mallet2 = []

for num_topic in range(3, 10):
    extract_lda_mallet2 = gensim.models.wrappers.LdaMallet(mallet_path, corpus=extract_vecs2, id2word=extract_dict2, num_topics=num_topic)
    coh_model_mallet2 = CoherenceModel(model=extract_lda_mallet2, texts=extract_words2, dictionary=extract_dict2, coherence='c_v')
    model_list_mallet2.append(extract_lda_mallet2)
    coh_val_mallet2.append(coh_model_mallet2.get_coherence())
    print("Topic " + str(num_topic) + " Score: " + str(coh_model_mallet2.get_coherence()))

In [None]:
import matplotlib.pyplot as plt

x = range(3, 10)
plt.plot(x, coh_val_mallet2)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc='best')
plt.show()

In [None]:
# Mallet LDA Model - topics
val = 0
mallet_model_num = 3
for model in model_list_mallet2:
    print("Model #{} Coherence Score: {}".format(mallet_model_num, coh_val_mallet2[val]))
    topics = model.show_topics(formatted=False)
    for topic, word_list in topics:
        topic_num = topic + 1
        result_list = []
        for word, word_prob in word_list:
            result_list.append(word)
        print("Topic {}: {}".format(topic_num, ', '.join(result_list)))
    mallet_model_num += 1
    val += 1
    print('----------------------------------------------------------------------------')

In [None]:
import pickle 

###@@@@@@@@@ THIS IS TO ***SAVE*** THE MALLET MODELS FROM ABOVE @@@@@@@@@###
pickle.dump(model_list_mallet2, open("mallet_models2.pkl", "wb"))

###@@@@@@@@@ THIS IS TO ***OPEN*** THE GENSIM MODELS FROM THE STORED PICKLE FILE @@@@@@@@@###
### CHECKPOINT - can just load the pickle file and start running your analysis
# mallet_models = pickle.load(open("mallet_models.pkl", "rb"))

In [None]:
## Another Gensim run with more words taken out from the stop list 
import nltk
import re
import gensim

def clean_extract2(extract_data):
    stop_list = nltk.corpus.stopwords.words('english')
    stop_list += ['phone', 'iphone', 'nokia', 'samsung', 'htc', 'lg', 'galaxy', 'ca', '']
    tokenizer = nltk.tokenize.word_tokenize
    stemmer = nltk.stem.porter.PorterStemmer()

    extract_list  = [tokenizer(extract) for extract in extract_data]
    extract_list = [[w.lower() for w in extract] for extract in extract_list]
    extract_list = [[w for w in extract if re.search('^[a-z]+$',w)] for extract in extract_list]
    extract_list = [[w for w in extract if w not in stop_list] for extract in extract_list]
#     extract_list = [[stemmer.stem(w) for w in extract] for extract in extract_list]
    return extract_list

In [None]:
import nltk
import re

new_query = 'Good phone screen good battery life'
tokenizer = nltk.tokenize.word_tokenize
cleaned_query = tokenizer(new_query)
cleaned_query = [w.lower() for w in cleaned_query]
cleaned_query = [w for w in cleaned_query if re.search('^[a-z]+$',w)]

In [None]:
import gensim

# Returning the word vector for each extract from the gensim dict of words
query_vecs = extract_dict2.doc2bow(cleaned_query)

In [None]:
output = list(model_list_mallet2[4][query_vecs])

print(output)