In [1]:
import nltk; nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mustafatelab/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#1introduction

In [3]:
import re
import numpy as np
import pandas as pd
from pprint import pprint

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# spacy for lemmatization
import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

# Enable logging for gensim - optional
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

In [4]:
# NLTK Stop words
stop_word_supplement = ['vacation','co','https','thank','head','travel','needed_wash','destination','visit','stay','from', 'subject', 're', 'edu', 'use']
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(stop_word_supplement)

In [5]:
df = pd.read_csv('trip_data_select')
data = df.text.values.tolist()

In [46]:
def create_corpus(data):
    # Remove Emails
    data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

    # Remove new line characters
    data = [re.sub('\s+', ' ', sent) for sent in data]

    # Remove distracting single quotes
    data = [re.sub("\'", "", sent) for sent in data]
    
    data_words = list(sent_to_words(data))
    
    # Build the bigram and trigram models
    bigram = gensim.models.Phrases(data_words, min_count=5, threshold=25) # higher threshold fewer phrases.
    trigram = gensim.models.Phrases(bigram[data_words], threshold=25)  

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)
    # Remove Stop Words
    data_words_nostops = remove_stopwords(data_words)

    # Form Bigrams
    data_words_bigrams = make_bigrams(data_words_nostops)

    # Initialize spacy 'en' model, keeping only tagger component (for efficiency)
    # python3 -m spacy download en
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    # Do lemmatization keeping only noun, adj, vb, adv
    data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)

    # Create Corpus
    texts = data_lemmatized

    # Term Document Frequency
    corpus = [id2word.doc2bow(text) for text in texts]
    return corpus

In [6]:
# Convert to list


# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove new line characters
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

pprint(data[:1])

['The Most Romantic Destinations for Couples https://t.co/BxQbZn74NY']


In [7]:
def sent_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

data_words = list(sent_to_words(data))
print(data_words[:1])

[['the', 'most', 'romantic', 'destinations', 'for', 'couples', 'https', 'co', 'bxqbzn', 'ny']]


In [8]:
# Build the bigram and trigram models
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=25) # higher threshold fewer phrases.
trigram = gensim.models.Phrases(bigram[data_words], threshold=25)  

# Faster way to get a sentence clubbed as a trigram/bigram
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# See trigram example
print(trigram_mod[bigram_mod[data_words[0]]])

['the', 'most', 'romantic', 'destinations', 'for', 'couples', 'https', 'co', 'bxqbzn', 'ny']


In [9]:
# Define functions for stopwords, bigrams, trigrams and lemmatization
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [10]:
# Remove Stop Words
data_words_nostops = remove_stopwords(data_words)

# Form Bigrams
data_words_bigrams = make_bigrams(data_words_nostops)

# Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
data_lemmatized = lemmatization(data_words_bigrams, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(data_lemmatized[:1])

[['romantic', 'destination', 'couple', 'bxqbzn', 'ny']]


In [11]:
# Create Dictionary
id2word = corpora.Dictionary(data_lemmatized)

# Create Corpus
texts = data_lemmatized

# Term Document Frequency
corpus = [id2word.doc2bow(text) for text in texts]

# View
print(corpus[:1])

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)]]


In [12]:
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('bxqbzn', 1),
  ('couple', 1),
  ('destination', 1),
  ('ny', 1),
  ('romantic', 1)]]

In [13]:
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=5, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [14]:
# Print the Keyword in the 10 topics
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.036*"tourism" + 0.034*"holidays_resort" + 0.034*"nft_nft" + 0.029*"beach" '
  '+ 0.025*"sea_airport" + 0.012*"flight" + 0.012*"take" + '
  '0.011*"wingo_cruise" + 0.011*"enjoy" + 0.010*"greece_holiday"'),
 (1,
  '0.024*"experience" + 0.017*"caribbean" + 0.010*"craftbeer" + 0.009*"happy" '
  '+ 0.008*"airline_flight" + 0.008*"time" + 0.007*"get" + 0.007*"drink" + '
  '0.007*"visit" + 0.007*"join"'),
 (2,
  '0.036*"day" + 0.024*"family" + 0.016*"time" + 0.016*"good" + 0.013*"go" + '
  '0.013*"work" + 0.009*"hotel" + 0.008*"want" + 0.008*"jazz" + 0.008*"get"'),
 (3,
  '0.023*"photography" + 0.015*"adventure" + 0.013*"island" + 0.012*"tt_ttot" '
  '+ 0.010*"explore_travele" + 0.009*"today" + 0.009*"sunset" + 0.009*"resort" '
  '+ 0.008*"life" + 0.008*"info"'),
 (4,
  '0.067*"holiday" + 0.023*"trip" + 0.020*"day" + 0.012*"view" + '
  '0.012*"beautiful" + 0.012*"photo" + 0.010*"book" + 0.009*"nature" + '
  '0.008*"private" + 0.008*"relax"')]


In [22]:
lda_model.save('lda_model')

In [15]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
#coherence_model_lda = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')
#coherence_lda = coherence_model_lda.get_coherence()
#print('\nCoherence Score: ', coherence_lda)


Perplexity:  -8.52042986169511


In [16]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis

  default_term_info = default_term_info.sort_values(


In [17]:
pyLDAvis.save_html(vis, 'twitter_topic_analysis.html')

## Recommendar System

In [80]:
data_by_loc = df.copy()
data_by_loc['text']= df.groupby('place_full_name')['text'].transform(lambda x: ','.join(x))
data_by_loc = data_by_loc[['text','place_full_name']].drop_duplicates().reset_index(drop=True)
data_by_loc.head()

Unnamed: 0,text,place_full_name
0,The Most Romantic Destinations for Couples ht...,"Washington, USA"
1,Bottlenose dolphins with @HoloHoloKauai https:...,Pacific Ocean
2,It's finally the day!! Off to Kauai!\nDan maki...,"Billings, MT"
3,Flying into NY!! Look at all the pretty lights...,"Queens, NY"
4,"What a way to wake up!,The making of the ice c...","Santa Ana, CA"


In [81]:
corpus_by_loc = create_corpus(data_by_loc.text)

In [82]:
corpus_by_loc_lda = lda_model[corpus_by_loc]

In [141]:
data_by_loc_lda = pd.DataFrame(corpus_by_loc_lda)

In [142]:
data_by_loc_lda

Unnamed: 0,0,1,2
0,"[(0, 0.040303513), (1, 0.28187028), (2, 0.1984...","[(0, [4, 1, 2, 0, 3]), (1, [1, 4, 2]), (2, [1]...","[(0, [(0, 0.020836521), (1, 0.30690393), (2, 0..."
1,"[(0, 0.07161718), (2, 0.07851961), (4, 0.83865...","[(57, [4]), (67, [4]), (68, [4]), (69, [4, 2])...","[(57, [(4, 0.9999505)]), (67, [(4, 0.9988905)]..."
2,"[(0, 0.012771142), (1, 0.012037741), (2, 0.011...","[(70, [4, 3]), (75, [4, 3]), (81, [3]), (82, [...","[(70, [(3, 0.23725185), (4, 0.6297683)]), (75,..."
3,"[(0, 0.114839934), (2, 0.05431315), (3, 0.1642...","[(0, [4, 3, 0, 2]), (3, [4, 3, 0, 2]), (18, [4...","[(0, [(0, 0.07500545), (2, 0.05247782), (3, 0...."
4,"[(0, 0.080583505), (1, 0.22797517), (2, 0.4385...","[(35, [4]), (75, [2, 4, 1, 3, 0]), (133, [2, 1...","[(35, [(4, 0.9984949)]), (75, [(0, 0.026714277..."
...,...,...,...
770,"[(0, 0.5502151), (2, 0.4319763)]","[(28, [0, 2]), (58, [2, 0]), (140, [2, 0]), (2...","[(28, [(0, 1.8791203), (2, 0.0933777)]), (58, ..."
771,"[(0, 0.021037983), (1, 0.019828629), (2, 0.914...","[(2497, [2]), (3035, [2]), (5305, [2])]","[(2497, [(2, 0.9724665)]), (3035, [(2, 2.71109..."
772,"[(0, 0.034875106), (1, 0.032871824), (2, 0.858...","[(104, [2])]","[(104, [(2, 2.609459)])]"
773,"[(0, 0.01359234), (1, 0.012812037), (2, 0.9448...","[(24, [2]), (70, [2])]","[(24, [(2, 3.9997163)]), (70, [(2, 3.5464509)])]"


In [143]:
data_by_loc_lda_df = pd.DataFrame.from_records(data_by_loc_lda[0].apply(lambda x : {val[0]:val[1] for val in x}))
data_by_loc_lda_df.columns = ['group_1','group_2','group_3','group_4','group_5']

In [144]:
data_by_loc_lda_df.idxmax(axis=1)

0      group_5
1      group_5
2      group_5
3      group_5
4      group_3
        ...   
770    group_1
771    group_3
772    group_3
773    group_3
774    group_3
Length: 775, dtype: object

In [145]:
data_by_loc['group']=data_by_loc_lda_df.idxmax(axis=1)

In [147]:
data_by_loc.to_csv('place_groups')