In [1]:
import pandas as pd

# Read the dataset CSV from the working directory into a DataFrame.
robot_data = pd.read_csv(filepath_or_buffer='robot_dataset.csv')
# Print a summary of the columns: names, non-null counts and dtypes.
robot_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 493 entries, 0 to 492
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Unnamed: 0             493 non-null    int64  
 1   adult                  493 non-null    bool   
 2   backdrop_path          344 non-null    object 
 3   belongs_to_collection  98 non-null     object 
 4   budget                 493 non-null    int64  
 5   genres                 493 non-null    object 
 6   homepage               148 non-null    object 
 7   id                     493 non-null    int64  
 8   imdb_id                453 non-null    object 
 9   original_language      493 non-null    object 
 10  original_title         493 non-null    object 
 11  overview               488 non-null    object 
 12  popularity             493 non-null    float64
 13  poster_path            447 non-null    object 
 14  production_companies   493 non-null    object 
 15  produc

In [2]:
# Keep only the free-text fields and discard every row missing any of them.
text_columns = ['title', 'tagline', 'overview']
text_data = robot_data.loc[:, text_columns].dropna(axis=0, how='any')
text_data.head()

Unnamed: 0,title,tagline,overview
0,Finch,Take an unforgettable journey.,"On a post-apocalyptic Earth, a robot, built to..."
1,Godzilla vs. Kong,One Will Fall,"In a time when monsters walk the Earth, humani..."
2,Pacific Rim,"To Fight Monsters, We Created Monsters",A ragtag band of humans band together in the y...
3,Real Steel,"If you get one shot, make it real.",Charlie Kenton is a washed-up fighter who reti...
4,Outside the Wire,Defiant by design.,"In the near future, a drone pilot is sent into..."


In [3]:
import string
import re

# Regex matching any whole word that contains the search keyword
# ("robot", "robots", "robotic", ...). Raw string so that `\w` is not treated
# as an (invalid) string escape.
remove_string = r'\w*robot\w*'
# Clean the overviews in one vectorised pass:
#   1. lowercase FIRST, so capitalised variants ("Robot", "ROBOTS") are also
#      caught by the keyword pattern instead of leaking into the corpus;
#   2. drop the original search keyword;
#   3. strip basic punctuation.
text_data['overview_processed'] = (
    text_data['overview']
    .str.lower()
    .str.replace(remove_string, '', regex=True)
    .str.replace(r'[,\.!?]', '', regex=True)
)
# Show the first cleaned overviews
text_data['overview_processed'].head()

0    on a post-apocalyptic earth a  built to protec...
1    in a time when monsters walk the earth humanit...
2    a ragtag band of humans band together in the y...
3    charlie kenton is a washed-up fighter who reti...
4    in the near future a drone pilot is sent into ...
Name: overview_processed, dtype: object

In [4]:
import gensim
from gensim.utils import simple_preprocess
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
# NLTK's English stop-word list, followed by a few extra tokens to filter out.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use']

def sent_to_words(sentences):
    """Lazily tokenise each item of ``sentences``.

    Every item is coerced with ``str()`` and passed to gensim's
    ``simple_preprocess`` with ``deacc=True`` (accent marks are stripped).
    Yields one list of tokens per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(text), deacc=True)
        for text in sentences
    )
def remove_stopwords(texts):
    """Drop stop words from every document in ``texts``.

    Args:
        texts: iterable of documents. A document that is already a list or
            tuple of tokens (as produced by ``sent_to_words``) is filtered
            directly; anything else is coerced with ``str()`` and tokenised
            with ``simple_preprocess`` first.

    Returns:
        A list holding, for each document in input order, the list of tokens
        that are not in the module-level ``stop_words``.
    """
    # Set membership is O(1); ``stop_words`` itself is a plain list.
    stop_set = set(stop_words)

    def tokens_of(doc):
        # Token sequences are used as-is rather than being turned into the
        # string repr of a list and tokenised a second time.
        if isinstance(doc, (list, tuple)):
            return doc
        return simple_preprocess(str(doc))

    return [[word for word in tokens_of(doc) if word not in stop_set]
            for doc in texts]

# Tokenise every cleaned overview, then strip stop words from each token list.
data = text_data['overview_processed'].tolist()
data_words = remove_stopwords(list(sent_to_words(data)))
# Preview at most the first 30 tokens of the first document.
print(data_words[0][:30])

['post', 'apocalyptic', 'earth', 'built', 'protect', 'life', 'dying', 'creator', 'beloved', 'dog', 'learns', 'life', 'love', 'friendship', 'means', 'human']


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\marti\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [5]:
import gensim.corpora as corpora
# Build the token <-> integer id dictionary from the tokenised documents.
id2word = corpora.Dictionary(data_words)
# The tokenised documents are the texts of the corpus.
texts = data_words
# Bag-of-words encoding: one list of (token_id, count) pairs per document.
corpus = list(map(id2word.doc2bow, texts))
# Preview at most the first 30 pairs of the first document.
print(corpus[0][:30])

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 2), (11, 1), (12, 1), (13, 1), (14, 1)]


In [6]:
from pprint import pprint
# Number of topics the LDA model should extract.
num_topics = 10
# Build the LDA model. LDA training is stochastic: without a fixed seed the
# topics differ on every kernel restart, so pin random_state to make reruns
# as repeatable as possible.
lda_model = gensim.models.LdaMulticore(corpus=corpus,
                                       id2word=id2word,
                                       num_topics=num_topics,
                                       random_state=42)
# Print the top keywords (with their weights) for each topic.
pprint(lda_model.print_topics())
# Apply the trained model to the corpus (per-document topic distributions).
doc_lda = lda_model[corpus]

[(0,
  '0.010*"world" + 0.007*"must" + 0.004*"city" + 0.004*"day" + 0.004*"future" '
  '+ 0.004*"cyborg" + 0.004*"war" + 0.004*"secret" + 0.004*"first" + '
  '0.003*"one"'),
 (1,
  '0.006*"must" + 0.005*"new" + 0.005*"kung" + 0.004*"named" + 0.004*"future" '
  '+ 0.004*"satan" + 0.004*"world" + 0.004*"human" + 0.004*"one" + '
  '0.004*"chitti"'),
 (2,
  '0.008*"earth" + 0.008*"space" + 0.007*"must" + 0.004*"godzilla" + '
  '0.004*"one" + 0.004*"life" + 0.004*"future" + 0.003*"unit" + 0.003*"new" + '
  '0.003*"powerful"'),
 (3,
  '0.008*"life" + 0.008*"one" + 0.008*"earth" + 0.007*"find" + 0.005*"quaid" + '
  '0.004*"human" + 0.004*"police" + 0.004*"world" + 0.004*"planet" + '
  '0.004*"future"'),
 (4,
  '0.006*"save" + 0.005*"earth" + 0.004*"stop" + 0.004*"series" + '
  '0.004*"world" + 0.004*"discovers" + 0.004*"planet" + 0.004*"must" + '
  '0.004*"becomes" + 0.004*"fight"'),
 (5,
  '0.007*"earth" + 0.006*"human" + 0.005*"time" + 0.005*"one" + 0.005*"big" + '
  '0.004*"last" + 0.004*"

In [7]:
import pyLDAvis.gensim_models
import pickle
import pyLDAvis
import os
# Visualize the topics
pyLDAvis.enable_notebook()
LDAvis_data_filepath = './ldavis_prepared_' + str(num_topics)
# Preparing the visualisation is a bit time consuming. Leave this flag True to
# compute it here; set it to False to reuse the pickle already on disk.
recompute_ldavis = True
if recompute_ldavis:
    LDAvis_prepared = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
    with open(LDAvis_data_filepath, 'wb') as out_file:
        pickle.dump(LDAvis_prepared, out_file)
# Load the prepared pyLDAvis data back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files written by
# this notebook.
with open(LDAvis_data_filepath, 'rb') as in_file:
    LDAvis_prepared = pickle.load(in_file)
# Also export a standalone HTML copy next to the pickle.
pyLDAvis.save_html(LDAvis_prepared, LDAvis_data_filepath + '.html')
LDAvis_prepared

  default_term_info = default_term_info.sort_values(
