In [1]:
# Import Dependencies
import matplotlib.pyplot as plt
import requests as req
import pandas as pd
import numpy as np
import seaborn
import random
import datetime
import time
import json
import os

from marvel_keys import apikey, privateKey, marvel_char_list

In [2]:
csv_path = os.path.join('marvel_data.csv')

marvel_df = pd.read_csv(csv_path)

In [3]:
marvel_df['name'][:2500].head()

0         Spider-Man 
1    Captain America 
2          Wolverine 
3           Iron Man 
4               Thor 
Name: name, dtype: object

In [4]:
loop_test = ['Iron Man', 'Captain America', 'Thor', 'Black Widow', 'Hulk']

## API Request and Data Retrieval

In [5]:
import hashlib
import urllib.parse
import urllib.request

In [6]:
ts = str(round(time.time()))

hsh = hashlib.md5(bytes(ts+privateKey+apikey, 'utf-8')).hexdigest()

In [7]:
url = 'https://gateway.marvel.com:443/v1/public/characters'

#Initialize python dict containing data

marvel_data = []


for hero in marvel_char_list:
    
    search_data = {'id':[],
                'name':[],
               'description':[],
                'comics_avail':[],
                'comics_list':[],
                'series_avail':[],
                'series_list':[],
                'events_avail':[],
                'events_list':[]}
    
    #Set parameters for search
    params = {'ts': ts,
            'apikey': apikey,
              'hash': hsh,
             'name': hero,
             'limit': '100'}
    
    #Set up response request
    response = req.get(url, params=params).json()
    
    try:

        #Extract data initial data
        hero_data = response['data']['results'][0]
        search_data['id'].append(hero_data['id'])
        search_data['name'].append(hero_data['name'])
        search_data['description'].append(hero_data['description'])
        search_data['comics_avail'].append(hero_data['comics']['available'])
        search_data['series_avail'].append(hero_data['series']['available'])
        search_data['events_avail'].append(hero_data['events']['available'])

        #Set length of comics, series, and events to iterate over
        comics_length = len(hero_data['comics']['items'])
        series_length = len(hero_data['series']['items'])
        events_length = len(hero_data['events']['items'])

        #Iterate through items of comics, series and events to obtain names
        for n in range(comics_length):
            comics_list = hero_data['comics']['items'][n]
            search_data['comics_list'].append(comics_list['name'])

        for n in range(series_length):
            series_list = hero_data['series']['items'][n]
            search_data['series_list'].append(series_list['name'])

        for n in range(events_length):
            events_list = hero_data['series']['items'][n]
            search_data['events_list'].append(events_list['name'])
            
        marvel_data.append(search_data.copy())
        
    except IndexError:
        print('Missing or Invalid')
        print(f'Having Error with {hero}')

Missing or Invalid
Having Error with A-Bomb
Missing or Invalid
Having Error with Abomination
Missing or Invalid
Having Error with Aegis
Missing or Invalid
Having Error with Agent X
Missing or Invalid
Having Error with Air-Walker
Missing or Invalid
Having Error with American Eagle
Missing or Invalid
Having Error with Amphibian
Missing or Invalid
Having Error with Angel
Missing or Invalid
Having Error with Angela
Missing or Invalid
Having Error with Ant-Man
Missing or Invalid
Having Error with Araٌa
Missing or Invalid
Having Error with Armor
Missing or Invalid
Having Error with Atlas
Missing or Invalid
Having Error with Azazel
Missing or Invalid
Having Error with Baron Zemo
Missing or Invalid
Having Error with Beetle
Missing or Invalid
Having Error with Bengal
Missing or Invalid
Having Error with Black Knight
Missing or Invalid
Having Error with Black Widow/Natasha Romanoff
Missing or Invalid
Having Error with Bride of Nine Spiders
Missing or Invalid
Having Error with Brotherhood of Muta

SSLError: HTTPSConnectionPool(host='gateway.marvel.com', port=443): Max retries exceeded with url: /v1/public/characters?ts=1523340357&apikey=56661bbfdf153f11659ed1a02ad357ce&hash=1afc60cd27c836a8b8a9c62bac44c8af&name=Liz+Osborn&limit=100 (Caused by SSLError(SSLError("bad handshake: SysCallError(50, 'ENETDOWN')",),))

In [None]:
len(marvel_data)

In [None]:
marvel_df = pd.DataFrame.from_dict(marvel_data)

In [None]:
#Removing Brackets

marvel_df['comics_avail'] = marvel_df['comics_avail'].str.get(0)
# marvel_df['comics_list'] = marvel_df['comics_list'].str.get(0)
marvel_df['description'] = marvel_df['description'].str.get(0)
marvel_df['events_avail'] = marvel_df['events_avail'].str.get(0)
# marvel_df['events_list'] = marvel_df['events_list'].str.get(0)
marvel_df['id'] = marvel_df['id'].str.get(0)
marvel_df['name'] = marvel_df['name'].str.get(0)
marvel_df['series_avail'] = marvel_df['series_avail'].str.get(0)
# marvel_df['series_list'] = marvel_df['series_list'].str.get(0)

In [None]:
marvel_df.iloc[0]['comics_list']

In [None]:
marvel_clean_df = marvel_df.loc[marvel_df['events_avail'] != 0]

In [None]:
marvel_clean_df

In [None]:
marvel_clean_df['series_list'] = marvel_clean_df['series_list'].map(lambda x: str(x))

In [None]:
marvel_clean_df['series_list'] = marvel_clean_df['series_list'].map(lambda x: x.strip('[]'))

In [None]:
marvel_clean_df['series_list'] = marvel_clean_df['series_list'].map(lambda x: x.strip("''"))

In [None]:
marvel_clean_df.head()

In [None]:
# marvel_desc = marvel_df[['name', 'description']]

In [None]:
# #Removing Brackets
# marvel_desc['name'] = marvel_desc['name'].str.get(0)
# marvel_desc['description'] = marvel_desc['description'].str.get(0)

## Adapted from brandonrose.org

In [None]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
import re
import os
import codecs
from sklearn import feature_extraction

In [None]:
print(type(str(marvel_data[0]['description']).strip('[]')))

In [None]:
names = []
desc = []

for n in range(len(marvel_data)):
    names.append(marvel_data[n]['name'][0])
    desc.append(marvel_data[n]['description'][0])

In [None]:
# vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)
# print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

### Tokenizing Using PySpark and Adapation from brandonrose.org

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover

In [None]:
# create spark app and session
spark = SparkSession.builder.appName('nlp').getOrCreate()

In [None]:
marvel_spark = spark.createDataFrame(marvel_clean_df)

In [None]:
marvel_spark.show()

In [None]:
tokenizer = Tokenizer(inputCol="series_list", outputCol="words")

In [None]:
tokenized = tokenizer.transform(marvel_spark)

In [None]:
tokenized.show()

In [None]:
# instantiate remover
remover = StopWordsRemover(inputCol="words", outputCol="filtered")

In [None]:
# Transform dataframe
tokenized_filt = remover.transform(tokenized)

In [None]:
tokenized_filt.show()

In [None]:
tokenized_df = tokenized_filt.toPandas()

In [None]:
spark.stop()

In [None]:
tokenized_df.head()

In [None]:
#Create necessary lists for kmeans

names_list = []
all_filtered = []
all_comics = []
all_series = []
stemmed_words = []
comics_avail_list = []
events_avail_list = []
series_avail_list = []

for n in range(len(tokenized_df)):
    names_list.append(tokenized_df.iloc[n]['name'])
    all_filtered.extend(tokenized_df.iloc[n]['filtered'])
    all_comics.append(tokenized_df.iloc[n]['comics_list'])
    all_series.append(tokenized_df.iloc[n]['series_list'])
    stemmed_words.extend(tokenized_df.iloc[n]['words'])
    comics_avail_list.append(tokenized_df.iloc[n]['comics_avail'])
    events_avail_list.append(tokenized_df.iloc[n]['events_avail'])
    series_avail_list.append(tokenized_df.iloc[n]['series_avail'])

In [None]:
len(all_series)==len(names_list)

In [None]:
all_series

In [None]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [None]:
# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                 min_df=0.2, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(all_series) #fit the vectorizer to descriptions

print(tfidf_matrix.shape)

In [None]:
terms = tfidf_vectorizer.get_feature_names()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)
print
print

In [None]:
from sklearn.cluster import KMeans

num_clusters = 5

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [None]:
from sklearn.externals import joblib

#uncomment the below to save your model 
#since I've already run my model I am loading from the pickle

joblib.dump(km,  'doc_cluster.pkl')

km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [None]:
heroes = {'heroes': names_list, 'comics_available': comics_avail_list,
          'events_available': events_avail_list, 'series_available': series_avail_list,
          'series_list': all_series, 'cluster': clusters,}

frame = pd.DataFrame(heroes, index = [clusters] , columns = ['heroes', 'comics_list', 
                                                             'comics_available', 'events_available',
                                                             'series_available','cluster'])

In [None]:
frame.head()

In [None]:
frame['cluster'].value_counts() #number of films per cluster (clusters from 0 to 4)

In [None]:
# grouped = frame['rank'].groupby(frame['cluster']) #groupby cluster for aggregation purposes

# grouped.mean() #average rank (1 to 100) per cluster

In [None]:
# vocab_frame = pd.DataFrame({'words': all_filtered}, index = stemmed_words)
# print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

In [None]:
# from __future__ import print_function

# print("Top terms per cluster:")
# print()
# #sort cluster centers by proximity to centroid
# order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

# for i in range(num_clusters):
#     print("Cluster %d words:" % i, end='')
    
#     for ind in order_centroids[i, :6]: #replace 6 with n words per cluster
#         print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
#     print() #add whitespace
#     print() #add whitespace
    
#     print("Cluster %d titles:" % i, end='')
#     for title in frame.ix[i]['title'].values.tolist():
#         print(' %s,' % title, end='')
#     print() #add whitespace
#     print() #add whitespace
    
# print()
# print()

In [None]:
import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

MDS()

# convert two components as we're plotting points in a two-dimensional plane
# "precomputed" because we provide a distance matrix
# we will also specify `random_state` so the plot is reproducible.
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_components, n_samples)

xs, ys = pos[:, 0], pos[:, 1]
print()
print()

In [None]:
kmeans = KMeans(n_clusters=8)
kmeans.fit(pos)
predicted_clusters = kmeans.predict(pos)

In [None]:
plt.scatter(pos[:, 0], pos[:, 1], c=predicted_clusters, s=50, cmap='Paired')

In [None]:
#set up colors per clusters using a dict
cluster_colors = {0: '#1b9e77', 1: '#d95f02', 2: '#7570b3', 3: '#FCB230', 4:'#30BBFC'}

#set up cluster names using a dict
cluster_names = {0: 'One', 
                 1: 'Two', 
                 2: 'Three',
                3: 'Four',
                4: 'Five'}

In [None]:
#some ipython magic to show the matplotlib plots inline
%matplotlib inline 

#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=names_list)) 

#group by cluster
groups = df.groupby('label')


# set up plot
fig, ax = plt.subplots(figsize=(17, 9)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=12, 
            label=cluster_names[name], color=cluster_colors[name], 
            mec='none')
    ax.set_aspect('auto')
    ax.tick_params(\
        axis= 'x',          # changes apply to the x-axis
        which='both',      # both major and minor ticks are affected
        bottom='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelbottom='off')
    ax.tick_params(\
        axis= 'y',         # changes apply to the y-axis
        which='both',      # both major and minor ticks are affected
        left='off',      # ticks along the bottom edge are off
        top='off',         # ticks along the top edge are off
        labelleft='off')
    
ax.legend(numpoints=1)  #show legend with only 1 point

#add label in x,y position with the label as the film title
for i in range(len(df)):
    ax.text(df.ix[i]['x'], df.ix[i]['y'], df.ix[i]['title'], size=8)  

    
    
plt.show() #show the plot

#uncomment the below to save the plot if need be
#plt.savefig('clusters_small_noaxes.png', dpi=200)

In [None]:
df.head()

In [None]:
plt.close()

In [None]:
import mpld3

In [None]:

#define custom toolbar location
class TopToolbar(mpld3.plugins.PluginBase):
    """Plugin for moving toolbar to top of figure"""

    JAVASCRIPT = """
    mpld3.register_plugin("toptoolbar", TopToolbar);
    TopToolbar.prototype = Object.create(mpld3.Plugin.prototype);
    TopToolbar.prototype.constructor = TopToolbar;
    function TopToolbar(fig, props){
        mpld3.Plugin.call(this, fig, props);
    };

    TopToolbar.prototype.draw = function(){
      // the toolbar svg doesn't exist
      // yet, so first draw it
      this.fig.toolbar.draw();

      // then change the y position to be
      // at the top of the figure
      this.fig.toolbar.toolbar.attr("x", 150);
      this.fig.toolbar.toolbar.attr("y", 400);

      // then remove the draw function,
      // so that it is not called again
      this.fig.toolbar.draw = function() {}
    }
    """
    def __init__(self):
        self.dict_ = {"type": "toptoolbar"}

In [None]:
#create data frame that has the result of the MDS plus the cluster numbers and heroes
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, heroes=names_list)) 

#group by cluster
groups = df.groupby('label')

#define custom css to format the font and to remove the axis labeling
css = """
text.mpld3-text, div.mpld3-tooltip {
  font-family:Arial, Helvetica, sans-serif;
}

g.mpld3-xaxis, g.mpld3-yaxis {
display: none; }

svg.mpld3-figure {
margin-left: -200px;}
"""

# Plot 
fig, ax = plt.subplots(figsize=(14,6)) #set plot size
ax.margins(0.03) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    points = ax.plot(group.x, group.y, marker='o', linestyle='', ms=18, 
                     label=cluster_names[name], mec='none', 
                     color=cluster_colors[name])
    ax.set_aspect('auto')
    labels = [i for i in group.heroes]
    
    #set tooltip using points, labels and the already defined 'css'
    tooltip = mpld3.plugins.PointHTMLTooltip(points[0], labels,
                                       voffset=10, hoffset=10, css=css)
    #connect tooltip to fig
    mpld3.plugins.connect(fig, tooltip, TopToolbar())    
    
    #set tick marks as blank
    ax.axes.get_xaxis().set_ticks([])
    ax.axes.get_yaxis().set_ticks([])
    
    #set axis as blank
    ax.axes.get_xaxis().set_visible(False)
    ax.axes.get_yaxis().set_visible(False)

    
ax.legend(numpoints=1) #show legend with only one dot

mpld3.display() #show the plot

#uncomment the below to export to html
# html = mpld3.fig_to_html(fig)
# print(html)