In [372]:
# Import Dependencies
import matplotlib.pyplot as plt
import requests as req
import pandas as pd
import numpy as np
import seaborn
import random
import datetime
import time
import json
import os

from marvel_keys import apikey, privateKey, marvel_char_list 

In [365]:
# Path of the previously exported character table, relative to the notebook
csv_path = os.path.join('marvel_data.csv')

# Read the table into a DataFrame
marvel_df = pd.read_csv(filepath_or_buffer=csv_path)

In [366]:
marvel_df['name'][:2800].head()

0         Spider-Man 
1    Captain America 
2          Wolverine 
3           Iron Man 
4               Thor 
Name: name, dtype: object

In [367]:
loop_test = ['Iron Man', 'Captain America', 'Thor', 'Black Widow', 'Hulk']

## API Request and Data Retrieval

In [368]:
import hashlib
import urllib.parse
import urllib.request

In [369]:
# Endpoint used for character look-ups
url = 'https://gateway.marvel.com:443/v1/public/characters'

# Current Unix time, rounded to whole seconds, used as the request timestamp
ts = str(round(time.time()))

# Hex MD5 digest of timestamp + private key + public key (sent as `hash`)
signature_input = ts + privateKey + apikey
hsh = hashlib.md5(signature_input.encode('utf-8')).hexdigest()

In [371]:
url = 'https://gateway.marvel.com:443/v1/public/characters'

# One record per successfully retrieved character.  Scalar fields are stored
# as single-element lists because the clean-up cell further down unwraps them
# with `.str.get(0)`; the *_list fields hold every item name returned.
marvel_data = []

for hero in marvel_char_list:

    # Query parameters: the auth triple (ts, apikey, hash) plus a name filter
    params = {'ts': ts,
              'apikey': apikey,
              'hash': hsh,
              'name': hero,
              'limit': '100'}

    response = req.get(url, params=params).json()

    # Error payloads (for example {'code': 'RequestThrottled', ...}) carry no
    # 'data' key.  Report them and move on instead of raising KeyError.
    if 'data' not in response:
        print(f'Having Error with {hero}: {response}')
        continue

    # A successful response with no matching character has an empty result list
    results = response['data']['results']
    if not results:
        print(f'Having Error with {hero}')
        continue

    hero_data = results[0]

    # Key order is kept as before so the resulting DataFrame columns match
    marvel_data.append({
        'id': [hero_data['id']],
        'name': [hero_data['name']],
        'description': [hero_data['description']],
        'comics_avail': [hero_data['comics']['available']],
        'comics_list': [item['name'] for item in hero_data['comics']['items']],
        'series_avail': [hero_data['series']['available']],
        'series_list': [item['name'] for item in hero_data['series']['items']],
        'events_avail': [hero_data['events']['available']],
        'events_list': [item['name'] for item in hero_data['events']['items']],
    })

{'code': 'RequestThrottled', 'message': 'You have exceeded your rate limit.  Please try again later.'}
{'code': 'RequestThrottled', 'message': 'You have exceeded your rate limit.  Please try again later.'}
{'code': 'RequestThrottled', 'message': 'You have exceeded your rate limit.  Please try again later.'}
{'code': 'RequestThrottled', 'message': 'You have exceeded your rate limit.  Please try again later.'}
{'code': 'RequestThrottled', 'message': 'You have exceeded your rate limit.  Please try again later.'}


In [None]:
len(marvel_data)

In [None]:
marvel_df = pd.DataFrame.from_dict(marvel_data)

In [None]:
marvel_df.head()

In [None]:
# The scalar fields were collected as one-element lists; replace each list by
# its first element.  The comics/series/events *_list columns are left as
# lists here.
single_value_columns = ('comics_avail', 'description', 'events_avail',
                        'id', 'name', 'series_avail')

for column in single_value_columns:
    marvel_df[column] = marvel_df[column].str.get(0)

In [None]:
# Keep only characters that appear in at least one series.  `.copy()` makes
# the result an independent frame: later cells assign new columns to it, which
# on a bare `.loc` slice triggers SettingWithCopyWarning and may write to a
# temporary object.
marvel_clean_df = marvel_df.loc[marvel_df['series_avail'] != 0].copy()

In [None]:
len(marvel_clean_df)

In [None]:
# Render each series list as text and trim the surrounding square brackets.
# The quotes around the individual titles stay in place.
marvel_clean_df['series_list'] = marvel_clean_df['series_list'].map(
    lambda titles: str(titles).strip('[]'))

In [None]:
# Render each comics list as text and trim the surrounding square brackets.
# The quotes around the individual titles stay in place.
marvel_clean_df['comics_list'] = marvel_clean_df['comics_list'].map(
    lambda titles: str(titles).strip('[]'))

In [None]:
# Render each events list as text and trim the surrounding square brackets.
# The quotes around the individual titles stay in place.
marvel_clean_df['events_list'] = marvel_clean_df['events_list'].map(
    lambda titles: str(titles).strip('[]'))

In [None]:
marvel_clean_df.head()

In [None]:
# Concatenate the three title columns of every row into one comma-separated
# string, skipping missing values.
title_columns = ['comics_list', 'events_list', 'series_list']
marvel_clean_df['Combined'] = marvel_clean_df[title_columns].apply(
    lambda row: ','.join(row.dropna().astype(str)),
    axis=1,
)
                                                                                                                           
                                                                                                                           
                                                                                                                                            

In [None]:
marvel_clean_df['Combined'].head()

## Adapted from brandonrose.org

In [None]:
import numpy as np
import pandas as pd
import nltk
nltk.download('stopwords')
import re
import os
import codecs
from sklearn import feature_extraction

## Tokenizing Using PySpark and Adaptation from brandonrose.org

In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover

In [None]:
# create spark app and session
spark = SparkSession.builder.appName('nlp').getOrCreate()

In [None]:
marvel_spark = spark.createDataFrame(marvel_clean_df)

In [None]:
marvel_spark.show()

In [None]:
tokenizer = Tokenizer(inputCol="Combined", outputCol="words")

In [None]:
tokenized = tokenizer.transform(marvel_spark)

In [None]:
tokenized.show()

In [None]:
# instantiate remover
remover = StopWordsRemover(inputCol="words", outputCol="filtered")

In [None]:
# Transform dataframe
tokenized_filt = remover.transform(tokenized)

In [None]:
tokenized_filt.show()

In [None]:
tokenized_df = tokenized_filt.toPandas()

In [None]:
spark.stop()

In [None]:
tokenized_df.head()

In [None]:
tokenized_df = tokenized_df.loc[tokenized_df['comics_avail'] > 10]

In [None]:
len(tokenized_df)

In [None]:
# Create the per-character lists used by the vectorizer, k-means and the
# summary frame.  Columns are pulled out directly instead of building a row
# Series with `iloc[n]` for every single value.

# One entry per character
names_list = tokenized_df['name'].tolist()
all_comics = tokenized_df['comics_list'].tolist()
all_series = tokenized_df['series_list'].tolist()
all_events = tokenized_df['events_list'].tolist()
comics_avail_list = tokenized_df['comics_avail'].tolist()
events_avail_list = tokenized_df['events_avail'].tolist()
series_avail_list = tokenized_df['series_avail'].tolist()
everything_list = tokenized_df['Combined'].tolist()

# Flat token lists across all characters
all_filtered = [token for tokens in tokenized_df['filtered'] for token in tokens]
# NOTE: despite the name, these are the raw tokens from the `words` column;
# no stemming has been applied to them.
stemmed_words = [token for tokens in tokenized_df['words'] for token in tokens]

In [None]:
len(everything_list)==len(names_list)

In [None]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [None]:
# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed

def tokenize_and_stem(text):
    """Tokenize ``text`` and return the stem of every token containing a letter.

    The text is split into sentences and then into words, so punctuation ends
    up as separate tokens. Tokens without any ASCII letter (numbers, bare
    punctuation) are discarded; the rest are passed through the module-level
    ``stemmer``. Token order is preserved.
    """
    stems = []
    for sentence in nltk.sent_tokenize(text):
        for token in nltk.word_tokenize(sentence):
            if re.search('[a-zA-Z]', token):
                stems.append(stemmer.stem(token))
    return stems


def tokenize_only(text):
    """Tokenize ``text`` into lower-cased words, without stemming.

    The text is split into sentences and then into words, so punctuation ends
    up as separate tokens. Each token is lower-cased, and tokens without any
    ASCII letter (numbers, bare punctuation) are discarded. Token order is
    preserved.
    """
    kept = []
    for sentence in nltk.sent_tokenize(text):
        for raw_token in nltk.word_tokenize(sentence):
            token = raw_token.lower()
            if re.search('[a-zA-Z]', token):
                kept.append(token)
    return kept

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Corpus to vectorize: one string of series titles per character
vectorized_list = all_series

# Vectorizer settings, gathered in one place
tfidf_options = dict(
    tokenizer=tokenize_and_stem,   # stem tokens before counting
    stop_words='english',
    ngram_range=(1, 3),            # unigrams up to trigrams
    min_df=0.2,                    # drop terms found in under 20% of documents
    max_df=0.8,                    # drop terms found in over 80% of documents
    max_features=200000,
    use_idf=True,
)
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)

# Learn the vocabulary and build the document-term matrix in one step
tfidf_matrix = tfidf_vectorizer.fit_transform(vectorized_list)

print(tfidf_matrix.shape)

In [None]:
# Vocabulary learned by the vectorizer, as a plain list of strings.
# `get_feature_names` was superseded by `get_feature_names_out` in
# scikit-learn 1.0 and removed in 1.2; use whichever this version provides.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    terms = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    terms = tfidf_vectorizer.get_feature_names()

In [None]:
terms

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine distance between characters (0 = identical term profile).
# The two bare `print` lines that followed were Python 2 leftovers; in
# Python 3 they only echoed the `print` function object as cell output.
dist = 1 - cosine_similarity(tfidf_matrix)

In [None]:
from sklearn.cluster import KMeans

num_clusters = 6

km = KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)

clusters = km.labels_.tolist()

In [None]:
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23.  Prefer the standalone package and fall back to the bundled copy
# on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the fitted model, then load it back so the labels used below come
# from the saved artefact.  Comment out the `dump` line to reuse a model that
# was saved by an earlier run.
joblib.dump(km, 'doc_cluster.pkl')

# NOTE: joblib.load unpickles the file; only load files you created yourself.
km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

In [None]:
# Per-character summary: name, availability counts, combined text and cluster
heroes = {
    'heroes': names_list,
    'comics_available': comics_avail_list,
    'events_available': events_avail_list,
    'series_available': series_avail_list,
    'comics+series+events': everything_list,
    'cluster': clusters,
}

# Column order of the resulting frame
frame_columns = ['heroes',
                 'comics_available', 'events_available',
                 'series_available', 'cluster', 'comics+series+events']

# The frame is indexed by cluster label
frame = pd.DataFrame(heroes, index=[clusters], columns=frame_columns)

In [None]:
frame.head()

In [None]:
frame['cluster'].value_counts() #number of heroes per cluster (labels 0 to 5 for the 6 clusters fitted above)

In [None]:
import os  # for os.path.basename

import matplotlib.pyplot as plt
import matplotlib as mpl

from sklearn.manifold import MDS

# Project the distance matrix onto two components for plotting.
# "precomputed" because `dist` already is a distance matrix.
# `random_state` is fixed so the layout is reproducible.
# (The stray `MDS()` call that used to sit here built an estimator and threw
# it away.)
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)

pos = mds.fit_transform(dist)  # shape (n_samples, n_components)

# x / y coordinate of every character
xs, ys = pos[:, 0], pos[:, 1]

## KMeans Testing

In [None]:
# Cluster the 2-D MDS coordinates directly.  The seed is fixed so the labels
# (and therefore the scatter colours) are the same on every run.
kmeans = KMeans(n_clusters=6, random_state=1)

# Fit and label the same points in one step
predicted_clusters = kmeans.fit_predict(pos)

In [None]:
import matplotlib.pyplot as plt

# Look up and report the currently configured default figure size
fig_size = plt.rcParams["figure.figsize"]
print("Current size:", fig_size)

# Set the default figure width to 8 and the height to 5
fig_size[0], fig_size[1] = 8, 5
plt.rcParams["figure.figsize"] = fig_size


In [None]:
from matplotlib.lines import Line2D

# Scatter of the MDS coordinates, coloured by the k-means label of each point
plt.scatter(pos[:, 0], pos[:, 1],
            c=predicted_clusters, s=100,
            cmap='Paired')

# (marker face colour, legend label) for each hand-made legend entry
legend_spec = [
    ('#A4CFE0', 'Avengers/Fantastic Four'),
    ('#9EC6DF', 'Avengers Villians'),
    ('#FFBE6E', 'Guardians of the Galaxy'),
    ('#A94D24', 'Fantastic Four'),
    ('#B2DF8A', 'X-Men'),
    ('#B2DF8A', 'Runaways'),
    ('#FFFCB6', 'Hulk'),
]

# Proxy artists: a white line with a filled round marker per entry
legend_elements = [
    Line2D([0], [0], marker='o', color='w',
           markerfacecolor=face_colour,
           label=entry_label, markersize=15)
    for face_colour, entry_label in legend_spec
]

# plt.legend(handles=legend_elements, loc='upper right', bbox_to_anchor=(1.3, 1.014))

# plt.savefig('marvel_cluster_byEverything_7.png', dpi=600)

## Setting up Cluster Chart demonstrated by brandonrose.org

In [None]:
# Plot colour for each cluster label (position in the list = label)
cluster_colors = dict(enumerate([
    '#1b9e77',
    '#d95f02',
    '#7570b3',
    '#FCB230',
    '#30BBFC',
    '#3058FC',
    '#FC30F3',
    '#9BE3E5',
]))

# Legend name for each cluster label (position in the list = label)
cluster_names = dict(enumerate([
    'Guardians of the Galaxy',
    'Fantastic Four/Avengers',
    'Spider-Man',
    'Four',
    'X-Men',
    'Other',
    'Test',
    'Test',
]))

In [None]:
#some ipython magic to show the matplotlib plots inline
%matplotlib inline 

#create data frame that has the result of the MDS plus the cluster numbers and titles
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, title=names_list)) 

#group by cluster
groups = df.groupby('label')


# set up plot
fig, ax = plt.subplots(figsize=(10, 8)) # set size
ax.margins(0.05) # Optional, just adds 5% padding to the autoscaling

#iterate through groups to layer the plot
#note that I use the cluster_name and cluster_color dicts with the 'name' lookup to return the appropriate color/label
for name, group in groups:
    ax.plot(group.x, group.y, marker='o', linestyle='', ms=14, 
            label=cluster_names[name], color=cluster_colors[name], 
            mec='none')
#     ax.set_aspect('auto')
#     ax.tick_params(\
#         axis= 'x',          
#         which='both',      
#         bottom='off',     
#         top='off',        
#         labelbottom='off')
#     ax.tick_params(\
#         axis= 'y',         
#         which='both',     
#         left='off',      
#         top='off',        
#         labelleft='off')
    
ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.01))  #show legend with only 1 point
    
plt.show() #show the plot

#uncomment the below to save the plot if need be
#plt.savefig('clusters_small_noaxes.png', dpi=200)

In [None]:
plt.close()

In [None]:
import mpld3

In [None]:

#define custom toolbar location
class TopToolbar(mpld3.plugins.PluginBase):
    """mpld3 plugin that repositions the figure toolbar.

    The JavaScript below registers a plugin named "toptoolbar". When drawn,
    it draws the toolbar once, sets the toolbar's "x" attribute to 150 and
    its "y" attribute to 400, and then replaces the toolbar's draw function
    with a no-op so it is not drawn again.
    """

    # JavaScript source of the plugin
    JAVASCRIPT = """
    mpld3.register_plugin("toptoolbar", TopToolbar);
    TopToolbar.prototype = Object.create(mpld3.Plugin.prototype);
    TopToolbar.prototype.constructor = TopToolbar;
    function TopToolbar(fig, props){
        mpld3.Plugin.call(this, fig, props);
    };

    TopToolbar.prototype.draw = function(){
      // the toolbar svg doesn't exist
      // yet, so first draw it
      this.fig.toolbar.draw();

      // then change the y position to be
      // at the top of the figure
      this.fig.toolbar.toolbar.attr("x", 150);
      this.fig.toolbar.toolbar.attr("y", 400);

      // then remove the draw function,
      // so that it is not called again
      this.fig.toolbar.draw = function() {}
    }
    """
    def __init__(self):
        """Set the plugin type to "toptoolbar", the name registered in JAVASCRIPT."""
        self.dict_ = {"type": "toptoolbar"}

In [None]:
#create data frame that has the result of the MDS plus the cluster numbers and heroes
df = pd.DataFrame(dict(x=xs, y=ys, label=clusters, heroes=names_list))

#group by cluster
groups = df.groupby('label')

#define custom css to format the font and to remove the axis labeling
css = """
text.mpld3-text, div.mpld3-tooltip {
  font-family:Arial, Helvetica, sans-serif;
}

g.mpld3-xaxis, g.mpld3-yaxis {
display: none; }

svg.mpld3-figure {
margin-left: -200px;}
"""

# Plot
fig, ax = plt.subplots(figsize=(14,6)) #set plot size
ax.margins(0.03) # Optional, just adds 3% padding to the autoscaling
ax.set_aspect('auto')

#iterate through groups to layer the plot; the cluster label is used to look
#up the legend name and colour in cluster_names / cluster_colors
for name, group in groups:
    points = ax.plot(group.x, group.y, marker='o', linestyle='', ms=18,
                     label=cluster_names[name], mec='none',
                     color=cluster_colors[name])

    #hover text for every point of this cluster: the hero name
    labels = list(group.heroes)

    #set tooltip using points, labels and the already defined 'css'
    tooltip = mpld3.plugins.PointHTMLTooltip(points[0], labels,
                                             voffset=10, hoffset=10, css=css)
    #connect this cluster's tooltip to the figure
    mpld3.plugins.connect(fig, tooltip)

#the toolbar plugin only needs to be connected once per figure, so it is no
#longer attached again on every loop iteration
mpld3.plugins.connect(fig, TopToolbar())

#hide ticks and both axes; these settings do not depend on the cluster, so
#they are applied once after the loop
ax.axes.get_xaxis().set_ticks([])
ax.axes.get_yaxis().set_ticks([])
ax.axes.get_xaxis().set_visible(False)
ax.axes.get_yaxis().set_visible(False)

ax.legend(numpoints=1) #show legend with only one dot

mpld3.display() #show the plot

#uncomment the below to export to html
#html = mpld3.fig_to_html(fig)
#print(html)

In [None]:
df.head()

In [None]:
df['size'] = list(frame.comics_available)

In [None]:
# Keep every row whose cluster label is not 6.
# NOTE(review): the k-means model above is created with 6 clusters, whose
# labels would be 0-5, so this filter may not remove any rows. Confirm which
# label is meant to be the outlier cluster.
df_wo_outliers = df.loc[df['label'] != 6]

In [None]:
df_wo_outliers.head()

In [None]:
df_wo_outliers.to_csv('marvel_cluster_no_OL.csv', index=False)

In [None]:
len(df.loc[df['label'] == 2])