<a href="https://colab.research.google.com/github/marcolussetti/opendotadump-tools/blob/master/analysis/heroes_picks/LookupSpikes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Graph Hero Picks
This notebook will produce graphs of the heroes picked on a per-day basis.

## Processing

### Imports & Configuration

In [0]:
# If plotnine is not installed:
!pip install plotnine

# If using on google colab, might need to update statsmodels version
!pip install statsmodels -U

# If not installed
!pip install requests

### Constants for configuration

In [0]:
# Location of the input CSV read by the "Import data" cell; judging by the file
# name it holds normalized hero pick rates keyed by hero name.
CSV_INPUT_FILE = "https://raw.githubusercontent.com/marcolussetti/processopendota/master/data/heroes_picks_csvs/stable-picks_heroes-names_normalized.csv"
# OpenDota heroes API endpoint. NOTE(review): not referenced by any cell in the
# visible part of this notebook -- confirm it is still needed.
OPENDOTA_API_HEROES_ENDPOINT = "https://api.opendota.com/api/heroes/"

In [0]:
import pandas as pd
import requests
from plotnine import *
from scipy.spatial import distance
from datetime import datetime, timedelta

In [0]:
%matplotlib inline

### Import data

In [0]:
# Load input csv. The first CSV column becomes the index; later cells stack
# this frame and label the index level "Day" and the column level "Hero".
df = pd.read_csv(CSV_INPUT_FILE, index_col=0)

### Examine the data

#### Overall hero metrics

In [0]:
# Most popular heroes overall: the ten highest mean normalized pick frequencies
heroes_most_popular = df.mean().sort_values(ascending=False)[:10]

# Heroes with the most variation: the ten largest standard deviations of pick frequency
heroes_most_variation = df.std().sort_values(ascending=False)[:10]

# Heroes with the least variation: the ten smallest standard deviations.
# The sort must be ascending here; a descending sort would just reproduce
# heroes_most_variation.
heroes_least_variation = df.std().sort_values(ascending=True)[:10]

#### Reformat data for "easy" graphing

In [0]:
# Reshape the wide table (index = day, one column per hero) into long form with
# one row per (day, hero) pair. `stack()` returns a new object, so `df` is left
# untouched without needing an explicit deep copy first.
df_expl_graph = df.stack().reset_index()
df_expl_graph.columns = ["Day", "Hero", "Frequency"]

# Parse the whole column in a single vectorized call instead of row by row
df_expl_graph["Day"] = pd.to_datetime(df_expl_graph["Day"])

# Week label "<ISO year>-<ISO week>". The ISO week number must be paired with
# the ISO year, not the calendar year: e.g. 2012-12-31 belongs to ISO week 1 of
# 2013, and labelling it "2012-1" would merge it with the first week of 2012.
df_expl_graph["Week"] = df_expl_graph["Day"].apply(
    lambda date: "{}-{}".format(*date.isocalendar()[:2])
)
# Month label "<year>-<month>" (month is not zero-padded)
df_expl_graph["Month"] = df_expl_graph["Day"].apply(
    lambda date: "{}-{}".format(date.year, date.month)
)
df_expl_graph["Year"] = df_expl_graph["Day"].dt.year


In [0]:
# Keep only the long-form rows that belong to the ten most popular heroes
is_popular_hero = df_expl_graph["Hero"].isin(heroes_most_popular.index)
df_most_popular_graph = df_expl_graph[is_popular_hero]

#### Graphs (ignore)

In [0]:
# Scatter plot of every (day, hero) pick frequency, coloured by hero
all_day_plot = (
    ggplot(df_expl_graph, aes(x="Day", y="Frequency", color="Hero", group=1))
    + geom_point()
)
# Uncomment to render inline or to export as a large PNG:
# all_day_plot
# all_day_plot.save("all_day_plot.png", width=40, height=32, dpi=300, limitsize=False)

In [0]:
# Stacked area chart of pick frequency over time, one filled band per hero
all_day_stacked_plot = (
    ggplot(df_expl_graph, aes(x="Day", y="Frequency"))
    + geom_area(aes(fill="Hero"))
)
# Uncomment to render inline or to export as a wide PNG:
# all_day_stacked_plot
# all_day_stacked_plot.save("all_day_stacked_plot.png", width=44, height=12, dpi=300, limitsize=False)

#### Detect changes over time

In [0]:
def previous_distribution_vector(df, date_start, date_end, average_function="mean"):
  """Average pick frequency per hero over the window [date_start, date_end).

  Args:
    df: long-form frame with at least "Day", "Hero" and "Frequency" columns.
    date_start: inclusive lower bound, compared against df["Day"].
    date_end: exclusive upper bound, compared against df["Day"].
    average_function: "median" selects the median; any other value
      (including the default "mean") selects the arithmetic mean.

  Returns:
    dict mapping each hero present in the window to its averaged frequency;
    empty when no row falls inside the window.
  """
  in_window = (df["Day"] >= date_start) & (df["Day"] < date_end)
  # Select "Frequency" *before* aggregating. Aggregating the whole frame first
  # also tries to average the non-numeric columns (e.g. string week/month
  # labels), which raises TypeError on pandas >= 2.0.
  frequency_by_hero = df[in_window].groupby("Hero")["Frequency"]

  if average_function == "median":
    return frequency_by_hero.median().to_dict()
  return frequency_by_hero.mean().to_dict()

def compute_distance(day, previous_period_average, distance_function=distance.euclidean, weighted=False):
  """Distance between a single day's picks and a reference distribution.

  Both arguments are dicts; each is turned into a vector by listing its
  values in ascending key order. Only the vector lengths are checked, not
  that the two key sets are identical.

  Args:
    day: dict of values for the day being examined.
    previous_period_average: dict of values for the reference period.
    distance_function: callable invoked as f(previous, current) or, when
      weighted, f(previous, current, previous).
    weighted: when true, the reference vector is also passed as the third
      positional argument of distance_function.

  Returns:
    Whatever distance_function returns.

  Raises:
    AssertionError: if the two vectors differ in length.
  """
  previous = [previous_period_average[name] for name in sorted(previous_period_average)]
  current = [day[name] for name in sorted(day)]
  assert len(previous) == len(current), "Incorrect length: previous-> {}, current-> {}".format(len(previous), len(current))
  # The optional third positional argument is the reference vector itself
  extra_args = (previous,) if weighted else ()
  return distance_function(previous, current, *extra_args)
  
def day_difference(df, day, distance_function=distance.euclidean, length=14, average_function="mean", weighted=False):
  """Distance between one day's picks and the average of the preceding window.

  Args:
    df: long-form frame with "Day", "Hero" and "Frequency" columns.
    day: the day to examine, as a 'YYYY-MM-DD' string.
    distance_function: forwarded to compute_distance.
    length: size in days of the trailing window that ends just before `day`.
    average_function: forwarded to previous_distribution_vector.
    weighted: forwarded to compute_distance.

  Returns:
    The value returned by compute_distance.
  """
  # Hero -> frequency for the requested day
  rows_for_day = df[df["Day"] == day]
  day_picks = dict(zip(rows_for_day["Hero"], rows_for_day["Frequency"]))

  # Reference distribution over [day - length days, day)
  window_start = datetime.strptime(day, '%Y-%m-%d') - timedelta(days=length)
  previous_picks = previous_distribution_vector(df, window_start, day, average_function)

  return compute_distance(day_picks, previous_picks, distance_function, weighted)

def all_days_difference(df, distance_function=distance.euclidean, length=14, average_function="mean", weighted=False):
  """Compute day_difference for every distinct day in df except the earliest.

  Args:
    df: long-form frame whose "Day" column holds timestamp-like values
      (each element must provide a .date() method).
    distance_function: forwarded to day_difference.
    length: forwarded to day_difference (trailing window size in days).
    average_function: forwarded to day_difference.
    weighted: forwarded to day_difference.

  Returns:
    dict mapping each day, rendered via str() of its date, to the value
    returned by day_difference, in chronological order.
  """
  distinct_dates = sorted({stamp.date() for stamp in df["Day"]})
  # The earliest day is skipped ([1:]), presumably because it has no earlier
  # data to compare against.
  all_days = [str(d) for d in distinct_dates[1:]]

  # Forward *every* option. average_function and weighted must be passed
  # through explicitly, otherwise callers asking for a median or a weighted
  # distance silently get the unweighted mean-based result instead.
  return {
      day: day_difference(
          df, day,
          distance_function=distance_function,
          length=length,
          average_function=average_function,
          weighted=weighted,
      )
      for day in all_days
  }

#### Graph differences (ignore)

In [0]:
# Graph the per-day differences restricted to the 10 most popular heroes
pop_differences_by_day = all_days_difference(df_most_popular_graph).items()

# Same (day, difference) pairs, largest difference first
sorted_pop_differences_by_day = list(pop_differences_by_day)
sorted_pop_differences_by_day.sort(key=lambda entry: entry[1], reverse=True)

df_pop_differences_by_day = pd.DataFrame(pop_differences_by_day)
df_pop_differences_by_day.columns = ["Day", "Difference"]

popular_heroes_differences_plot = (
    ggplot(df_pop_differences_by_day, aes(x="Day", y="Difference", color="Difference"))
    + geom_point()
    + geom_area(aes(fill="Difference"))
)

# Uncomment to render inline or to export as a wide PNG:
# popular_heroes_differences_plot
# popular_heroes_differences_plot.save("pop_differences_plot.png", width=44, height=5, dpi=300, limitsize=False)

In [0]:
# Graph the per-day differences across all heroes (euclidean distance, 14-day window)

all_differences_by_day = all_days_difference(df_expl_graph).items()
# Same (day, difference) pairs, largest difference first
sorted_all_differences_by_day = sorted(all_differences_by_day, key=lambda x: x[1], reverse=True)
df_all_differences_by_day = pd.DataFrame(all_differences_by_day)
df_all_differences_by_day.columns = ["Day", "Difference"]
# (A bare `.head()` in the middle of a cell displays nothing, so it was dropped;
# put `df_all_differences_by_day.head()` on the last line of a cell to preview.)

all_heroes_differences_plot = (
    ggplot(df_all_differences_by_day, aes(x="Day", y="Difference", color="Difference"))
    +geom_point()
    +geom_area(aes(fill="Difference"))
)

# all_heroes_differences_plot
# all_heroes_differences_plot.save("all_differences_plot.png", width=44, height=5, dpi=300, limitsize=False)

In [0]:
# What if we weight it?
# Graph the per-day differences across all heroes, requesting a weighted distance

all_differences_by_day_weighted = all_days_difference(df_expl_graph, weighted=True).items()
# Same (day, difference) pairs, largest difference first
sorted_all_differences_by_day_weighted = sorted(all_differences_by_day_weighted, key=lambda x: x[1], reverse=True)
df_all_differences_by_day_weighted = pd.DataFrame(all_differences_by_day_weighted)
df_all_differences_by_day_weighted.columns = ["Day", "Difference"]
# (A bare `.head()` in the middle of a cell displays nothing, so it was dropped.)

all_heroes_differences_weighted_plot = (
    ggplot(df_all_differences_by_day_weighted, aes(x="Day", y="Difference", color="Difference"))
    +geom_point()
    +geom_area(aes(fill="Difference"))
)

# all_heroes_differences_weighted_plot
# The export below must save the *weighted* plot object; saving
# all_heroes_differences_plot here would write the unweighted figure under the
# weighted file name.
# all_heroes_differences_weighted_plot.save("all_differences_weighted_plot.png", width=44, height=5, dpi=300, limitsize=False)

In [0]:
# Graph the per-day differences across all heroes using a 28-day trailing window

all_differences_by_day_28 = all_days_difference(df_expl_graph, length=28).items()
# Same (day, difference) pairs, largest difference first
sorted_all_differences_by_day_28 = sorted(all_differences_by_day_28, key=lambda x: x[1], reverse=True)
df_all_differences_by_day_28 = pd.DataFrame(all_differences_by_day_28)
df_all_differences_by_day_28.columns = ["Day", "Difference"]
# (A bare `.head()` in the middle of a cell displays nothing, so it was dropped.)

all_differences_by_day_28_plot = (
    ggplot(df_all_differences_by_day_28, aes(x="Day", y="Difference", color="Difference"))
    +geom_point()
    +geom_area(aes(fill="Difference"))
)

# all_differences_by_day_28_plot
# all_differences_by_day_28_plot.save("all_differences_28_plot.png", width=44, height=5, dpi=300, limitsize=False)

In [0]:
# Graph the per-day differences across all heroes using the Manhattan (cityblock) distance

all_differences_by_day_manhattan = all_days_difference(
    df_expl_graph, distance_function=distance.cityblock
).items()

# Same (day, difference) pairs, largest difference first
sorted_all_differences_by_day_manhattan = list(all_differences_by_day_manhattan)
sorted_all_differences_by_day_manhattan.sort(key=lambda entry: entry[1], reverse=True)

df_all_differences_by_day_manhattan = pd.DataFrame(all_differences_by_day_manhattan)
df_all_differences_by_day_manhattan.columns = ["Day", "Difference"]

all_heroes_differences_manhattan_plot = (
    ggplot(df_all_differences_by_day_manhattan, aes(x="Day", y="Difference", color="Difference"))
    + geom_point()
    + geom_area(aes(fill="Difference"))
)

# Uncomment to render inline or to export as a wide PNG:
# all_heroes_differences_manhattan_plot
# all_heroes_differences_manhattan_plot.save("all_differences_manhattan_plot.png", width=44, height=5, dpi=300, limitsize=False)

## Poster graphs (ignore)

### Pick rates by hero (stacked) graph

In [0]:
# Poster figure: stacked area chart of pick rate per hero over time, with white
# outlines between bands, a three-column legend and poster-sized fonts.
# NOTE(review): the title advertises 2011-11-22 - 2016-04-23 while the x-axis
# limits below are 2012-01-01 to 2016-01-01 -- confirm which range is intended.
# NOTE(review): the x-axis limits are given as strings; confirm that
# scale_x_datetime accepts them in the plotnine version in use.
poster_stacked = (
    ggplot(df_expl_graph, aes(x="Day", y="Frequency"))
    +geom_area(aes(fill="Hero"), color="white")
    +guides(fill=guide_legend(ncol=3, title="Heroes"))
    +scale_x_datetime(date_breaks="6 months", minor_breaks=4, limits=["2012-01-01", "2016-01-01"])
    +ggtitle("Heroes Pick Rates (2011-11-22 - 2016-04-23)")
    +xlab("")
    +ylab("Pick Rate")
    +theme(
        text=element_text(family=['serif']),
        axis_text=element_text(size=24.0),
        #axis_text_x=element_text(ha="right"),
        axis_title_y=element_text(size=36.0),
        axis_title_x=element_text(size=0.0),
        legend_title=element_blank(),
        plot_title=element_text(size=72.0),
        axis_text_x=element_text(size=18.0, family=['Dejavu Sans', 'Dejavu']),#, angle=45),
        panel_background=element_rect(fill="white", colour="white"),
    )
)
# poster_stacked
# Unlike the exploratory plots above, this export is active: running the cell
# writes the PNG to the working directory.
poster_stacked.save("poster_stacked_white_46x12.png", width=46, height=12, dpi=300, limitsize=False)

### Difference graph

In [0]:
# Poster figure: per-day Manhattan-distance differences as points plus a filled
# area, coloured on a blue-to-red gradient, with x-axis breaks every 6 months.
# NOTE(review): the "Day" values in df_all_differences_by_day_manhattan come
# from all_days_difference, which renders days as strings -- confirm that
# scale_x_datetime (and its string limits) handles them as intended.
poster_differences_plot = (
    ggplot(df_all_differences_by_day_manhattan, aes(x="Day", y="Difference", color="Difference"))
    +geom_point()
    +geom_area(aes(fill="Difference"))
    +scale_color_gradient(low="#5D5DA9", high="#DC5657")
    +scale_fill_gradient(low="#5D5DA9", high="#DC5657")
    +scale_x_datetime(date_breaks="6 months", minor_breaks=4, limits=["2012-01-01", "2016-01-01"])
    +xlab("")
    +theme(
        text=element_text(family=['serif']),
        panel_background=element_rect(fill="white", colour="white"),
        panel_grid=element_blank(),
        axis_text=element_text(size=24.0),
        axis_title_y=element_text(size=36.0),
        axis_title_x=element_text(size=0.0),
        axis_text_x=element_text(size=18.0, family=['Dejavu Sans', 'Dejavu']),
        legend_text=element_text(family=['Dejavu Sans', 'Dejavu']),
        legend_title=element_blank(),
    )
)

# poster_differences_plot
# This export is active: running the cell writes the PNG to the working directory.
poster_differences_plot.save("poster_differences_white_41x5o.png", width=41, height=5, dpi=300, limitsize=False)

#### Differences graph (3mo)

In [0]:
# Variant of the poster differences figure with x-axis breaks every 3 months
# instead of every 6; all other layers and theme settings are the same.
# NOTE(review): the "Day" values in df_all_differences_by_day_manhattan come
# from all_days_difference, which renders days as strings -- confirm that
# scale_x_datetime (and its string limits) handles them as intended.
poster_differences_plot_3mo = (
    ggplot(df_all_differences_by_day_manhattan, aes(x="Day", y="Difference", color="Difference"))
    +geom_point()
    +geom_area(aes(fill="Difference"))
    +scale_color_gradient(low="#5D5DA9", high="#DC5657")
    +scale_fill_gradient(low="#5D5DA9", high="#DC5657")
    +scale_x_datetime(date_breaks="3 months", minor_breaks=4, limits=["2012-01-01", "2016-01-01"])
    +xlab("")
    +theme(
        text=element_text(family=['serif']),
        panel_background=element_rect(fill="white", colour="white"),
        panel_grid=element_blank(),
        axis_text=element_text(size=24.0),
        axis_title_y=element_text(size=36.0),
        axis_title_x=element_text(size=0.0),
        axis_text_x=element_text(size=18.0, family=['Dejavu Sans', 'Dejavu']),
        legend_text=element_text(family=['Dejavu Sans', 'Dejavu']),
        legend_title=element_blank(),
    )
)

# This export is active: running the cell writes the PNG to the working directory.
poster_differences_plot_3mo.save("poster_differences_white_41x5_3mo.png", width=41, height=5, dpi=300, limitsize=False)

## Explore differences

In [0]:
# Per-day differences across all heroes using the Manhattan (cityblock) distance.
# This repeats the computation of the earlier Manhattan cell so that this
# section can be explored on its own.

all_differences_by_day_manhattan = all_days_difference(
    df_expl_graph, distance_function=distance.cityblock
).items()

# Same (day, difference) pairs, largest difference first
sorted_all_differences_by_day_manhattan = list(all_differences_by_day_manhattan)
sorted_all_differences_by_day_manhattan.sort(key=lambda entry: entry[1], reverse=True)

df_all_differences_by_day_manhattan = pd.DataFrame(all_differences_by_day_manhattan)
df_all_differences_by_day_manhattan.columns = ["Day", "Difference"]

In [15]:
sorted_all_differences_by_day_manhattan

[('2013-12-12', 0.9750055687168958),
 ('2014-02-05', 0.8242279484963592),
 ('2013-12-13', 0.6596366828661892),
 ('2013-11-14', 0.5139691979753223),
 ('2012-10-30', 0.4349738922191271),
 ('2014-02-11', 0.4316920136002234),
 ('2013-12-14', 0.42028029611226125),
 ('2013-11-15', 0.40513437996322454),
 ('2012-06-11', 0.3930933036353869),
 ('2013-11-16', 0.3361665664843392),
 ('2014-02-07', 0.3349776515566302),
 ('2012-10-31', 0.3348857817148077),
 ('2012-01-12', 0.3204836427595646),
 ('2012-12-19', 0.2952161439513207),
 ('2013-11-17', 0.29317449221044134),
 ('2015-03-02', 0.2879601839421128),
 ('2012-12-20', 0.27582265690682023),
 ('2015-03-03', 0.2702156429791214),
 ('2013-12-15', 0.2634679822967624),
 ('2012-06-12', 0.2626602025407133),
 ('2012-07-26', 0.2600311899981306),
 ('2012-11-01', 0.25650466006422834),
 ('2015-12-16', 0.255933331217879),
 ('2015-03-04', 0.24957686979179722),
 ('2014-02-14', 0.24790388494342264),
 ('2015-02-18', 0.24633238752617873),
 ('2015-02-17', 0.2401710902010