<a href="https://colab.research.google.com/github/mnijhuis-dnb/open_source_workshop/blob/master/Bechdel_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Load the packages, including the urllib & json for getting the data

In [None]:
import urllib.request, json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

Retrieve the data from the JSON API and read it into a DataFrame

In [None]:
# Bechdel test API endpoint (getAllMovies) that serves the movie list as JSON
BECHDEL_API_URL = 'http://bechdeltest.com/api/v1/getAllMovies'
df = pd.read_json(BECHDEL_API_URL)
# Offline alternative: read a local copy of the JSON instead of calling the API
# df = pd.read_json('Bechdel.json')

Show what is in the data

In [None]:
# Preview the first five rows of the raw movie data
df.head(n=5)

Make a pivot table of the data

In [None]:
# Count the movie ids for every (year, rating) combination: one row per year,
# one column per rating. Combinations without any movie are filled with 0.
df_pivot = df.pivot_table(
    values='id',
    index='year',
    columns='rating',
    aggfunc='count',
).fillna(0)

Show the resulting table

In [None]:
# Preview the first five years of the per-rating counts
df_pivot.head(n=5)

Try to plot the data

In [None]:
# Quick look: one line per rating column, with the year index on the x-axis
df_pivot.plot(kind='line')

Make a number (the year divided by 10, rounded down) that indicates which decade each year belongs to

In [None]:
# Decade number of each year (e.g. 1994 -> 199). Floor division on the year
# index keeps integer years as integers, whereas np.floor(.../10) produced
# floats such as 199.0.
df_pivot['year10'] = df_pivot.index // 10

Filter on the decades with enough data

In [None]:
# Keep only the years whose decade number lies strictly between 182 and 212.
# NOTE(review): these bounds admit decade numbers 183 through 211 (1830s-2110s),
# while the 3x3 figure at the end of the notebook only draws the first nine
# decade rows - confirm that these are the intended bounds.
decade_number = df_pivot['year10']
in_range = decade_number.gt(182) & decade_number.lt(212)
df_pivot = df_pivot[in_range]

Group the yearly counts by decade and sum them to aggregate the data

In [None]:
# Collapse the yearly rows into one row per decade by summing the rating counts;
# the decade number (year10) becomes the index of the result
per_decade = df_pivot.groupby('year10')
df10 = per_decade.sum()

Show the resulting data

In [None]:
# Display the summed per-decade table (one row per year10 value)
df10

Change from numbers to a percentage of the movies of each decade

In [None]:
# Turn the counts into shares of each decade's total: every row is divided by
# its own row sum. DataFrame.div with axis=0 aligns the totals on the row index
# and computes them once, instead of once per column as the apply/lambda did.
df10 = df10.div(df10.sum(axis=1), axis=0)

Make a plot

In [None]:
# Generate a figure with 9 subplots (3x3 grid), with shared x and y
fig, ax = plt.subplots(nrows=3, ncols=3, tight_layout=True, figsize=(16, 8), sharex=True, sharey=True)

# Make 4 colors for the plot (RGB tuples in the 0-1 range), one per bar
colors = [(0, 85/255, 102/255), (0, 138/255, 166/255), (0, 170/255, 204/255), (0, 192/255, 232/255)]

# Set the labels for the 4 categories
# NOTE(review): assumed to follow the column order of df10 - verify
labels = ['Less than 2 women', '2 or more women', '2 women have a conversation', '2 women have a conversation not about a man']

# The percentage label is shifted 0.05 to the left of the bar end, so a bar
# narrower than this would push the (white) label to a negative x position
min_inside_width = 0.05

panels = ax.flatten()

# Pair each subplot with one decade row. zip stops at the shorter of the two,
# so having fewer than 9 decades no longer raises an IndexError; with 9 or more
# rows the first 9 are drawn.
for axes, (decade, shares) in zip(panels, df10.iterrows()):
    # Plot a horizontal bar for each of the categories of this decade
    bars = axes.barh(df10.columns, width=shares, height=0.9, color=colors)
    # Set the title of each subplot to the decade
    axes.set_title(f'{int(decade*10)}\'s')
    # Remove the spines
    for side in ("right", "top", "bottom"):
        axes.spines[side].set_visible(False)
    # Remove the ticks
    axes.tick_params(top=False,
                     bottom=False,
                     left=False,
                     right=False,
                     labelleft=False,
                     labelbottom=False)
    # Add a percentage to the bars
    for bar in bars:
        share = bar.get_width()
        text_y = bar.get_y() + bar.get_height() / 2
        if share >= min_inside_width:
            # Wide enough: white label inside the bar, near its right end
            text_x, text_color = share - 0.05, 'white'
        else:
            # Too narrow: put the label just right of the bar, in the bar's
            # own colour so it stays visible
            text_x, text_color = share + 0.01, bar.get_facecolor()
        axes.text(text_x,
                  text_y,
                  '{:.0%}'.format(share),
                  va='center',
                  color=text_color,
                  fontsize=12)

# Hide the subplots that did not receive a decade (only when df10 has < 9 rows)
for axes in panels[len(df10):]:
    axes.set_visible(False)

# Make a legend: one coloured rectangle per category, centred below the figure
handles = [plt.Rectangle((0, 0), 1, 1, color=color_label) for color_label in colors]
fig.legend(handles, labels, loc='center', bbox_to_anchor=(0.5, -.05), ncol=4, prop={'size': 14}, bbox_transform=fig.transFigure)