In [57]:
import pandas as pd
import numpy as np
import os
import sys
import requests
import matplotlib.pyplot as plt

from numpy import *

# Folder that holds the input CSV, relative to the notebook's working directory.
data_path = r'../data'
data_file = os.path.join(data_path, 'train.csv')

# Read the training set from disk as UTF-8 encoded CSV.
df = pd.read_csv(filepath_or_buffer=data_file, encoding='utf-8')

In [59]:
"""
    major,
    vote_avg,
    genre|major,
    us|major,
    cast_popularity|budget,
    budget|major,genre
    vote_count_community|movie_popularity
    vote_count_critics|movie_popularity
    revenue|movie_popularity
    
    this we don't have:
        movie_popularity|genre, us, cast_popularity, vote_avg_community, vote_avg_critics
"""
# Grouping keys, one entry per table to build. In every multi-column entry the
# LAST column is the conditioned variable and the preceding ones are the
# variables it is conditioned on.
_cpt_keys = [
    'major',
    'vote_average_binned',
    ['major', 'macro_genre'],
    ['major', 'us'],
    ['budget_binned', 'cast_popularity_binned'],
    ['major', 'macro_genre', 'budget_binned'],
]

# One GroupBy object per entry, in the same order as the keys above.
groups = [df.groupby(keys) for keys in _cpt_keys]

# We don't have data for this one: it is the variable we want to make inference about.
#moviepop_groupby = df.groupby(['macro_genre', 'us', 'cast_popularity_category', 'vote_average_category'])

# Compute one conditional probability table (CPT) per grouping and save each one
# to its own CSV file. For a grouping with several keys, the last key is the
# conditioned variable and all preceding keys are the conditioning variables, so
# each value is P(last key | preceding keys). A single-key grouping yields the
# plain (marginal) distribution of that key.
cpt_dir = os.path.join(data_path, 'cpt')
# Create the output folder up front so to_csv does not fail when it is missing.
if not os.path.isdir(cpt_dir):
    os.makedirs(cpt_dir)

for g in groups:
    # Number of rows observed for each combination of the grouping keys.
    group_count = g.size()
    names = group_count.index.names

    if len(names) > 1:
        # size() only lists combinations that occur in the data. Reindex over the
        # full cartesian product of the key values so that absent combinations
        # show up explicitly with a count of 0.
        # see https://stackoverflow.com/questions/42854801/including-missing-combinations-of-values-in-a-pandas-groupby-aggregation
        all_combinations = pd.MultiIndex.from_product(group_count.index.levels, names=names)
        group_count = group_count.reindex(all_combinations, fill_value=0)

        # Divide every count by the total of its parent combination (all keys but
        # the last one). transform('sum') returns a Series aligned on the original
        # index, so no extra group-key levels get prepended to the result, which
        # groupby(...).apply(...) may do depending on the pandas version.
        levels = list(range(len(names) - 1))
        parent_total = group_count.groupby(level=levels).transform('sum')
        conditional = group_count / parent_total
        filename = '%s-%s.csv' % (names[-1], ','.join(names[0:-1]))
    else:
        conditional = group_count / group_count.sum()
        filename = '%s.csv' % names[0]

    # A parent combination with no rows gives 0/0 = NaN; store those entries as 0.
    conditional = conditional.fillna(0)

    # Save cpt to csv files. One file per cpt
    conditional.to_csv(os.path.join(cpt_dir, filename), header=True, encoding='utf-8')

    print('conditional:\n')
    print(conditional)
    print('\n')

conditional:

major
0    0.633909
1    0.366091
dtype: float64


conditional:

vote_average_binned
bad      0.071316
great    0.197612
ok       0.731071
dtype: float64


conditional:

major  macro_genre
0      action         0.357107
       dark           0.262506
       light          0.281823
       other          0.098564
1      action         0.451973
       dark           0.210978
       light          0.273585
       other          0.063465
dtype: float64


conditional:

major  us
0      0     0.129767
       1     0.870233
1      0     0.030875
       1     0.969125
dtype: float64


conditional:

budget_binned  cast_popularity_binned
avg            1st                       0.974359
               2nd                       0.025641
               3rd                       0.000000
high           1st                       0.851636
               2nd                       0.139019
               3rd                       0.009346
low            1st                       0.994798
 