In [9]:
import pandas as pd
import numpy as np
import os
import requests
import matplotlib.pyplot as plt

# Directory (relative to the notebook) that holds the input csv and derived files.
data_path = r'../data'

# data_file is the csv loaded just below; out_file is a second path in the same
# directory that this cell only defines, it does not write to it.
data_file, out_file = (
    os.path.join(data_path, basename) for basename in ('train.csv', 'train2.csv')
)

# Load the csv once into `df`, the frame the groupby cell works on.
df = pd.read_csv(data_file, encoding='utf-8')

In [74]:
# Notes on the variable dependencies, written as `variable|conditioning variables`
# (entries without `|` have nothing to condition on). Not every entry listed in
# the notes has a matching groupby in this cell.
"""
    major,
    genre,
    cast_popularity,
    vote_avg,
    us|major,
    cast_popularity|genre
    budget|major, genre, cast_popularity
    movie_popularity|genre, us, cast_popularity, vote_avg_community, vote_avg_critics
    vote_count_community|movie_popularity
    vote_count_critics|movie_popularity
    revenue|movie_popularity
"""
# One entry per table to compute. A bare string groups by that single column;
# a list groups by several columns. Following the filename built in the loop
# further down, the last column of a list is the tabulated variable and the
# columns before it are the ones it is conditioned on.
grouping_columns = [
    'major',
    'vote_average_binned',
    'cast_popularity_binned',
    'macro_genre',
    ['major', 'macro_genre'],
    ['major', 'us'],
    ['macro_genre', 'cast_popularity_binned'],
    ['major', 'cast_popularity_binned'],
    ['major', 'macro_genre', 'cast_popularity_binned', 'budget_binned'],
]
groups = [df.groupby(columns) for columns in grouping_columns]

# We don't have this quantity in the data -- it is what we want to make inference about,
# so the groupby below stays commented out.
#moviepop_groupby = df.groupby(['macro_genre', 'us', 'cast_popularity_category', 'vote_average_category'])
# group_count = groups[8].size()
# lvls = list(range(0, len(names)-1))
#     for lvl in lvls:
#         group_count.unstack(fill_value=0).stack()
# print(.unstack(fill_value=0).stack())
# return

# Compute one conditional probability table (CPT) per groupby and save each to
# its own csv file. Convention: the LAST grouping column is the variable whose
# distribution is tabulated; the columns before it are the ones conditioned on.
cpt_dir = os.path.join(data_path, 'cpt')
os.makedirs(cpt_dir, exist_ok=True)  # to_csv does not create missing directories

for g in groups:
    group_count = g.size()
    names = list(group_count.index.names)

    if len(names) > 1:
        # Give never-observed combinations an explicit count of 0, see
        # https://stackoverflow.com/questions/42854801/including-missing-combinations-of-values-in-a-pandas-groupby-aggregation
        # Reindexing over the full cartesian product keeps the level order equal
        # to `names`. Repeated unstack(level=-lvl).stack() moves the unstacked
        # level to the end on every pass, so with 3+ columns the tabulated
        # variable no longer sat in the last level: the table was normalised
        # over the wrong variable while the filename still used names[-1].
        observed = group_count.index.remove_unused_levels()
        full_index = pd.MultiIndex.from_product(observed.levels, names=names)
        group_count = group_count.reindex(full_index, fill_value=0)

        # Normalise within each combination of conditioning values (every level
        # but the last). transform('sum') returns totals aligned with the
        # original index, so the result keeps exactly the levels in `names`
        # regardless of how the installed pandas handles group keys in
        # groupby.apply.
        # NOTE: a combination of conditioning values that never occurs has a
        # total of 0 and therefore produces NaN rows (0/0).
        given_levels = list(range(len(names) - 1))
        totals = group_count.groupby(level=given_levels).transform('sum')
        conditional = group_count / totals
        # (the joint distribution would be group_count / len(df))
        filename = '%s-%s.csv' % (names[-1], ','.join(names[:-1]))
    else:
        # A single column has nothing to condition on: plain marginal distribution.
        conditional = group_count / group_count.sum()
        filename = '%s.csv' % names[0]

    # Save cpt to csv files. One file per cpt
    conditional.to_csv(os.path.join(cpt_dir, filename), header=True, encoding='utf-8')

    print('conditional:\n')
    print(conditional)
    print('\n')

conditional:

major
0    0.633909
1    0.366091
dtype: float64


conditional:

vote_average_binned
bad      0.071316
great    0.197612
ok       0.731071
dtype: float64


conditional:

cast_popularity_binned
1st    0.946311
2nd    0.050863
3rd    0.002826
dtype: float64


conditional:

macro_genre
action    0.393407
dark      0.244898
drama     0.062166
light     0.285714
other     0.013815
dtype: float64


conditional:

major  macro_genre
0      action         0.358593
       dark           0.264487
       drama          0.070332
       light          0.290243
       other          0.016345
1      action         0.453688
       dark           0.210978
       drama          0.048027
       light          0.277873
       other          0.009434
dtype: float64


conditional:

major  us
0      0     0.129767
       1     0.870233
1      0     0.030875
       1     0.969125
dtype: float64


conditional:

macro_genre  cast_popularity_binned
action       1st                       0.915403
   