In [2]:
import os
import re
import json
import glob
import pandas as pd
import numpy as np
import utils
import cleaning
import analysis

In [3]:
# Load the cleaned IMDb principals table and keep only the rows whose
# category is exactly "actor".
# NOTE(review): if the data uses a separate "actress" category (TODO confirm
# against the raw file), those credits are excluded by this filter.
principals_path = os.path.join('cleanData', 'imdb.title.principals.json.gz')
act_df = pd.read_json(principals_path)
act_df = act_df[act_df['category'] == 'actor']
act_df

Unnamed: 0,tconst,ordering,nconst,category
0,tt0111414,1,nm0246005,actor
5,tt0323808,2,nm2694680,actor
6,tt0323808,3,nm0574615,actor
14,tt0417610,1,nm0532721,actor
16,tt0417610,3,nm0069209,actor
...,...,...,...,...
1028175,tt9681728,9,nm10397910,actor
1028176,tt9689618,1,nm10439726,actor
1028177,tt9689618,2,nm10439727,actor
1028178,tt9689618,3,nm10439724,actor


In [4]:
# Lookup table from person id (nconst) to display name (primary_name).
# Only these two columns are used downstream, so ask the parser for just
# them instead of loading every column of the file and discarding the rest.
name_cols = ['nconst', 'primary_name']
name_df = pd.read_csv(
    os.path.join('zippedData', 'imdb.name.basics.csv.gz'),
    usecols=name_cols,
)
# usecols keeps the file's column order, not the list's; re-select to pin it.
name_df = name_df.loc[:, name_cols]
name_df

Unnamed: 0,nconst,primary_name
0,nm0061671,Mary Ellen Bauder
1,nm0061865,Joseph Bauer
2,nm0062070,Bruce Baum
3,nm0062195,Axel Baumann
4,nm0062798,Pete Baxter
...,...,...
606643,nm9990381,Susan Grobes
606644,nm9990690,Joo Yeon So
606645,nm9991320,Madeline Smith
606646,nm9991786,Michelle Modigliani


In [5]:
# Attach each credited person's display name. The left join keeps every
# actor credit even when its nconst has no row in name_df (primary_name is
# NaN for those).
# Caution: this rebinds act_df, so running the cell a second time merges
# again and produces primary_name_x / primary_name_y columns.
act_df = pd.merge(act_df, name_df, how='left', on='nconst')
act_df

Unnamed: 0,tconst,ordering,nconst,category,primary_name
0,tt0111414,1,nm0246005,actor,Tommy Dysart
1,tt0323808,2,nm2694680,actor,Henry Garrett
2,tt0323808,3,nm0574615,actor,Graham McTavish
3,tt0417610,1,nm0532721,actor,Luis Machín
4,tt0417610,3,nm0069209,actor,Carlos Belloso
...,...,...,...,...,...
256713,tt9681728,9,nm10397910,actor,Danil Tyabin
256714,tt9689618,1,nm10439726,actor,Christian Lacroix
256715,tt9689618,2,nm10439727,actor,Christine Spengler
256716,tt9689618,3,nm10439724,actor,Phillippe Warner


In [6]:
# Load the per-title budget/gross/profit table; release_date is stored as
# epoch milliseconds and is parsed into datetimes on read.
profit_path = os.path.join('cleanData', 'imdb.tn.profit.json.gz')
imdb_tn = pd.read_json(
    profit_path,
    convert_dates=['release_date'],
    date_unit='ms',
)
imdb_tn

Unnamed: 0,tconst,title,release_year,runtime_minutes,genres,release_date,production_budget,domestic_gross,worldwide_gross,tn_id,worldwide_profit,domestic_profit
0,tt0359950,The Secret Life of Walter Mitty,2013,114.0,"[Adventure, Comedy, Drama]",2013-12-25,91000000,58236838,187861183,436,96861183,-32763162
1,tt0365907,A Walk Among the Tombstones,2014,114.0,"[Action, Crime, Drama]",2014-09-19,28000000,26017685,62108587,2066,34108587,-1982315
2,tt0369610,Jurassic World,2015,124.0,"[Action, Adventure, Sci-Fi]",2015-06-12,215000000,652270625,1648854864,33,1433854864,437270625
3,tt0376136,The Rum Diary,2011,119.0,"[Comedy, Drama]",2011-10-28,45000000,13109815,21544732,1315,-23455268,-31890185
4,tt0383010,The Three Stooges,2012,92.0,"[Comedy, Family]",2012-04-13,30000000,44338224,54052249,1903,24052249,14338224
...,...,...,...,...,...,...,...,...,...,...,...,...
1417,tt7958736,Ma,2019,99.0,"[Horror, Thriller]",2019-05-31,5000000,36049540,44300625,4265,39300625,31049540
1418,tt7959026,The Mule,2018,116.0,"[Crime, Drama, Thriller]",2018-12-14,50000000,103804407,170857676,1099,120857676,53804407
1419,tt8155288,Happy Death Day 2U,2019,100.0,"[Drama, Horror, Mystery]",2019-02-13,9000000,28051045,64179495,3777,55179495,19051045
1420,tt8632862,Fahrenheit 11/9,2018,128.0,[Documentary],2018-09-21,5000000,6352306,6653715,4328,1653715,1352306


In [7]:
# Bucket titles into ten quantile-based bins of worldwide profit, labelled
# 0 (lowest profit) through 9 (highest profit).
decile_labels = range(10)
imdb_tn['profit_decile'] = pd.qcut(
    x=imdb_tn['worldwide_profit'],
    q=10,
    labels=decile_labels,
)

In [8]:
# Slim profit table: one row per title with the columns needed per credit.
profit_cols = ['tconst', 'title', 'domestic_profit', 'worldwide_profit', 'profit_decile']
profit_df = imdb_tn[profit_cols]

# The inner join drops actor credits for titles with no profit data.
# Caution: this rebinds act_df, so re-running the cell merges again and
# produces suffixed (_x / _y) copies of the profit columns.
act_df = pd.merge(act_df, profit_df, how='inner', on='tconst')
act_df

Unnamed: 0,tconst,ordering,nconst,category,primary_name,title,domestic_profit,worldwide_profit,profit_decile
0,tt0475290,1,nm0000982,actor,Josh Brolin,"Hail, Caesar!",8080225,42160680,5
1,tt0475290,2,nm0000123,actor,George Clooney,"Hail, Caesar!",8080225,42160680,5
2,tt0475290,3,nm2403277,actor,Alden Ehrenreich,"Hail, Caesar!",8080225,42160680,5
3,tt0475290,4,nm0000146,actor,Ralph Fiennes,"Hail, Caesar!",8080225,42160680,5
4,tt1124037,1,nm0000190,actor,Matthew McConaughey,Free State of Jones,-29189964,-26762748,0
...,...,...,...,...,...,...,...,...,...
3461,tt6266538,1,nm0000288,actor,Christian Bale,Vice,-12163718,10883171,3
3462,tt6266538,3,nm0136797,actor,Steve Carell,Vice,-12163718,10883171,3
3463,tt6266538,4,nm0005377,actor,Sam Rockwell,Vice,-12163718,10883171,3
3464,tt6628394,1,nm0000313,actor,Jeff Bridges,Bad Times at the El Royale,-14160885,-2795621,1


In [None]:
# Build a combination table from act_df keyed on 'tconst' and 'nconst' using
# the project helper analysis.combo_table.
# NOTE(review): combo_table is defined outside this notebook; it presumably
# produces the same title x actor membership table that the following cells
# build by hand — confirm against analysis.py.
# This cell has no execution count (In [None]), i.e. it had not been run
# when the notebook was saved, so it has no recorded output.
combos2 = analysis.combo_table(act_df, 'tconst', 'nconst')
combos2

In [19]:
# Empty (all-NaN) title x actor grid: one row per distinct tconst and one
# column per distinct nconst, each in order of first appearance in act_df.
title_ids = act_df['tconst'].unique()
actor_ids = act_df['nconst'].unique()
combos = pd.DataFrame(index=title_ids, columns=actor_ids)
combos

Unnamed: 0,nm0000982,nm0000123,nm2403277,nm0000146,nm0000190,nm0991810,nm0328828,nm0004395,nm0001475,nm1706767,...,nm3450051,nm7059572,nm5618577,nm5237225,nm8722479,nm0012081,nm0710330,nm3982894,nm6676262,nm6561765
tt0475290,,,,,,,,,,,...,,,,,,,,,,
tt1124037,,,,,,,,,,,...,,,,,,,,,,
tt1216492,,,,,,,,,,,...,,,,,,,,,,
tt1226229,,,,,,,,,,,...,,,,,,,,,,
tt1251743,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt7634968,,,,,,,,,,,...,,,,,,,,,,
tt7153766,,,,,,,,,,,...,,,,,,,,,,
tt7752126,,,,,,,,,,,...,,,,,,,,,,
tt6266538,,,,,,,,,,,...,,,,,,,,,,


In [44]:
# Series of actor ids (nconst) indexed by title id (tconst). The index is
# intentionally non-unique: a title repeats once per credited actor.
id_mapping = (
    act_df[['tconst', 'nconst']]
    .set_index('tconst')
    ['nconst']
)
id_mapping

tconst
tt0475290    nm0000982
tt0475290    nm0000123
tt0475290    nm2403277
tt0475290    nm0000146
tt1124037    nm0000190
               ...    
tt6266538    nm0000288
tt6266538    nm0136797
tt6266538    nm0005377
tt6628394    nm0000313
tt6628394    nm0358316
Name: nconst, Length: 3466, dtype: object

In [42]:
# Fill the title x actor grid: combos.loc[t, a] is True when actor `a` is
# credited on title `t` according to id_mapping, and False otherwise.
#
# A single vectorized scatter replaces the former iterrows() loop, which did
# one .loc row assignment per title into an object-dtype frame. The result
# now has a real bool dtype, and the cell gives the same answer when re-run.
#
# get_indexer needs unique labels; combos' index and columns were built from
# .unique(), so that holds.
title_pos = combos.index.get_indexer(id_mapping.index)
actor_pos = combos.columns.get_indexer(id_mapping.values)

# get_indexer returns -1 for labels that are not in the grid; skip those
# pairs so a -1 cannot wrap around and flag the last row/column.
in_grid = (title_pos >= 0) & (actor_pos >= 0)

credited = np.zeros(combos.shape, dtype=bool)
credited[title_pos[in_grid], actor_pos[in_grid]] = True

combos = pd.DataFrame(credited, index=combos.index, columns=combos.columns)
combos

Unnamed: 0,nm0000982,nm0000123,nm2403277,nm0000146,nm0000190,nm0991810,nm0328828,nm0004395,nm0001475,nm1706767,...,nm3450051,nm7059572,nm5618577,nm5237225,nm8722479,nm0012081,nm0710330,nm3982894,nm6676262,nm6561765
tt0475290,True,True,True,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
tt1124037,False,False,False,False,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
tt1216492,False,False,False,False,False,False,True,True,True,False,...,False,False,False,False,False,False,False,False,False,False
tt1226229,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
tt1251743,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
tt7634968,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
tt7153766,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False
tt7752126,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,True
tt6266538,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
