In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patheffects as pe
%matplotlib inline
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import re
import warnings
warnings.filterwarnings('ignore')

def minMaxScale(data, imposed_minimum = None, imposed_maximum = None, fit = False): #min max scaling
    mini = min(data)
    maxi = max(data)
    
    if (imposed_minimum != None): mini = imposed_minimum
    if (imposed_maximum != None): maxi = imposed_maximum
    if (fit == False):
        return (data - mini)/(maxi - mini)
    else: 
        return [mini, maxi]

In [2]:
#importing our clean data
data = pd.read_csv('../data/data.csv', index_col = 0)

In [3]:
display(data.sample())

Unnamed: 0,Actor_name,Actor_gender,Actor_date_of_birth,Movie_name,Movie_release_date,Movie_box_office_revenue,Movie_runtime,averageRating,numVotes,genres,countries,box_office_adj
20209,Madeleine Potter,F,1964,Hello Again,1987.0,20419446.0,96.0,6.54,4326,"[' ""Romantic comedy""', ' ""Heaven-Can-Wait Fant...","[' ""United States of America""}']",43445630.0


# Star score calculation

first we need to deflate the imdb ratings by the amount of votes they've gotten

In [4]:
data['numVotes_factor'] = minMaxScale(np.log(data['numVotes']), 0)
data['averageRating_adj'] =  data['averageRating']* data['numVotes_factor']

In [5]:
data['actor'] = data["Actor_name"].apply(lambda x : '_'.join(x.split())) #simplifying actor names

actorMetrics = pd.DataFrame(data['actor'].drop_duplicates().reset_index(drop = True)) #creation of starscore dataf

timeScores = pd.DataFrame(data['actor'].drop_duplicates().reset_index(drop = True))

In [7]:
metrics_avg = { 'box_office_adj' : 'revenue_avg', 'averageRating_adj' : 'grade_avg'}
metrics_sum = { 'box_office_adj' : 'revenue_sum', 'averageRating_adj' : 'grade_sum'}

for idx, (key, value) in enumerate(metrics_avg.items()):
    print(idx)
    lists_per_actor = data.groupby('actor').apply(lambda x : np.mean(x[key])).reindex(
        index = actorMetrics['actor'].values)
    actorMetrics[value] = lists_per_actor.values
       
for idx, (key, value) in enumerate(metrics_sum.items()):
    print(idx)
    lists_per_actor = data.groupby('actor').apply(lambda x : sum(x[key])).reindex(
        index = actorMetrics['actor'].values)
    actorMetrics[value] = lists_per_actor.values
    
    
appearances_per_actor = data.groupby('actor')['Movie_name'].count().reindex(
    index = actorMetrics['actor'].values).reset_index(drop = True)
actorMetrics['appearances'] = appearances_per_actor

0
1
0
1


In [8]:
longitudinal_interest_columns = {'box_office_adj': 'revenues',
                                 'Movie_release_date': 'movie_years', 'averageRating_adj': 'ratings'}

for idx, (key, value) in enumerate(longitudinal_interest_columns.items()):
    
    lists_per_actor = data.groupby('actor').apply(lambda x : pd.Series({key: x[key].values})).reindex(
        index = timeScores['actor'].values)
    timeScores[value] = lists_per_actor.values
    
#for col in longitudinal_interest_columns:
#  per_month[col] = selected_T4_ds.groupby('date_transformed').apply(lambda x: pd.Series({col: x[col].values }))

In [9]:
#to access a list :
#star_scores.query('actor == "Bruce_Willis"')['cumulative_revenue'].tolist()[0]
timeScores['cumulative_revenue'] = timeScores['revenues'].apply(lambda x : np.cumsum(x.tolist()))
timeScores['cumulative_ratings'] = timeScores['ratings'].apply(lambda x : np.cumsum(x.tolist()))

Example

In [10]:
timeScores.query('actor == "Bruce_Willis"')

Unnamed: 0,actor,revenues,movie_years,ratings,cumulative_revenue,cumulative_ratings
6221,Bruce_Willis,"[134943125.0, 83663223.40425533, 287281542.857...","[1982.0, 1987.0, 1988.0, 1988.0, 1989.0, 1989....","[5.234449774801497, 4.962270978137284, 6.85023...","[134943125.0, 218606348.40425533, 505887891.26...","[5.234449774801497, 10.19672075293878, 17.0469..."


In [11]:
timeScores['star_score'] = timeScores.apply(lambda x : np.log(x['cumulative_revenue'] * x['cumulative_ratings']),
                                            axis = 1)

In [12]:
timeScores.query('actor == "Bruce_Willis"').star_score.tolist()[0]

array([20.37562569, 21.52484935, 22.87779702, 23.16592681, 24.12066325,
       24.26186786, 24.43932385, 24.56774033, 25.03784568, 25.2167668 ,
       25.30309363, 25.38607554, 25.46254964, 25.68328199, 25.77828745,
       26.036364  , 26.09157519, 26.17052271, 26.23243044, 26.31510454,
       26.5161436 , 26.64345161, 26.91925832, 27.00653395, 27.0804974 ,
       27.11396059, 27.16636472, 27.40796109, 27.51062805, 27.57391951,
       27.63694881, 27.68006898, 27.71417254, 27.74477298, 27.79129007,
       27.86342092, 27.88950847, 27.98411682, 27.99370224, 28.03477429,
       28.08933249, 28.11001812, 28.14921902, 28.18301391, 28.25491294,
       28.27493007, 28.30365168, 28.38355376, 28.41081851, 28.43201317,
       28.45794694, 28.47418312, 28.51160685, 28.53836884, 28.57960625,
       28.63199162, 28.65012713, 28.68002949, 28.71376861, 28.76268324])

The formula for the star score is given below.

$star\ score = log\left(\overline{revenue} \cdot \overline{ratings_{normalized}}\right)$

To be noted that the star scored is defined on the 'career' of the actor, namely it is based on the average of the identified parameters over all his/her movies.

In [19]:
#log product = sum of log, we are using this because we have 2 very heavy tailed distributions

actorMetrics['score_sum'] = np.log(actorMetrics['revenue_sum'] * actorMetrics['grade_sum']) 
epfl_scaler = minMaxScale(actorMetrics['score_sum'], fit = True)
actorMetrics['EPFLsum'] = (minMaxScale(actorMetrics['score_sum']) * 5) + 1 #min max scaled to epfl standards
actorMetrics.head(3)

Unnamed: 0,actor,revenue_avg,grade_avg,revenue_sum,grade_sum,appearances,score_sum,scoreEPFL_sum,EPFLsum
0,Dustin_Farnum,6117500.0,2.861536,6117500.0,2.861536,1,16.678023,2.697314,2.697314
1,Elmer_Clifton,1250000000.0,4.791714,1250000000.0,4.791714,1,22.513298,4.217113,4.217113
2,Robert_Harron,1250000000.0,4.791714,1250000000.0,4.791714,1,22.513298,4.217113,4.217113


In [21]:
timeScores['EPFLsum'] = timeScores['star_score'].apply(
    lambda x: (5 * minMaxScale(x, epfl_scaler[0], epfl_scaler[1])) + 1)

In [22]:
display(timeScores.query('actor == "Bruce_Willis"').epfl_score.tolist()[0][-1],
        actorMetrics.query('actor == "Bruce_Willis"')['EPFLsum'])

5.844767263785556

6221    5.844767
Name: EPFLsum, dtype: float64

In [23]:
# lets get better parse on genres 
data['genres_corrected'] = data['genres'].apply(lambda x: str(x).split('"')[1::2])

In [26]:
list_to_join = data['genres_corrected'].values.tolist() #total genres not unique

`Main genre per actor`

In [27]:
genres_per_actor = data.groupby('actor').apply(
    lambda x :pd.Series({'genres': x['genres_corrected'].values})).reindex(index = timeScores['actor'].values)

timeScores['genres'] = genres_per_actor.values

In [28]:
genres_per_actor['genre_total'] = genres_per_actor['genres'].apply(lambda x : sum(x, [])).apply(
    lambda x : ['NaN'] if len(x) == 0 else x)

In [29]:
genres_per_actor['count_values'] = genres_per_actor['genre_total'].apply(
    lambda x : pd.DataFrame(pd.DataFrame({'gen' : x}).groupby(by = 'gen').size(), 
    columns = ['count']).sort_values(by = 'count', ascending = False)['count'].values)

In [30]:
genres_per_actor['unique_genres_order'] = genres_per_actor['genre_total'].apply(
    lambda x : pd.DataFrame(pd.DataFrame({'gen' : x}).groupby(by = 'gen').size(), 
    columns = ['count']).sort_values(by = 'count', ascending = False).index.tolist())

In [31]:
genres_per_actor['main'] = genres_per_actor['unique_genres_order'].apply(lambda x : x[0])
genres_per_actor['secondary'] = genres_per_actor['unique_genres_order'].apply(
    lambda x : x[1] if len(x) > 1 else None)

In [32]:
genres_to_merge = genres_per_actor.reset_index(drop = True)

In [33]:
#train a decision tree on the best actors, or hand made labels to get more interesting genres than costume drama

>make group similar genres and reduce them

In [36]:
timeScores['genres'] = genres_to_merge['unique_genres_order']
timeScores['genre_count'] = genres_to_merge['count_values']
timeScores['main_genre'] = genres_to_merge['main']
timeScores['2nd_genre'] = genres_to_merge['secondary']

In [37]:
timeScores.head(3)

Unnamed: 0,actor,revenues,movie_years,ratings,cumulative_revenue,cumulative_ratings,star_score,epfl_score,EPFLsum,genres,genre_count,main_genre,2nd_genre
0,Dustin_Farnum,[6117500.0],[1914.0],[2.861536471788676],[6117500.0],[2.861536471788676],[16.67802278277362],[2.6973142737054587],[2.6973142737054587],"[Black-and-white, Drama, Indie, Silent film, W...","[1, 1, 1, 1, 1]",Black-and-white,Drama
1,Elmer_Clifton,[1250000000.0],[1915.0],[4.791714035959664],[1250000000.0],[4.791714035959664],[22.513297571992556],[4.217113125596666],[4.217113125596666],"[Black-and-white, Costume drama, Drama, Epic, ...","[1, 1, 1, 1, 1, 1, 1]",Black-and-white,Costume drama
2,Robert_Harron,[1250000000.0],[1915.0],[4.791714035959664],[1250000000.0],[4.791714035959664],[22.513297571992556],[4.217113125596666],[4.217113125596666],"[Black-and-white, Costume drama, Drama, Epic, ...","[1, 1, 1, 1, 1, 1, 1]",Black-and-white,Costume drama


In [38]:
genres_per_actor.query('actor == "Bruce_Willis"')

Unnamed: 0_level_0,genres,genre_total,count_values,unique_genres_order,main,secondary
actor,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Bruce_Willis,"[[Film adaptation, Drama, Courtroom Drama], [R...","[Film adaptation, Drama, Courtroom Drama, Roma...","[32, 28, 26, 23, 23, 22, 14, 9, 7, 6, 6, 6, 6,...","[Thriller, Action, Drama, Comedy, Crime Fictio...",Thriller,Action


## data saving & loading fixes

In [41]:
timeScores.to_csv('../data/timeScores.csv', sep = '\t')
actorMetrics.to_csv('../data/actorMetrics.csv')

In [92]:
savedTimeScores = pd.read_csv('../data/timeScores.csv', sep = '\t', index_col = 0)

In [93]:
display(savedTimeScores.query('actor == "Bruce_Willis"')['genres'].values.tolist()[0])
usual_suspects = timeScores.columns.tolist()[1: -2]
print(usual_suspects)

'[\'Thriller\', \'Action\', \'Drama\', \'Comedy\', \'Crime Fiction\', \'Action/Adventure\', \'Crime Thriller\', \'Mystery\', \'Adventure\', \'Romance Film\', \'Science Fiction\', \'Black comedy\', \'Gangster Film\', \'Action Thrillers\', \'Crime Comedy\', \'Suspense\', \'Family Film\', \'Comedy-drama\', \'Psychological thriller\', \'Buddy film\', \'Romantic comedy\', \'Period piece\', \'War film\', \'Indie\', \'Ensemble Film\', \'Fantasy\', \'Coming of age\', \'Film adaptation\', \'Horror\', \'Political drama\', \'Parody\', \'Heist\', \'Slapstick\', \'Marriage Drama\', \'Supernatural\', \'Animation\', \'Detective\', \'Action Comedy\', \'Courtroom Drama\', \'Western\', \'Americana\', \'Superhero movie\', "Children\\\\\'s/Family", \'Satire\', \'Teen\', \'Caper story\', \'Neo-noir\', \'Tragicomedy\', \'Romantic drama\', \'Time travel\', \'Political thriller\', \'Superhero\', \'Slasher\', \'Martial Arts Film\', \'Absurdism\', \'Erotic thriller\', \'Jungle Film\', \'Computer Animation\', \'

['revenues', 'movie_years', 'ratings', 'cumulative_revenue', 'cumulative_ratings', 'star_score', 'epfl_score', 'EPFLsum', 'genres', 'genre_count']


In [98]:
def save_values(df, columns):

    for col in columns:
        if (col != 'genres'):
            df[col] = df[col].apply(lambda x : x[1:-1].split(' '))
        else:
            df[col] = df[col].apply(lambda x : x.split("\'")[1::2])
    
    return df

In [95]:
fixed = save_values(savedTimeScores, usual_suspects)

In [None]:
#subgenres
#drop low frequencies
#group genres 

In [97]:
#birthdate & directors

## networking

In [95]:
network = data[['actor', 'Movie_name']].copy()
network.head()

Unnamed: 0,actor,Movie_name
0,Dustin_Farnum,The Squaw Man
1,Elmer_Clifton,The Birth of a Nation
2,Robert_Harron,The Birth of a Nation
3,Walter_Long,The Birth of a Nation
4,Violet_Wilkey,The Birth of a Nation


In [96]:
frequencies = network.merge(
    network, on = 'Movie_name').groupby(['actor_x','actor_y']).size().unstack(
    'actor_y', fill_value = 0)

In [97]:
frequencies

actor_y,40_Glocc,50_Cent,A.D._Miles,A._J._Benza,A._J._Bowen,A._J._Buckley,A._J._Cook,A._J._Langer,A._Michael_Baldwin,A_Martinez,...,Özgü_Namal,Özgür_Çevik,Þröstur_Leó_Gunnarsson,İdil_Fırat,İsmail_Hacıoğlu,Şafak_Sezer,Şebnem_Dönmez,Željko_Ivanek,佐々木望,田村英里子
actor_x,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
40_Glocc,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
50_Cent,0,6,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A.D._Miles,0,0,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A._J._Benza,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A._J._Bowen,0,0,0,0,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Şafak_Sezer,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,4,0,0,0,0
Şebnem_Dönmez,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,1,0,0,0
Željko_Ivanek,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,17,0,0
佐々木望,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [98]:
degrees = frequencies.astype('bool').sum(axis = 0) - 1

In [78]:
len(timeScores['actor'].values)

21924

In [None]:
star_scores

Unnamed: 0,actor,rev_sum,grade_sum,appearances,score,score_epfl
0,Dustin_Farnum,6.117500e+06,2.861536,1.0,16.678023,2.697314
1,Elmer_Clifton,1.250000e+09,4.791714,1.0,22.513298,4.217113
2,Robert_Harron,1.250000e+09,4.791714,1.0,22.513298,4.217113
3,Walter_Long,1.250000e+09,4.791714,1.0,22.513298,4.217113
4,Violet_Wilkey,1.250000e+09,4.791714,1.0,22.513298,4.217113
...,...,...,...,...,...,...
21919,Kathleen_Rose_Perkins,4.254150e+06,4.961153,1.0,16.865044,2.746024
21920,Bo_Barrett,4.254150e+06,4.961153,1.0,16.865044,2.746024
21921,Odeya_Rush,5.256953e+07,5.482188,1.0,19.479151,3.426869
21922,Kendall_Ryan_Sanders,5.256953e+07,5.482188,1.0,19.479151,3.426869


In [99]:
degrees_ordered = pd.DataFrame(degrees, columns = ['degree']).reindex(star_scores['actor'].values).reset_index()

In [101]:
star_scores['degree'] = degrees_ordered['degree']

In [51]:
import networkx as nx

In [104]:
Graph = nx.Graph()
Graph.add_nodes_from(frequencies.index)


for i in range(len(frequencies)):
    if (i%1000 == 0): print(i)
    for j in range(len(frequencies)):
        if (frequencies.iloc[i, j] != 0) & (i != j):
            Graph.add_edge(i, j, weight = frequencies.iloc[i, j])

'Graph = nx.Graph()\nGraph.add_nodes_from(frequencies.index)\n\n\nfor i in range(len(frequencies)):\n    if (i%1000 == 0): print(i)\n    for j in range(len(frequencies)):\n        if (frequencies.iloc[i, j] != 0) & (i != j):\n            Graph.add_edge(i, j, weight = frequencies.iloc[i, j])'

## HUGE CHUNGUS DOWN BELOW (genres)

In [38]:
genres_list = sum(list_to_join, [])

In [49]:
df_genres = pd.DataFrame({'genres' : genres_list}) #len 433190

In [50]:
df_genres.drop_duplicates() #308 genres

Unnamed: 0,genres
0,Silent film
1,Western
2,Drama
3,Indie
4,Black-and-white
...,...
396581,Nature
414077,News
414421,Fictional film
417054,Bloopers & Candid Camera


In [58]:
genre_occurences = pd.DataFrame(df_genres.groupby(['genres']).size(), columns = ['count'])

In [59]:
genre_occurences['count_percentage'] = genre_occurences['count']/len(df_genres)
display(genre_occurences)

Unnamed: 0_level_0,count,count_percentage
genres,Unnamed: 1_level_1,Unnamed: 2_level_1
Absurdism,732,0.001690
Acid western,29,0.000067
Action,18710,0.043191
Action Comedy,916,0.002115
Action Thrillers,2103,0.004855
...,...,...
Workplace Comedy,506,0.001168
World History,3,0.000007
World cinema,3450,0.007964
Wuxia,46,0.000106


In [66]:
genre_occurences.sort_values(by ='count_percentage', ascending = False).head(50)

Unnamed: 0_level_0,count,count_percentage
genres,Unnamed: 1_level_1,Unnamed: 2_level_1
Drama,41556,0.09593
Comedy,33827,0.078088
Thriller,20337,0.046947
Romance Film,20180,0.046585
Action,18710,0.043191
Action/Adventure,14525,0.03353
Crime Fiction,13189,0.030446
Adventure,12871,0.029712
Family Film,9574,0.022101
Romantic comedy,9498,0.021926
