In [1]:
import pandas as pd
import numpy as np
from functools import reduce

In [2]:
# Input 
csv_file = "output_Air Bud Seventh Inning Fetch (2002).csv"
csv_file_ace_ventura = "https://raw.githubusercontent.com/moe221/diversity_in_cinema/moe221_prediction_pipeline/results/ace_ventura_results.csv"
csv_file_A_team = "output_The A-Team (2010).csv"

In [3]:
df_movie = pd.read_csv(csv_file_ace_ventura)
df_movie

Unnamed: 0.1,Unnamed: 0,frame_number,face_id,gender,race
0,0,31.0,0.0,Man,white
1,0,51.0,1.0,Man,white
2,0,71.0,2.0,Man,black
3,0,91.0,3.0,Man,latino hispanic
4,0,101.0,4.0,Man,asian
...,...,...,...,...,...
1171,0,9741.0,1171.0,Man,black
1172,1,9741.0,1172.0,Man,black
1173,2,9741.0,1173.0,Woman,white
1174,0,9761.0,1174.0,Man,asian


In [4]:
df_movie = df_movie.drop(columns=['Unnamed: 0'])
df_movie

Unnamed: 0,frame_number,face_id,gender,race
0,31.0,0.0,Man,white
1,51.0,1.0,Man,white
2,71.0,2.0,Man,black
3,91.0,3.0,Man,latino hispanic
4,101.0,4.0,Man,asian
...,...,...,...,...
1171,9741.0,1171.0,Man,black
1172,9741.0,1172.0,Man,black
1173,9741.0,1173.0,Woman,white
1174,9761.0,1174.0,Man,asian


# Movie data

## Movie length

Number of genders and races

In [5]:
gender_number = pd.unique(df_movie['gender'])
gender_number

array(['Man', 'Woman'], dtype=object)

In [6]:
race_number = pd.unique(df_movie['race'])
race_number

array(['white', 'black', 'latino hispanic', 'asian', 'indian',
       'middle eastern'], dtype=object)

# Total number

## people

In [7]:
for gender in gender_number:
    print(len(df_movie[df_movie['gender'] == gender]))

954
222


In [8]:
for race in race_number:
    print(len(df_movie[df_movie['race'] == race]))

800
72
98
103
7
96


# Number of frames

In [9]:
df = df_movie
df

Unnamed: 0,frame_number,face_id,gender,race
0,31.0,0.0,Man,white
1,51.0,1.0,Man,white
2,71.0,2.0,Man,black
3,91.0,3.0,Man,latino hispanic
4,101.0,4.0,Man,asian
...,...,...,...,...
1171,9741.0,1171.0,Man,black
1172,9741.0,1172.0,Man,black
1173,9741.0,1173.0,Woman,white
1174,9761.0,1174.0,Man,asian


In [45]:
df.groupby(["frame_number"]).nunique()["gender"]

frame_number
31.0      1
51.0      1
71.0      1
91.0      1
101.0     1
         ..
9711.0    1
9731.0    2
9741.0    2
9761.0    1
9771.0    1
Name: gender, Length: 687, dtype: int64

In [11]:
df.loc[1170:1175, :]

Unnamed: 0,frame_number,face_id,gender,race
1170,9731.0,1170.0,Man,white
1171,9741.0,1171.0,Man,black
1172,9741.0,1172.0,Man,black
1173,9741.0,1173.0,Woman,white
1174,9761.0,1174.0,Man,asian
1175,9771.0,1175.0,Man,white


In [46]:
df2 = df.groupby(["frame_number"]).nunique()["gender"].reset_index()
df2

Unnamed: 0,frame_number,gender
0,31.0,1
1,51.0,1
2,71.0,1
3,91.0,1
4,101.0,1
...,...,...
682,9711.0,1
683,9731.0,2
684,9741.0,2
685,9761.0,1


In [13]:
df['gender']

0         Man
1         Man
2         Man
3         Man
4         Man
        ...  
1171      Man
1172      Man
1173    Woman
1174      Man
1175      Man
Name: gender, Length: 1176, dtype: object

In [47]:
df2['gender_value'] = df['gender']
df2

Unnamed: 0,frame_number,gender,gender_value
0,31.0,1,Man
1,51.0,1,Man
2,71.0,1,Man
3,91.0,1,Man
4,101.0,1,Man
...,...,...,...
682,9711.0,1,Woman
683,9731.0,2,Man
684,9741.0,2,Man
685,9761.0,1,Woman


In [15]:
df3 = df2[df2.gender == 1]
df3

Unnamed: 0,frame_number,gender,gender_value
0,31.0,1,Man
1,51.0,1,Man
2,71.0,1,Man
3,91.0,1,Man
4,101.0,1,Man
...,...,...,...
680,9691.0,1,Man
681,9701.0,1,Woman
682,9711.0,1,Woman
685,9761.0,1,Woman


In [16]:
df3.groupby('gender_value')['gender_value'].count()

gender_value
Man      433
Woman    108
Name: gender_value, dtype: int64

# Average screentime

## Gender

In [17]:
df_gender = df.groupby(['gender']).count().reset_index().drop(columns=['face_id','race'])
df_gender

Unnamed: 0,gender,frame_number
0,Man,954
1,Woman,222


In [18]:
df.groupby(['gender']).count()

Unnamed: 0_level_0,frame_number,face_id,race
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Man,954,954,954
Woman,222,222,222


In [19]:
df_gender['gender_screentime']=df_gender['frame_number']/df_gender['frame_number'].sum()*100
df_gender

Unnamed: 0,gender,frame_number,gender_screentime
0,Man,954,81.122449
1,Woman,222,18.877551


In [20]:
df_only = df3.groupby(['gender_value']).count().reset_index().drop(columns=['gender'])
df_only

Unnamed: 0,gender_value,frame_number
0,Man,433
1,Woman,108


In [21]:
df_only = df_only.rename(columns={'frame_number': 'only_1_gender', 'gender_value':'gender'})
df_only

Unnamed: 0,gender,only_1_gender
0,Man,433
1,Woman,108


In [22]:
dfg = [df_gender, df_only]
dfg

[  gender  frame_number  gender_screentime
 0    Man           954          81.122449
 1  Woman           222          18.877551,
   gender  only_1_gender
 0    Man            433
 1  Woman            108]

In [48]:
df_gender_final = reduce(lambda left,right: pd.merge(left,right,on='gender'), dfg)
df_gender_final

Unnamed: 0,gender,frame_number,gender_screentime,only_1_gender
0,Man,954,81.122449,433
1,Woman,222,18.877551,108


In [24]:
df_gender_final['only_1_screentime'] = df_gender_final['only_1_gender']/df_gender_final['only_1_gender'].sum()*100
df_gender_final

Unnamed: 0,gender,frame_number,gender_screentime,only_1_gender,only_1_screentime
0,Man,954,81.122449,433,80.036969
1,Woman,222,18.877551,108,19.963031


## Race

In [25]:
df_race = df.groupby(['race']).count().reset_index()
df_race

Unnamed: 0,race,frame_number,face_id,gender
0,asian,103,103,103
1,black,72,72,72
2,indian,7,7,7
3,latino hispanic,98,98,98
4,middle eastern,96,96,96
5,white,800,800,800


In [26]:
df_race['frame_number'].sum()

1176

In [27]:
df_race['race_screentime']=df_race['frame_number']/df_race['frame_number'].sum()*100

In [28]:
df_race = df_race.drop(columns=['face_id','gender'])
df_race

Unnamed: 0,race,frame_number,race_screentime
0,asian,103,8.758503
1,black,72,6.122449
2,indian,7,0.595238
3,latino hispanic,98,8.333333
4,middle eastern,96,8.163265
5,white,800,68.027211


Total number of women

In [29]:
woman_df = df_movie[df_movie['gender']=='Woman']
woman_df

Unnamed: 0,frame_number,face_id,gender,race
13,151.0,13.0,Woman,latino hispanic
43,651.0,43.0,Woman,latino hispanic
46,671.0,46.0,Woman,white
47,681.0,47.0,Woman,latino hispanic
49,691.0,49.0,Woman,middle eastern
...,...,...,...,...
1143,9631.0,1143.0,Woman,white
1147,9641.0,1147.0,Woman,white
1166,9731.0,1166.0,Woman,white
1167,9731.0,1167.0,Woman,black


In [30]:
women_by_race = woman_df.groupby(['race']).count().reset_index()
women_by_race

Unnamed: 0,race,frame_number,face_id,gender
0,asian,10,10,10
1,black,3,3,3
2,indian,1,1,1
3,latino hispanic,15,15,15
4,middle eastern,8,8,8
5,white,185,185,185


In [31]:
women_by_race = women_by_race.drop(columns=['face_id','gender'])
women_by_race

Unnamed: 0,race,frame_number
0,asian,10
1,black,3
2,indian,1
3,latino hispanic,15
4,middle eastern,8
5,white,185


In [32]:
women_by_race = women_by_race.rename(columns={'frame_number': 'woman_frames'})
women_by_race

Unnamed: 0,race,woman_frames
0,asian,10
1,black,3
2,indian,1
3,latino hispanic,15
4,middle eastern,8
5,white,185


Total number of men

In [33]:
man_df = df_movie[df_movie['gender']=='Man']
man_df

Unnamed: 0,frame_number,face_id,gender,race
0,31.0,0.0,Man,white
1,51.0,1.0,Man,white
2,71.0,2.0,Man,black
3,91.0,3.0,Man,latino hispanic
4,101.0,4.0,Man,asian
...,...,...,...,...
1170,9731.0,1170.0,Man,white
1171,9741.0,1171.0,Man,black
1172,9741.0,1172.0,Man,black
1174,9761.0,1174.0,Man,asian


In [34]:
men_by_race = man_df.groupby(['race']).count().reset_index()
men_by_race

Unnamed: 0,race,frame_number,face_id,gender
0,asian,93,93,93
1,black,69,69,69
2,indian,6,6,6
3,latino hispanic,83,83,83
4,middle eastern,88,88,88
5,white,615,615,615


In [35]:
men_by_race = men_by_race.drop(columns=['face_id','gender'])
men_by_race

Unnamed: 0,race,frame_number
0,asian,93
1,black,69
2,indian,6
3,latino hispanic,83
4,middle eastern,88
5,white,615


In [36]:
men_by_race = men_by_race.rename(columns={'frame_number': 'man_frames'})
men_by_race

Unnamed: 0,race,man_frames
0,asian,93
1,black,69
2,indian,6
3,latino hispanic,83
4,middle eastern,88
5,white,615


In [37]:
dfs = [df_race, women_by_race, men_by_race]
dfs

[              race  frame_number  race_screentime
 0            asian           103         8.758503
 1            black            72         6.122449
 2           indian             7         0.595238
 3  latino hispanic            98         8.333333
 4   middle eastern            96         8.163265
 5            white           800        68.027211,
               race  woman_frames
 0            asian            10
 1            black             3
 2           indian             1
 3  latino hispanic            15
 4   middle eastern             8
 5            white           185,
               race  man_frames
 0            asian          93
 1            black          69
 2           indian           6
 3  latino hispanic          83
 4   middle eastern          88
 5            white         615]

In [38]:
df_race_final = reduce(lambda left,right: pd.merge(left,right,on='race'), dfs)
df_race_final

Unnamed: 0,race,frame_number,race_screentime,woman_frames,man_frames
0,asian,103,8.758503,10,93
1,black,72,6.122449,3,69
2,indian,7,0.595238,1,6
3,latino hispanic,98,8.333333,15,83
4,middle eastern,96,8.163265,8,88
5,white,800,68.027211,185,615


In [39]:
df_race_final['woman_r_screentime'] = df_race_final['woman_frames']/df_race_final['frame_number']*100
df_race_final

Unnamed: 0,race,frame_number,race_screentime,woman_frames,man_frames,woman_r_screentime
0,asian,103,8.758503,10,93,9.708738
1,black,72,6.122449,3,69,4.166667
2,indian,7,0.595238,1,6,14.285714
3,latino hispanic,98,8.333333,15,83,15.306122
4,middle eastern,96,8.163265,8,88,8.333333
5,white,800,68.027211,185,615,23.125


In [40]:
df_race_final['man_r_screentime'] = df_race_final['man_frames']/df_race_final['frame_number']*100
df_race_final

Unnamed: 0,race,frame_number,race_screentime,woman_frames,man_frames,woman_r_screentime,man_r_screentime
0,asian,103,8.758503,10,93,9.708738,90.291262
1,black,72,6.122449,3,69,4.166667,95.833333
2,indian,7,0.595238,1,6,14.285714,85.714286
3,latino hispanic,98,8.333333,15,83,15.306122,84.693878
4,middle eastern,96,8.163265,8,88,8.333333,91.666667
5,white,800,68.027211,185,615,23.125,76.875


# Final dataframes to csvs

In [41]:
df_race_final

Unnamed: 0,race,frame_number,race_screentime,woman_frames,man_frames,woman_r_screentime,man_r_screentime
0,asian,103,8.758503,10,93,9.708738,90.291262
1,black,72,6.122449,3,69,4.166667,95.833333
2,indian,7,0.595238,1,6,14.285714,85.714286
3,latino hispanic,98,8.333333,15,83,15.306122,84.693878
4,middle eastern,96,8.163265,8,88,8.333333,91.666667
5,white,800,68.027211,185,615,23.125,76.875


In [42]:
df_gender_final

Unnamed: 0,gender,frame_number,gender_screentime,only_1_gender,only_1_screentime
0,Man,954,81.122449,433,80.036969
1,Woman,222,18.877551,108,19.963031


In [43]:
#df_race_final.to_csv(r'Path where you want to store the exported CSV file\File Name.csv', index = False)

In [44]:
#df_gender_final.to_csv(r'Path where you want to store the exported CSV file\File Name.csv', index = False)