<a href="https://colab.research.google.com/github/mahyarhabibi/GenderGaps_Hollywood/blob/main/Codes/critic_reviews_regs_pub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook estimates the regression model specified in Equation 3 and stores the results.

In [None]:
import numpy as np
import pandas as pd 
import scipy as sc
import statsmodels.api as sm
from patsy import dmatrices
from statsmodels.iolib.summary2 import summary_col
import re

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Mount Google Drive at /content/gdrive (requires the google.colab package, i.e. a Colab runtime)
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Directories to read data and store results.
# Please add the parent folder to your Google Drive.
# Public Link: https://drive.google.com/drive/folders/1TYCDAJOCiLZw4TObcLac5GnL5YtwYnUD?usp=sharing

# `os` is not provided by the notebook's import cell, so it is imported here to keep
# this cell runnable on a fresh kernel.
import os

parent_dir = "/content/gdrive/MyDrive/GenderGaps_Hollywood/" # You may need to change the address
# The trailing slashes matter: later cells build file paths as `data_dir + filename`.
data_dir = os.path.join(parent_dir, 'Data/')
results_dir = os.path.join(parent_dir,'Results/')

In [None]:
# NOTE: pd.read_pickle can execute code embedded in the file; only load files you trust.

# Movie-level data
df_movies = pd.read_pickle(data_dir + 'movies_info_merged_MIW_pkl.zip')

# Critic reviews, keeping only the columns listed below
crev_cols = [
    'Title', 'Year', 'title_year', 'source', 'author', 'score', 'C_count',
    'R_count', 'C_Female', 'C_start', 'C_end', 'C_avg', 'C_exper',
]
df_crevs = pd.read_pickle(data_dir + 'reviews_critics_merged.zip')[crev_cols]

In [None]:
# Creating one 0/1 indicator column per genre and a list of the genre column names.

# Hyphenated genre labels are renamed so the column names contain no '-'.
genre_renames = {'Film-Noir': 'FilmNoir', 'Sci-Fi': 'SciFi',
                 'Reality-TV': 'RealityTV', 'Talk-Show': 'TalkShow'}

# Union of all per-movie genre collections. set().union(*...) is linear in the total
# number of labels, whereas summing the lists re-copies the accumulated list at every row.
genre_labels = set().union(*df_movies['genre'])

# sorted() keeps the order of the created columns identical from run to run
for gen in sorted(genre_labels):
    df_movies[gen] = df_movies['genre'].map(lambda x: 1 if gen in x else 0)

df_movies = df_movies.rename(columns=genre_renames)

# The renamed labels are always part of the list (as before); sorting makes the
# order of the genre regressors reproducible instead of depending on set iteration order.
genres = sorted((genre_labels - set(genre_renames)) | set(genre_renames.values()))

In [None]:
# Plot embedding features: columns plot_vec0 ... plot_vec99 of the movies table
plot_vectors = ['plot_vec' + str(i) for i in range(100)]
X_vecs = df_movies.loc[:, plot_vectors]

In [None]:
# Cast data
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

# One 0/1 indicator column per cast member, indexed like df_movies
cast_indicators = mlb.fit_transform(df_movies['cast'])
X_cast = pd.DataFrame(cast_indicators, columns=mlb.classes_, index=df_movies.index)

# Keep cast members who appear in at least 10 movies
X_cast = X_cast.loc[:, X_cast.sum() >= 10]

In [None]:
# Production companies with at least 10 movies
company_dummies = pd.get_dummies(df_movies['company'])
X_company = company_dummies.loc[:, company_dummies.sum() >= 10]

# Distributors with at least 10 movies
dist_dummies = pd.get_dummies(df_movies['Distributor'])
X_dist = dist_dummies.loc[:, dist_dummies.sum() >= 10]

In [None]:
# Countries: comma-separated string -> list of names with all spaces removed
df_movies['country_list'] = df_movies['country'].map(lambda x: x.replace(' ', '').split(','))

X_country = pd.DataFrame(mlb.fit_transform(df_movies['country_list']),
             columns=mlb.classes_, index=df_movies.index)

# Keep countries that appear in at least 10 movies
X_country = X_country[X_country.columns[X_country.sum()>=10]]

# Languages: same treatment as countries
df_movies['lang_list'] = df_movies['language'].map(lambda x: x.replace(' ', '').split(','))
X_lang = pd.DataFrame(mlb.fit_transform(df_movies['lang_list']),
             columns=mlb.classes_, index=df_movies.index)
X_lang = X_lang[X_lang.columns[X_lang.sum()>=10]]

# Age ratings text: TF-IDF over unigrams and bigrams
vector_rating = TfidfVectorizer( ngram_range=(1,2), min_df=0.01,
                        max_df=0.75, stop_words='english', sublinear_tf=True)
ratings = df_movies['rating_text'].values
# index=df_movies.index keeps the rows aligned with the other feature frames, which are
# later combined with index-based joins; toarray() returns a plain ndarray rather than
# the np.matrix produced by todense().
X_rating = pd.DataFrame(data=vector_rating.fit_transform(ratings).toarray(),
                        columns=vector_rating.get_feature_names_out(),
                        index=df_movies.index)

# Age Ratings Categories
X_mpaa = df_movies[['rated_R','rated_PG', 'rated_PG13', 'rated_TVMA', 'rated_TV14']]

# Genres
X_genre = df_movies[genres]

In [None]:
# Merging movie-level features (index-based joins; suffixes disambiguate clashing column names)
X_mov = df_movies[['Title', 'Year', 'D_Female']].join(X_vecs).join(X_cast, rsuffix='cs').\
  join(X_company, rsuffix='cp').join(X_dist, rsuffix='ds').\
  join(X_country, rsuffix='cn').join(X_lang, rsuffix='lg').join(X_rating, rsuffix='rt').\
        join(X_mpaa).join(X_genre, rsuffix='gn')


# One row per review: review score plus the features of the reviewed movie
X_score = df_crevs[['Title', 'Year', 'score', 'title_year']].merge(X_mov, on=['Title', 'Year']).drop(columns=['Title'])

# The source/author/year dummies below are built from df_crevs and are later combined
# with X_score row by row, so the merge must neither drop nor duplicate reviews.
# NOTE(review): this also presumes df_crevs carries a default 0..n-1 index - confirm.
assert len(X_score) == len(df_crevs), \
    "Merging reviews with movie features changed the number of rows; X_score is no longer aligned with df_crevs"

# Integer code per movie (title_year), used to cluster standard errors
X_score['TY_code'] = X_score['title_year'].factorize()[0]
X_score = X_score.drop(columns=['title_year'])
X_source = pd.get_dummies(df_crevs['source'])

# Critics with at least 100 reviews
X_author = pd.get_dummies(df_crevs['author'])
X_author = X_author[X_author.columns[X_author.sum()>=100]]

X_year = pd.get_dummies(df_crevs['Year'])

In [None]:
# Load previously stored regression results if the file exists; otherwise start empty
import os.path
import json

critics_reg_results = {}
if os.path.exists(results_dir + 'critics_reg_results.json'):
  with open(results_dir + 'critics_reg_results.json', 'r') as input_file:
    critics_reg_results = json.load(input_file)

In [None]:
# function to store regression results
def reviews_reg_sum(reg_res,target_vars,target_locs):
  """Collect the estimates of selected regressors into a plain dictionary.

  Parameters
  ----------
  reg_res : fitted regression results object
      Must expose ``params``, ``bse``, ``tvalues`` and ``pvalues`` indexable by
      variable name, a ``conf_int()`` method whose result has a ``.values``
      array with one row per regressor, and the attributes ``rsquared``,
      ``rsquared_adj`` and ``nobs``.
  target_vars : list
      Names of the regressors to report.
  target_locs : list of int
      Row positions of those regressors in ``reg_res.conf_int()``, in the same
      order as ``target_vars``.

  Returns
  -------
  dict
      Keys: 'main vars', 'coefs', 'std_errors', 't_values', 'p_values',
      'conf_intervals' (one [lower, upper] list per target), 'R_squared',
      'adj R_squared' and 'N_Obs'.
  """
  # The confidence-interval table is the same for every target, so compute it once
  conf_int_values = reg_res.conf_int().values

  results_sum = { 'main vars':target_vars,
                 'coefs': [reg_res.params[x] for x in target_vars],
                'std_errors': [reg_res.bse[x] for x in target_vars],
                't_values': [reg_res.tvalues[x] for x in target_vars],
                 'p_values': [reg_res.pvalues[x] for x in target_vars],
                'conf_intervals': [ list(conf_int_values[i]) for i in target_locs],
                 'R_squared': reg_res.rsquared, 'adj R_squared': reg_res.rsquared_adj, 'N_Obs': reg_res.nobs}

  return results_sum

# Regressions: Female Director

In [None]:
# Distinct (author, C_count) pairs, re-indexed from 0
df_temp = df_crevs.loc[:, ['author', 'C_count']].drop_duplicates(ignore_index=True)

In [None]:
# Number of distinct (author, C_count) rows whose C_count is strictly greater than 100
sum(df_temp['C_count']>100)

370

In [None]:
# Base:  Movie Features: Yes, D_Female*Year FE: Yes,
# critic & source FE: No , Critic*D_Female FE: No

# Year * D_Female: every year dummy multiplied by the female-director indicator
arr_year = X_year.to_numpy()
arr_DFem = X_score['D_Female'].to_numpy()
year_DFem = arr_year * arr_DFem[:,None]
year_DFem_cols = ['DFem_' + str(name) for name in X_year.columns.values]
X_year_DFem = pd.DataFrame(year_DFem, columns=year_DFem_cols, index=X_score.index.values)

# Data
data_ols = X_score.drop(columns='D_Female').join(X_year).join(X_year_DFem).dropna()
y_ols = data_ols['score']
X_ols = data_ols.drop(columns='score')

# TY code is used to cluster standard errors
TY = X_ols['TY_code'].values.astype(int)
X_ols = X_ols.drop(columns=['TY_code'])

# astype returns a new object, so its result has to be assigned to take effect.
# float64 is used (float16 would cost precision in the estimates); the cast also turns
# boolean dummy columns into numbers before the data reaches statsmodels.
y_ols = y_ols.astype('float64')
X_ols = X_ols.astype('float64')

reg_ols = sm.OLS(y_ols, X_ols)
res_ols = reg_ols.fit(cov_type='cluster', cov_kwds={'groups': TY })

# Report the Year * D_Female coefficients
target_vars = year_DFem_cols
target_locs = [X_ols.columns.get_loc(var) for var in target_vars]

ols_res_sum = reviews_reg_sum(res_ols,target_vars, target_locs)
critics_reg_results.update( {'Base':ols_res_sum })

with open(results_dir + 'critics_reg_results.json', 'w') as outfile:
    json.dump(critics_reg_results, outfile)

In [None]:
# 1 for reviews of movies with Year >= 2010, 0 otherwise
df_crevs['post_2010'] = np.where(df_crevs['Year'] >= 2010, 1, 0)

# Reviews per critic within each period, zeroed out for the pre-2010 rows
reviews_in_period = df_crevs.groupby(['author', 'post_2010'])['Title'].transform('count')
df_crevs['R_p2010'] = reviews_in_period * df_crevs['post_2010']

# The per-critic maximum copies the 2010-onwards count onto all of the critic's rows
df_crevs['NoR_p2010'] = df_crevs.groupby('author')['R_p2010'].transform('max')

# Drop the helper columns
df_crevs = df_crevs.drop(columns=['post_2010', 'R_p2010'])

In [None]:
# Active Critics: D_Female: No, Movie Features: Yes, D_Female*Year FE: Yes,
# critic & source FE: Yes , Critic*D_Female FE: No

# Year * D_Female: every year dummy multiplied by the female-director indicator
arr_year = X_year.to_numpy()
arr_DFem = X_score['D_Female'].to_numpy()
year_DFem = arr_year * arr_DFem[:,None]
year_DFem_cols = ['DFem_' + str(name) for name in X_year.columns.values]
X_year_DFem = pd.DataFrame(year_DFem, columns=year_DFem_cols, index=X_score.index.values)

# Data: adds source and critic dummies plus the critic's review count from 2010 onwards
data_ols = X_score.drop(columns='D_Female').join(X_year).join(X_year_DFem).\
          join(X_source,rsuffix='src_').join(X_author, rsuffix='auth_').\
          join(df_crevs['NoR_p2010']).dropna()

# Keep critics with at least 100 reviews of movies from 2010 onwards
data_ols= data_ols.loc[data_ols['NoR_p2010']>=100].drop(columns=['NoR_p2010'])

y_ols = data_ols['score']
X_ols = data_ols.drop(columns='score')
# TY code is used to cluster standard errors
TY = X_ols['TY_code'].values.astype(int)
X_ols = X_ols.drop(columns=['TY_code'])

# astype returns a new object, so its result has to be assigned to take effect.
# float64 is used (float16 would cost precision in the estimates); the cast also turns
# boolean dummy columns into numbers before the data reaches statsmodels.
y_ols = y_ols.astype('float64')
X_ols = X_ols.astype('float64')

reg_ols = sm.OLS(y_ols, X_ols)
res_ols = reg_ols.fit(cov_type='cluster', cov_kwds={'groups': TY })

# Report the Year * D_Female coefficients
target_vars = year_DFem_cols
target_locs = [X_ols.columns.get_loc(var) for var in target_vars]

ols_res_sum = reviews_reg_sum(res_ols,target_vars, target_locs)
critics_reg_results.update( {'active_critics':ols_res_sum })

with open(results_dir + 'critics_reg_results.json', 'w') as outfile:
    json.dump(critics_reg_results, outfile)

In [None]:
# Female Critics Only
# DFem*Year: D_Female: No, Movie Features: Yes, D_Female*Year FE: Yes, critic & source FE: Yes , Critic*D_Female FE: No

# Year * D_Female: every year dummy multiplied by the female-director indicator
arr_year = X_year.to_numpy()
arr_DFem = X_score['D_Female'].to_numpy()
year_DFem = arr_year * arr_DFem[:,None]
year_DFem_cols = ['DFem_' + str(name) for name in X_year.columns.values]
X_year_DFem = pd.DataFrame(year_DFem, columns=year_DFem_cols, index=X_score.index.values)

# Data
data_ols = X_score.drop(columns='D_Female').join(X_year).join(X_year_DFem).\
            join(df_crevs['C_Female']).join(X_source,rsuffix='src_').\
            join(X_author, rsuffix='auth_').dropna()

# Keep reviews with C_Female == 1. C_Female itself stays among the regressors,
# where it is constant (all ones) after this filter.
data_ols = data_ols.loc[data_ols['C_Female']==1]
y_ols = data_ols['score']
X_ols = data_ols.drop(columns='score')
# TY code is used to cluster standard errors
TY = X_ols['TY_code'].values.astype(int)
X_ols = X_ols.drop(columns=['TY_code'])

# astype returns a new object, so its result has to be assigned to take effect.
# float64 is used (float16 would cost precision in the estimates); the cast also turns
# boolean dummy columns into numbers before the data reaches statsmodels.
y_ols = y_ols.astype('float64')
X_ols = X_ols.astype('float64')

reg_ols = sm.OLS(y_ols, X_ols)
res_ols = reg_ols.fit(cov_type='cluster', cov_kwds={'groups': TY })

# Report the Year * D_Female coefficients
target_vars = year_DFem_cols
target_locs = [X_ols.columns.get_loc(var) for var in target_vars]

ols_res_sum = reviews_reg_sum(res_ols,target_vars, target_locs)
critics_reg_results.update( {'Female_critics':ols_res_sum })

with open(results_dir + 'critics_reg_results.json', 'w') as outfile:
    json.dump(critics_reg_results, outfile)

In [None]:
# Male Critics Only
# DFem*Year: D_Female: No, Movie Features: Yes, D_Female*Year FE: Yes, critic & source FE: Yes , Critic*D_Female FE: No

# Year * D_Female: every year dummy multiplied by the female-director indicator
arr_year = X_year.to_numpy()
arr_DFem = X_score['D_Female'].to_numpy()
year_DFem = arr_year * arr_DFem[:,None]
year_DFem_cols = ['DFem_' + str(name) for name in X_year.columns.values]
# The index is set explicitly to X_score's, as in the other regression cells, so the
# index-based join below does not depend on X_score having a default 0..n-1 index.
X_year_DFem = pd.DataFrame(year_DFem, columns=year_DFem_cols, index=X_score.index.values)

# Data
data_ols = X_score.drop(columns='D_Female').join(X_year).join(X_year_DFem).\
            join(df_crevs['C_Female']).join(X_source,rsuffix='src_').\
            join(X_author, rsuffix='auth_').dropna()
# Keep reviews with C_Female == 0. C_Female itself stays among the regressors,
# where it is constant (all zeros) after this filter.
data_ols = data_ols.loc[data_ols['C_Female']==0]
y_ols = data_ols['score']
X_ols = data_ols.drop(columns='score')
# TY code is used to cluster standard errors
TY = X_ols['TY_code'].values.astype(int)
X_ols = X_ols.drop(columns=['TY_code'])

# astype returns a new object, so its result has to be assigned to take effect.
# float64 is used (float16 would cost precision in the estimates); the cast also turns
# boolean dummy columns into numbers before the data reaches statsmodels.
y_ols = y_ols.astype('float64')
X_ols = X_ols.astype('float64')

reg_ols = sm.OLS(y_ols, X_ols)
res_ols = reg_ols.fit(cov_type='cluster', cov_kwds={'groups': TY })

# Report the Year * D_Female coefficients
target_vars = year_DFem_cols
target_locs = [X_ols.columns.get_loc(var) for var in target_vars]

ols_res_sum = reviews_reg_sum(res_ols,target_vars, target_locs)
critics_reg_results.update( {'Male_critics':ols_res_sum })

with open(results_dir + 'critics_reg_results.json', 'w') as outfile:
    json.dump(critics_reg_results, outfile)

# Regression: Female Director and Actors

In [None]:
# Movie-level features, this time including the A1/A2/A3_Female indicators
base_cols_A = ['Title', 'Year', 'D_Female', 'A1_Female', 'A2_Female', 'A3_Female']
X_mov_A = (
    df_movies[base_cols_A]
    .join(X_vecs)
    .join(X_cast, rsuffix='cs')
    .join(X_company, rsuffix='cp')
    .join(X_dist, rsuffix='ds')
    .join(X_country, rsuffix='cn')
    .join(X_lang, rsuffix='lg')
    .join(X_rating, rsuffix='rt')
    .join(X_mpaa)
    .join(X_genre, rsuffix='gn')
)

# One row per review, with an integer movie code (TY_code) replacing title_year
reviews_A = df_crevs[['Title', 'Year', 'score', 'title_year']].merge(X_mov_A, on=['Title', 'Year'])
X_score_A = reviews_A.drop(columns=['Title'])
X_score_A['TY_code'] = X_score_A['title_year'].factorize()[0]
X_score_A = X_score_A.drop(columns=['title_year'])

In [None]:
# Female director and female cast indicators (A1/A2/A3_Female)
# D_Female: No, Movie Features: Yes, D_Female*Year and A{1,2,3}_Female*Year FE: Yes,
# critic & source FE: Yes , Critic*D_Female FE: No

arr_year = X_year.to_numpy()

def _year_interactions(flag_col, prefix):
    """Return (frame, column names): every year dummy multiplied by X_score_A[flag_col]."""
    cols = [prefix + str(name) for name in X_year.columns.values]
    values = arr_year * X_score_A[flag_col].to_numpy()[:,None]
    return pd.DataFrame(values, columns=cols, index=X_score_A.index.values), cols

X_year_DFem, year_DFem_cols = _year_interactions('D_Female', 'DFem_')     # Year * D_Female
X_year_A1Fem, year_A1Fem_cols = _year_interactions('A1_Female', 'A1Fem_')  # Year * A1_Female
X_year_A2Fem, year_A2Fem_cols = _year_interactions('A2_Female', 'A2Fem_')  # Year * A2_Female
X_year_A3Fem, year_A3Fem_cols = _year_interactions('A3_Female', 'A3Fem_')  # Year * A3_Female

# Data
# NOTE(review): the base frame is X_score (not X_score_A), so the A*_Female level
# columns are not regressors here, only their year interactions - confirm this is intended.
data_ols = X_score.drop(columns='D_Female').join(X_year).join(X_year_DFem).join(X_year_A1Fem).\
            join(X_year_A2Fem).join(X_year_A3Fem).join(X_source,rsuffix='src_').\
            join(X_author, rsuffix='auth_').join(df_crevs['C_count']).dropna()

# Keep rows with C_count >= 100 (presumably the critic's number of reviews - TODO confirm)
data_ols= data_ols.loc[data_ols['C_count']>=100].drop(columns=['C_count'])

y_ols = data_ols['score']
X_ols = data_ols.drop(columns='score')
# TY code is used to cluster standard errors
TY = X_ols['TY_code'].values.astype(int)
X_ols = X_ols.drop(columns=['TY_code'])

# Explicit float64 cast, matching the other regression cells; it turns boolean
# dummy columns into numbers before the data reaches statsmodels.
y_ols = y_ols.astype('float64')
X_ols = X_ols.astype('float64')

reg_ols = sm.OLS(y_ols, X_ols)
res_ols = reg_ols.fit(cov_type='cluster', cov_kwds={'groups': TY })

def _summarise(cols):
    """Summarise the fitted coefficients of the regressors named in `cols`."""
    return reviews_reg_sum(res_ols, cols, [X_ols.columns.get_loc(var) for var in cols])

ols_res_DFem = _summarise(year_DFem_cols)
ols_res_A1Fem = _summarise(year_A1Fem_cols)
ols_res_A2Fem = _summarise(year_A2Fem_cols)
ols_res_A3Fem = _summarise(year_A3Fem_cols)

ols_res_tot = {'DFem_Year': ols_res_DFem, 'A1Fem_Year': ols_res_A1Fem,
               'A2Fem_Year': ols_res_A2Fem, 'A3Fem_Year': ols_res_A3Fem}
critics_reg_results.update( {'DFem+AFem':ols_res_tot })

with open(results_dir + 'critics_reg_results.json', 'w') as outfile:
    json.dump(critics_reg_results, outfile)

# Top Outlets

In [None]:
# The 10 outlets with the most reviews (value_counts sorts by descending count)
outlet_counts = df_crevs['source'].value_counts()
top_outlets = outlet_counts.index[:10].tolist()

In [None]:
# Top Outlet
# DFem*Year: D_Female: No, Movie Features: Yes, D_Female*Year FE: Yes, critic & source FE: Yes , Critic*D_Female FE: No

# Year * D_Female: every year dummy multiplied by the female-director indicator
arr_year = X_year.to_numpy()
arr_DFem = X_score['D_Female'].to_numpy()
year_DFem = arr_year * arr_DFem[:,None]
year_DFem_cols = ['DFem_' + str(name) for name in X_year.columns.values]
X_year_DFem = pd.DataFrame(year_DFem, columns=year_DFem_cols, index=X_score.index.values)

# Data
data_ols = X_score.drop(columns='D_Female').join(X_year).join(X_year_DFem).\
            join(df_crevs['source']).join(X_source,rsuffix='src_').\
            join(X_author, rsuffix='auth_').dropna()

# Keep reviews published by one of the top outlets
data_ols['top_outlet'] = data_ols['source'].map(lambda x: 1 if x in top_outlets else 0)
data_ols = data_ols.loc[data_ols['top_outlet']==1]
y_ols = data_ols['score']
X_ols = data_ols.drop(columns=['score', 'source', 'top_outlet'])
# TY code is used to cluster standard errors
TY = X_ols['TY_code'].values.astype(int)
X_ols = X_ols.drop(columns=['TY_code'])

# astype returns a new object, so its result has to be assigned to take effect.
# float64 is used (float16 would cost precision in the estimates); the cast also turns
# boolean dummy columns into numbers before the data reaches statsmodels.
y_ols = y_ols.astype('float64')
X_ols = X_ols.astype('float64')

reg_ols = sm.OLS(y_ols, X_ols)
res_ols = reg_ols.fit(cov_type='cluster', cov_kwds={'groups': TY })

# Report the Year * D_Female coefficients
target_vars = year_DFem_cols
target_locs = [X_ols.columns.get_loc(var) for var in target_vars]

ols_res_sum = reviews_reg_sum(res_ols,target_vars, target_locs)
critics_reg_results.update( {'Top Outlets':ols_res_sum })

with open(results_dir + 'critics_reg_results.json', 'w') as outfile:
    json.dump(critics_reg_results, outfile)

In [None]:
# Outlets labelled 'grade' in both the 'trial 1' and 'trial 2' columns
df_outlets_grading = pd.read_csv(data_dir + 'outlets_grading_system.csv')
graded_in_both = (df_outlets_grading['trial 1'] == 'grade') & (df_outlets_grading['trial 2'] == 'grade')
grading_outlets = df_outlets_grading.loc[graded_in_both, 'source'].tolist()

In [None]:
# Grading Outlets
# DFem*Year: D_Female: No, Movie Features: Yes, D_Female*Year FE: Yes, critic &
# source FE: Yes , Critic*D_Female FE: No

# Year * D_Female: every year dummy multiplied by the female-director indicator
arr_year = X_year.to_numpy()
arr_DFem = X_score['D_Female'].to_numpy()
year_DFem = arr_year * arr_DFem[:,None]
year_DFem_cols = ['DFem_' + str(name) for name in X_year.columns.values]
X_year_DFem = pd.DataFrame(year_DFem, columns=year_DFem_cols, index=X_score.index.values)

# Data
data_ols = X_score.drop(columns='D_Female').join(X_year).join(X_year_DFem).\
            join(df_crevs['source']).join(X_source,rsuffix='src_').\
            join(X_author, rsuffix='auth_').dropna()

# Keep reviews published by one of the grading outlets
data_ols['grading_outlet'] = data_ols['source'].map(lambda x: 1 if x in grading_outlets else 0)
data_ols = data_ols.loc[data_ols['grading_outlet']==1]
y_ols = data_ols['score']
X_ols = data_ols.drop(columns=['score', 'source', 'grading_outlet'])
# TY code is used to cluster standard errors
TY = X_ols['TY_code'].values.astype(int)
X_ols = X_ols.drop(columns=['TY_code'])

# astype returns a new object, so its result has to be assigned to take effect.
# float64 is used (float16 would cost precision in the estimates); the cast also turns
# boolean dummy columns into numbers before the data reaches statsmodels.
y_ols = y_ols.astype('float64')
X_ols = X_ols.astype('float64')

reg_ols = sm.OLS(y_ols, X_ols)
res_ols = reg_ols.fit(cov_type='cluster', cov_kwds={'groups': TY })

# Report the Year * D_Female coefficients
target_vars = year_DFem_cols
target_locs = [X_ols.columns.get_loc(var) for var in target_vars]

ols_res_sum = reviews_reg_sum(res_ols,target_vars, target_locs)
critics_reg_results.update( {'Grading Outlets':ols_res_sum })

with open(results_dir + 'critics_reg_results.json', 'w') as outfile:
    json.dump(critics_reg_results, outfile)

In [None]:
# User score and number of user votes of the reviewed movie, one row per review
user_cols = ['userscore', 'N_user']
X_uscore = df_crevs.merge(df_movies[['Title', 'Year'] + user_cols], on=['Title', 'Year'])[user_cols]

# Attach them to the review-level features (index-based join)
X_uscore = X_score.join(X_uscore)

In [None]:
# Popular Movies
# DFem*Year: D_Female: No, Movie Features: Yes, D_Female*Year FE: Yes, critic & source FE: Yes , Critic*D_Female FE: No

# Year * D_Female: every year dummy multiplied by the female-director indicator
arr_year = X_year.to_numpy()
arr_DFem = X_uscore['D_Female'].to_numpy()
year_DFem = arr_year * arr_DFem[:,None]
year_DFem_cols = ['DFem_' + str(name) for name in X_year.columns.values]
X_year_DFem = pd.DataFrame(year_DFem, columns=year_DFem_cols, index=X_uscore.index.values)

# Data
data_ols = X_uscore.drop(columns='D_Female').join(X_year).join(X_year_DFem).\
            join(df_crevs['source']).join(X_source,rsuffix='src_').\
            join(X_author, rsuffix='auth_').dropna()

# The median number of user votes is 31 (32 for movies prior to 2010 and 30 for post 2010)
data_ols = data_ols.loc[data_ols['N_user']>30]
y_ols = data_ols['score']
X_ols = data_ols.drop(columns=['score', 'source', 'userscore', 'N_user'])
# TY code is used to cluster standard errors
TY = X_ols['TY_code'].values.astype(int)
X_ols = X_ols.drop(columns=['TY_code'])

# astype returns a new object, so its result has to be assigned to take effect.
# float64 is used (float16 would cost precision in the estimates); the cast also turns
# boolean dummy columns into numbers before the data reaches statsmodels.
y_ols = y_ols.astype('float64')
X_ols = X_ols.astype('float64')

reg_ols = sm.OLS(y_ols, X_ols)
res_ols = reg_ols.fit(cov_type='cluster', cov_kwds={'groups': TY })

# Report the Year * D_Female coefficients
target_vars = year_DFem_cols
target_locs = [X_ols.columns.get_loc(var) for var in target_vars]

ols_res_sum = reviews_reg_sum(res_ols,target_vars, target_locs)
critics_reg_results.update( {'N_user>30':ols_res_sum })

with open(results_dir + 'critics_reg_results.json', 'w') as outfile:
    json.dump(critics_reg_results, outfile)

In [None]:
# Top Outlet & Popular Movies
# DFem*Year: D_Female: No, Movie Features: Yes, D_Female*Year FE: Yes, critic &
# source FE: Yes , Critic*D_Female FE: No

# Year * D_Female: every year dummy multiplied by the female-director indicator
arr_year = X_year.to_numpy()
arr_DFem = X_uscore['D_Female'].to_numpy()
year_DFem = arr_year * arr_DFem[:,None]
year_DFem_cols = ['DFem_' + str(name) for name in X_year.columns.values]
X_year_DFem = pd.DataFrame(year_DFem, columns=year_DFem_cols, index=X_uscore.index.values)

# Data
data_ols = X_uscore.drop(columns='D_Female').join(X_year).join(X_year_DFem).\
            join(df_crevs['source']).join(X_source,rsuffix='src_').\
            join(X_author, rsuffix='auth_').dropna()

# Keep reviews from a top outlet of movies with more than 30 user votes
data_ols['top_outlet'] = data_ols['source'].map(lambda x: 1 if x in top_outlets else 0)
data_ols = data_ols.loc[(data_ols['top_outlet']==1) & (data_ols['N_user']>30)]
y_ols = data_ols['score']
X_ols = data_ols.drop(columns=['score', 'source', 'top_outlet','userscore', 'N_user'])
# TY code is used to cluster standard errors
TY = X_ols['TY_code'].values.astype(int)
X_ols = X_ols.drop(columns=['TY_code'])

# astype returns a new object, so its result has to be assigned to take effect.
# float64 is used (float16 would cost precision in the estimates); the cast also turns
# boolean dummy columns into numbers before the data reaches statsmodels.
y_ols = y_ols.astype('float64')
X_ols = X_ols.astype('float64')

reg_ols = sm.OLS(y_ols, X_ols)
res_ols = reg_ols.fit(cov_type='cluster', cov_kwds={'groups': TY })

# Report the Year * D_Female coefficients
target_vars = year_DFem_cols
target_locs = [X_ols.columns.get_loc(var) for var in target_vars]

ols_res_sum = reviews_reg_sum(res_ols,target_vars, target_locs)
critics_reg_results.update( {'Top Outlets & Popular Movies':ols_res_sum })

with open(results_dir + 'critics_reg_results.json', 'w') as outfile:
    json.dump(critics_reg_results, outfile)

In [None]:
# Start from the 10 outlets with the most reviews, then remove two of them:
# New York Post coverage starts in 1996 and Village Voice coverage runs only up to 2019.
excluded_outlets = {'Village Voice', 'New York Post'}
top_outlets_set = set(df_crevs['source'].value_counts().index[:10]) - excluded_outlets
top_outlets_list = list(top_outlets_set)

In [None]:
# Set of outlets that reviewed each movie, attached to every review of that movie.
# The sets are built directly per title_year group instead of joining the names with '+'
# and splitting them again, which would break any outlet name that itself contains '+'.
sources_by_movie = df_crevs.groupby('title_year')['source'].agg(set)
df_crevs['pool_sources'] = df_crevs['title_year'].map(sources_by_movie)

# True when the movie was reviewed by every outlet in top_outlets_set
df_crevs['all_top_outlets'] = df_crevs['pool_sources'].map(lambda x: top_outlets_set.issubset(x))
# True when this review itself comes from one of those outlets
df_crevs['top_outlet'] = df_crevs['source'].isin(top_outlets_list)

In [None]:
# Movies that appeared in every top outlet, reviews from those outlets only
# DFem*Year: D_Female: No, Movie Features: Yes, D_Female*Year FE: Yes,
# Critic*D_Female FE: No
# NOTE(review): unlike the other outlet cells, no source or critic dummies are joined
# below, so critic & source FE are not part of this specification - confirm this is intended.

# Year * D_Female: every year dummy multiplied by the female-director indicator
arr_year = X_year.to_numpy()
arr_DFem = X_score['D_Female'].to_numpy()
year_DFem = arr_year * arr_DFem[:,None]
year_DFem_cols = ['DFem_' + str(name) for name in X_year.columns.values]
# Indexed like X_score, the frame it is joined to below
X_year_DFem = pd.DataFrame(year_DFem, columns=year_DFem_cols, index=X_score.index.values)

# Data
data_ols = X_score.drop(columns='D_Female').join(X_year).join(X_year_DFem).\
            join(df_crevs[['source','top_outlet','all_top_outlets']]).dropna()

# Keep reviews from a top outlet of movies that every top outlet reviewed
data_ols = data_ols.loc[(data_ols['all_top_outlets']==True) & (data_ols['top_outlet']==1)]
y_ols = data_ols['score']
X_ols = data_ols.drop(columns=['score', 'source', 'all_top_outlets', 'top_outlet'])
# TY code is used to cluster standard errors
TY = X_ols['TY_code'].values.astype(int)
X_ols = X_ols.drop(columns=['TY_code'])

# astype returns a new object, so its result has to be assigned to take effect.
# float64 is used (float16 would cost precision in the estimates); the cast also turns
# boolean dummy columns into numbers before the data reaches statsmodels.
y_ols = y_ols.astype('float64')
X_ols = X_ols.astype('float64')

reg_ols = sm.OLS(y_ols, X_ols)
res_ols = reg_ols.fit(cov_type='cluster', cov_kwds={'groups': TY })

# Report the Year * D_Female coefficients
target_vars = year_DFem_cols
target_locs = [X_ols.columns.get_loc(var) for var in target_vars]

ols_res_sum = reviews_reg_sum(res_ols,target_vars, target_locs)
critics_reg_results.update( {'All Top Outlets':ols_res_sum })

with open(results_dir + 'critics_reg_results.json', 'w') as outfile:
    json.dump(critics_reg_results, outfile)