<a href="https://colab.research.google.com/github/mahyarhabibi/GenderGaps_Hollywood/blob/main/Codes/user_reviews_regs_pub.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook generates the estimates of the impact of the \#MeToo movement on user reviews that are displayed in Figure B.1 of Appendix B.

In [None]:
import numpy as np
import pandas as pd 
import statsmodels.api as sm
from patsy import dmatrices
import re

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Mount Google Drive at /content/gdrive; the data and results directories
# configured below are located under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
# Directories to read data and store results
# Please add the parent folder to your Google Drive
# Public Link: https://drive.google.com/drive/folders/1TYCDAJOCiLZw4TObcLac5GnL5YtwYnUD?usp=sharing
import os  # required by os.path.join below; without it this cell fails on a fresh kernel

parent_dir = "/content/gdrive/MyDrive/GenderGaps_Hollywood/" # You may need to change the address
# The trailing separators are intentional: other cells build file paths by
# plain string concatenation (e.g. data_dir + 'some_file.zip').
data_dir = os.path.join(parent_dir, 'Data/')
results_dir = os.path.join(parent_dir, 'Results/')

In [None]:
# Loading Data
# NOTE(review): read_pickle unpickles the archive contents, so these files
# should only ever come from a trusted source.
movies_path = data_dir + 'movies_info_merged_MIW_pkl.zip'
df_movies = pd.read_pickle(movies_path)

# Reviews, restricted to the selected review columns
urev_cols = ['Title', 'Year', 'title_year' ,'user', 'Uscore', 'U_count','V_count']
reviews_path = data_dir + 'reviews_users_merged.zip'
df_urevs = pd.read_pickle(reviews_path)[urev_cols]

In [None]:
# Creating a list of genres and one 0/1 indicator column per genre.
# assumes every entry of df_movies['genre'] is a list of genre strings — TODO confirm

# Genre labels containing a hyphen are stored under a hyphen-free column name.
genre_renames = {'Film-Noir': 'FilmNoir', 'Sci-Fi': 'SciFi',
                 'Reality-TV': 'RealityTV', 'Talk-Show': 'TalkShow'}

# Union of all per-movie genre lists; sorted so the column order is the same
# on every run (set iteration order is not).
raw_genres = sorted(set().union(*df_movies['genre']))

genres = []
for gen in raw_genres:
    col = genre_renames.get(gen, gen)
    # Writing straight to the final column name keeps the cell idempotent:
    # re-running it overwrites the indicator instead of creating a duplicate
    # column through a later rename.
    df_movies[col] = df_movies['genre'].map(lambda x, g=gen: 1 if g in x else 0)
    # Only genres that actually occur in the data are listed, so selecting
    # df_movies[genres] cannot fail on a label that was never created.
    genres.append(col)

In [None]:
# Plot-embedding features: columns plot_vec0 ... plot_vec19 of df_movies
plot_vectors = ['plot_vec' + str(dim) for dim in range(20)]
X_vecs = df_movies.loc[:, plot_vectors]

In [None]:
# Cast (actors): one 0/1 column per name found in df_movies['cast']
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()

cast_indicator = mlb.fit_transform(df_movies['cast'])
X_cast = pd.DataFrame(cast_indicator, columns=mlb.classes_, index=df_movies.index)

# Keeping actors whose names showed up in at least 10 movies
cast_counts = X_cast.sum()
X_cast = X_cast.loc[:, cast_counts >= 10]

In [None]:
def _frequent_dummies(labels, min_count=10):
    """One-hot encode `labels` and keep the columns whose sum is at least `min_count`."""
    dummies = pd.get_dummies(labels)
    frequent_cols = dummies.columns[dummies.sum() >= min_count]
    return dummies[frequent_cols]

# Companies with at least 10 movies
X_company = _frequent_dummies(df_movies['company'])

# Distributors with at least 10 movies
X_dist = _frequent_dummies(df_movies['Distributor'])

In [None]:
# Countries: strip spaces from the comma-separated string and split it into a list
df_movies['country_list'] = [re.sub(' ','',entry).split(',')
                             for entry in df_movies['country']]

# One 0/1 column per country (the binarizer from the cast cell is refitted here)
country_indicator = mlb.fit_transform(df_movies['country_list'])
X_country = pd.DataFrame(country_indicator, columns=mlb.classes_,
                         index=df_movies.index)

# Keep countries listed for at least 10 movies
country_counts = X_country.sum()
X_country = X_country.loc[:, country_counts >= 10]


# Language: strip spaces from the comma-separated string and split it into a list
df_movies['lang_list'] = [re.sub(' ','',entry).split(',')
                          for entry in df_movies['language']]

# One 0/1 column per language, keeping those listed for at least 10 movies
lang_indicator = mlb.fit_transform(df_movies['lang_list'])
X_lang = pd.DataFrame(lang_indicator, columns=mlb.classes_, index=df_movies.index)
lang_counts = X_lang.sum()
X_lang = X_lang.loc[:, lang_counts >= 10]

# Age ratings Text: TF-IDF features (unigrams and bigrams) of df_movies['rating_text']
vector_rating = TfidfVectorizer( ngram_range=(1,2), min_df=0.01,
                        max_df=0.75, stop_words='english', sublinear_tf=True)
ratings = df_movies['rating_text'].values
X_rating = vector_rating.fit_transform(ratings)
# toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
# The frame carries df_movies' index, like every other feature block, because
# the blocks are later combined with an index-based join; without it the rows
# would only line up if df_movies happened to have a default 0..n-1 index.
X_rating = pd.DataFrame(data=X_rating.toarray(),
                        columns=vector_rating.get_feature_names_out(),
                        index=df_movies.index)

# Age Ratings Categories: the rated_* indicator columns of df_movies
mpaa_cols = ['rated_R','rated_PG', 'rated_PG13', 'rated_TVMA', 'rated_TV14']
X_mpaa = df_movies.loc[:, mpaa_cols]

# Genres: the genre indicator columns named in `genres`
X_genre = df_movies.loc[:, genres]

In [None]:
# Merging Features
# Each entry is (feature block, suffix added to the block's column names when
# they clash with columns already merged). An empty suffix is pandas' default,
# i.e. a clash for that block raises an error.
feature_blocks = [
    (X_vecs, ''),
    (X_cast, 'cs'),
    (X_company, 'cp'),
    (X_dist, 'ds'),
    (X_country, 'cn'),
    (X_lang, 'lg'),
    (X_rating, 'rt'),
    (X_mpaa, ''),
    (X_genre, 'gn'),
]

X_mov = df_movies[['Title', 'Year', 'D_Female']]
for block, suffix in feature_blocks:
    # index-based join, in the same order as listed above
    X_mov = X_mov.join(block, rsuffix=suffix)

In [None]:
# Attach the movie-level features to every review (matched on Title and Year)
review_cols = ['Title', 'Year', 'Uscore', 'title_year']
X_score = df_urevs[review_cols].merge(X_mov, on=['Title', 'Year'])

# Integer code per distinct title_year value (used as cluster groups in the regressions)
ty_codes = X_score['title_year'].factorize()[0]

# The text identifiers are no longer needed once the code exists
X_score = X_score.drop(columns=['Title', 'title_year'])
X_score['TY_code'] = ty_codes

In [None]:
# Number of distinct users among the reviews with U_count >= 50
df_urevs.loc[df_urevs['U_count'] >= 50, 'user'].nunique()

533

In [None]:
# User dummies for users with at least 50 reviews (U_count >= 50)
frequent_reviewer = df_urevs['U_count'] >= 50
X_user = pd.get_dummies(df_urevs.loc[frequent_reviewer, 'user'])

# Year dummies: one column per distinct value of 'Year'
X_year = pd.get_dummies(df_urevs['Year'])

# NOTE(review): both frames carry df_urevs' index, while the regression cells
# combine them with X_score row by row — presumably the two line up; verify.

In [None]:
# Creating/loading a file for storing the results
import os.path
import json

# The regression cells write users_reg_results.json to results_dir, so that
# location is checked first; data_dir is kept as a fallback for a file stored
# there. If neither exists, start from an empty dictionary.
users_reg_results = {}
for _results_path in (results_dir + 'users_reg_results.json',
                      data_dir + 'users_reg_results.json'):
  if os.path.exists(_results_path):
    with open(_results_path, 'r') as input_file:
      users_reg_results = json.load(input_file)
    break

In [None]:
# Function to store results
def reviews_reg_sum(reg_res,target_vars,target_locs):
  """Summarise selected regressors of a fitted regression as a JSON-friendly dict.

  Parameters
  ----------
  reg_res : fitted results object exposing ``params``, ``bse`` and ``tvalues``
      (indexable by variable name), ``conf_int()`` (whose ``.values`` is a
      table with one row per regressor), ``rsquared``, ``rsquared_adj`` and
      ``nobs``.
  target_vars : names of the regressors to report.
  target_locs : row positions of those regressors in ``reg_res.conf_int()``,
      in the same order as ``target_vars``.

  Returns
  -------
  dict with keys 'main vars', 'coefs', 'std_errors', 't_values',
  'conf_intervals' (one [lower, upper] list per target), 'R_squared',
  'adj R_squared' and 'N_Obs'. All numbers are built-in floats so the
  result can be passed to json.dump whatever NumPy dtype the fit produced.
  """
  # conf_int() builds the interval table for all regressors; compute it once
  # instead of once per target variable.
  conf_table = reg_res.conf_int().values

  results_sum = {
      'main vars': target_vars,
      'coefs': [float(reg_res.params[x]) for x in target_vars],
      'std_errors': [float(reg_res.bse[x]) for x in target_vars],
      't_values': [float(reg_res.tvalues[x]) for x in target_vars],
      'conf_intervals': [[float(bound) for bound in conf_table[i]] for i in target_locs],
      'R_squared': float(reg_res.rsquared),
      'adj R_squared': float(reg_res.rsquared_adj),
      'N_Obs': float(reg_res.nobs),
  }

  return results_sum

In [None]:
# Regression all users sample

# DFem*Year: D_Female: No, Movie Features: Yes, D_Female*Year FE: Yes, user &
# source FE: No , user*D_Female FE: No

# Year * D_Female: every year dummy multiplied by the D_Female column
# NOTE(review): the product is positional, so X_year and X_score are assumed to
# have the same rows in the same order — verify against the merge that builds X_score.
arr_year = X_year.to_numpy()
arr_DFem = X_score['D_Female'].to_numpy()
year_DFem = arr_year * arr_DFem[:,None]
year_DFem_cols = ['DFem_' + str(name) for name in X_year.columns.values]
X_year_DFem = pd.DataFrame(year_DFem, columns=year_DFem_cols, index=X_score.index.values)

# Data: features + year dummies + interactions, rows with missing values removed
data_ols = X_score.drop(columns='D_Female').join(X_year).join(X_year_DFem).dropna()
y_ols = data_ols['Uscore']
X_ols = data_ols.drop(columns='Uscore')
# TY_code only defines the clusters for the standard errors; it is not a regressor
TY = X_ols['TY_code'].values.astype(int)
X_ols = X_ols.drop(columns=['TY_code'])

# astype returns a new object, so its result has to be assigned to take effect.
# float64 is used so that dummy columns and numeric features reach statsmodels
# as one homogeneous numeric matrix without any loss of precision.
y_ols = y_ols.astype('float64')
X_ols = X_ols.astype('float64')

reg_ols = sm.OLS(y_ols, X_ols)
res_ols = reg_ols.fit(cov_type='cluster', cov_kwds={'groups': TY })

# Report the D_Female x year interaction terms
target_vars = year_DFem_cols
target_locs = [X_ols.columns.get_loc(var) for var in target_vars]

ols_res_sum = reviews_reg_sum(res_ols,target_vars, target_locs)
users_reg_results.update( {'all_users':ols_res_sum })

with open(results_dir + 'users_reg_results.json', 'w') as outfile:
    json.dump(users_reg_results, outfile)

In [None]:
# Regression active users with fixed effects
# DFem*Year+user FE: D_Female: No, Movie Features: Yes, D_Female*Year FE: Yes, user & source FE: Yes , user*D_Female FE:No

# Year * D_Female: every year dummy multiplied by the D_Female column
# NOTE(review): the product is positional, so X_year and X_score are assumed to
# have the same rows in the same order — verify against the merge that builds X_score.
arr_year = X_year.to_numpy()
arr_DFem = X_score['D_Female'].to_numpy()
year_DFem = arr_year * arr_DFem[:,None]
year_DFem_cols = ['DFem_' + str(name) for name in X_year.columns.values]
X_year_DFem = pd.DataFrame(year_DFem, columns=year_DFem_cols, index=X_score.index.values)

# Data: features + year dummies + interactions + user dummies. U_count is
# joined only to select the sample; rows with missing values are removed.
# NOTE(review): X_user and U_count are joined by index — presumably X_score
# shares df_urevs' index; verify.
data_ols = X_score.drop(columns='D_Female').join(X_year).join(X_year_DFem).\
          join(X_user, rsuffix='auth_').\
          join(df_urevs['U_count']).dropna()

# Active users: keep reviews whose U_count is at least 100
data_ols= data_ols.loc[data_ols['U_count']>=100].drop(columns=['U_count'])


y_ols = data_ols['Uscore']
X_ols = data_ols.drop(columns='Uscore')
# TY_code only defines the clusters for the standard errors; it is not a regressor
TY = X_ols['TY_code'].values.astype(int)
X_ols = X_ols.drop(columns=['TY_code'])

# astype returns a new object, so its result has to be assigned to take effect.
# float64 is used so that dummy columns and numeric features reach statsmodels
# as one homogeneous numeric matrix without any loss of precision.
y_ols = y_ols.astype('float64')
X_ols = X_ols.astype('float64')

reg_ols = sm.OLS(y_ols, X_ols)
res_ols = reg_ols.fit(cov_type='cluster', cov_kwds={'groups': TY })

# Report the D_Female x year interaction terms
target_vars = year_DFem_cols
target_locs = [X_ols.columns.get_loc(var) for var in target_vars]

ols_res_sum = reviews_reg_sum(res_ols,target_vars, target_locs)
users_reg_results.update( {'active_users':ols_res_sum })

with open(results_dir + 'users_reg_results.json', 'w') as outfile:
    json.dump(users_reg_results, outfile)