In [None]:
# To estimate Oscar Appeal (the expected number of Oscar nominations) we use Negative Binomial regression.

# how-to NB regression:
#    https://towardsdatascience.com/negative-binomial-regression-f99031bb25b4

In [None]:
from tqdm import tqdm_notebook

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import statsmodels.formula.api as smf
import statsmodels.api as sm

In [None]:
# Load the input table; 'id' is read as a string instead of being type-inferred.
df = pd.read_csv('data_with_topics.csv', dtype={'id': 'str'})

In [None]:
# Train/test split for the count models.
# Only rows with year < 2019 take part in fitting and evaluation.

features = ['gr_low', 'gr_high', 'kw_low', 'kw_high', 'actors_nomineed', 'directors_nomineed',
            'writers_nomineed', 'major', 'indimajor', 'dy_1', 'dy_2', 'dy_3', 'const']

# add_constant provides the 'const' intercept column referenced in `features`.
df_ = sm.add_constant(df)

# Filter to the modelling period once instead of repeating the filter per split.
df_model = df_[df_['year'] < 2019]

# Fixed seed so the ~80/20 split (and every estimate derived from it) is
# reproducible on a fresh kernel. A private RandomState leaves NumPy's global
# RNG state untouched.
SPLIT_SEED = 42
mask = np.random.RandomState(SPLIT_SEED).rand(len(df_model)) < 0.8

# .copy() makes both frames independent of df_, so columns added to df_train
# later are written to a real frame and do not raise SettingWithCopyWarning.
df_train = df_model[mask].copy()
df_test = df_model[~mask].copy()

y_train = df_train['n_oscars_nom']
y_test = df_test['n_oscars_nom']

X_train = df_train[features]
X_test = df_test[features]

In [None]:
# Baseline Poisson GLM on the training split; its fitted means feed the
# auxiliary regression that estimates the negative-binomial dispersion.
poisson_model = sm.GLM(endog=y_train, exog=X_train, family=sm.families.Poisson())
poisson_training_results = poisson_model.fit()
print(poisson_training_results.summary())

In [None]:
# Fitted Poisson means (lambda_i) for the training rows.
# Rebinding df_train through assign() builds a new frame instead of writing a
# column into a frame obtained by slicing, which avoids SettingWithCopyWarning
# and keeps the cell idempotent.
df_train = df_train.assign(BB_LAMBDA=poisson_training_results.mu)

In [None]:
# Dependent variable of the auxiliary OLS regression:
#     ((y_i - lambda_i)^2 - y_i) / lambda_i
# with y_i = n_oscars_nom and lambda_i = BB_LAMBDA.
# Computed on whole columns rather than with a row-wise apply(axis=1), and
# attached via assign() so no column is written into a sliced frame.
df_train = df_train.assign(
    AUX_OLS_DEP=lambda d: ((d['n_oscars_nom'] - d['BB_LAMBDA']) ** 2 - d['n_oscars_nom']) / d['BB_LAMBDA']
)

In [None]:
# Auxiliary OLS regression of AUX_OLS_DEP on BB_LAMBDA, as suggested by
# Cameron and Trivedi. The trailing "- 1" is patsy syntax for "no intercept":
# the fitted straight line is forced through the origin, leaving a single
# slope coefficient.

ols_expr = 'AUX_OLS_DEP ~ BB_LAMBDA - 1'
aux_olsr_results = smf.ols(formula=ols_expr, data=df_train).fit()

In [None]:
# Coefficient(s) of the auxiliary OLS fit; the first one is later passed to the
# NegativeBinomial family as its alpha.
print(aux_olsr_results.params)

In [None]:
# t-statistic(s) of the auxiliary OLS coefficient(s), shown as the cell output.
aux_olsr_results.tvalues

In [None]:
# Negative-binomial (NB2) GLM on the training split, with the dispersion
# parameter alpha fixed to the slope of the auxiliary OLS regression.
# .iloc[0] picks that slope by position explicitly: params is a label-indexed
# Series, and params[0] relies on the deprecated integer-as-position fallback.
nb2_alpha = aux_olsr_results.params.iloc[0]
nb2_training_results = sm.GLM(y_train,
                              X_train,
                              family=sm.families.NegativeBinomial(alpha=nb2_alpha)).fit()

In [None]:
# Print the statsmodels summary of the fitted NB2 model.
print(nb2_training_results.summary())

In [None]:
# Score every row (all years) with the fitted NB2 model: 'oscar_appeal' is the
# predicted mean of n_oscars_nom.
predicted_mean = nb2_training_results.get_prediction(df_[features]).predicted_mean

# Label the predictions with df_'s index so the assignment aligns by index
# rather than by position. df is re-sorted below while df_ keeps its original
# order; a positional assignment would attach scores to the wrong rows if this
# cell were run a second time. Assumes the index is unique (the default
# RangeIndex from read_csv) -- TODO confirm if the loading step changes.
df['oscar_appeal'] = pd.Series(np.asarray(predicted_mean), index=df_.index)
df = df.sort_values('oscar_appeal', ascending=False)

In [None]:
# Oscar-bait rating in (0, 100]: the within-year percentile of 'oscar_appeal'.
# For each row: (rank of its score among rows of the same year) / (rows in that
# year) * 100, where rank 1 is the lowest score and tied scores share the
# highest rank of the tie.
#
# groupby + rank replaces a per-year lookup table over a hard-coded
# range(1990, 2020) plus a per-row DataFrame filter. Every year present in the
# data is handled (no KeyError for years outside that range), the cost is no
# longer quadratic in the number of rows, and rows with a missing score get a
# NaN rating instead of failing the float-keyed lookup.
df['rating'] = df.groupby('year')['oscar_appeal'].rank(method='max', pct=True) * 100

In [None]:
# Write the scored table to disk without the index column. Rows are in the
# order left by the earlier descending sort on 'oscar_appeal'.
df.to_csv('predicted.csv', index=False)