In [None]:
# how-to NB regression:
#    https://towardsdatascience.com/negative-binomial-regression-f99031bb25b4

In [None]:
# The original experiment uses Negative Binomial Regression. I took the workflow for it from the link above.

In [1]:
from tqdm import tqdm_notebook

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import statsmodels.formula.api as smf
import statsmodels.api as sm

In [2]:
# Load the input table. The 'id' column is forced to string dtype
# (presumably so identifiers are not parsed as numbers — TODO confirm).
df = pd.read_csv('data_with_topics.csv', dtype={'id': 'str'})

In [3]:
# Build the train/test split used by the regression cells below.

# Regressors passed to the GLMs; 'const' is the intercept column that
# sm.add_constant() appends to the frame.
features = ['gr_low', 'gr_high', 'kw_low', 'kw_high', 'actors_nomineed', 'directors_nomineed',
            'writers_nomineed', 'major', 'indimajor', 'dy_1', 'dy_2', 'dy_3', 'const']

# Fixed seed so the random split (and every number derived from it) is
# reproducible across kernel restarts.
SPLIT_SEED = 42

df_ = sm.add_constant(df)

# Only rows with year < 2019 take part in fitting and evaluation.
df_before_2019 = df_[df_['year'] < 2019]

# Each eligible row goes to the training set with probability 0.8.
mask = np.random.RandomState(SPLIT_SEED).rand(len(df_before_2019)) < 0.8

# Explicit copies: later cells add columns to df_train, which would otherwise
# raise SettingWithCopyWarning on a slice of df_.
df_train = df_before_2019[mask].copy()
df_test = df_before_2019[~mask].copy()

y_train = df_train['n_oscars_nom']
y_test = df_test['n_oscars_nom']

X_train = df_train[features]
X_test = df_test[features]

  return ptp(axis=axis, out=out, **kwargs)


In [4]:
# Baseline count model: a Poisson GLM fitted on the training split.
poisson_model = sm.GLM(endog=y_train, exog=X_train, family=sm.families.Poisson())
poisson_training_results = poisson_model.fit()
print(poisson_training_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:           n_oscars_nom   No. Observations:                 9979
Model:                            GLM   Df Residuals:                     9966
Model Family:                 Poisson   Df Model:                           12
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -2329.2
Date:                Thu, 23 Jan 2020   Deviance:                       3667.5
Time:                        15:38:27   Pearson chi2:                 1.73e+04
No. Iterations:                     7   Covariance Type:             nonrobust
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
gr_low                 4.2611      1.626      2.621      0.009       1.075       7.447
gr_high                1.799

In [5]:
# Store the Poisson model's `mu` (its fitted values on the training rows)
# next to the observations; the auxiliary regression below reads this column.
df_train['BB_LAMBDA'] = poisson_training_results.mu

In [6]:
# Dependent variable of the auxiliary OLS regression:
#     ((y - lambda)^2 - y) / lambda
# with y = n_oscars_nom and lambda = BB_LAMBDA. Computed column-wise, which
# yields the same values as a row-by-row apply without the per-row overhead.
observed_counts = df_train['n_oscars_nom']
poisson_means = df_train['BB_LAMBDA']
df_train['AUX_OLS_DEP'] = ((observed_counts - poisson_means) ** 2 - observed_counts) / poisson_means

In [7]:
# Auxiliary OLS regression (after Cameron and Trivedi): regress AUX_OLS_DEP on
# BB_LAMBDA alone. The trailing "- 1" is patsy syntax for dropping the
# intercept, so the fitted straight line is forced through the origin.
ols_expr = 'AUX_OLS_DEP ~ BB_LAMBDA - 1'
aux_olsr_model = smf.ols(formula=ols_expr, data=df_train)
aux_olsr_results = aux_olsr_model.fit()

In [8]:
# Show the fitted BB_LAMBDA coefficient of the auxiliary OLS; this value is
# later passed as `alpha` to the NegativeBinomial family.
print(aux_olsr_results.params)

BB_LAMBDA    1.41283
dtype: float64


In [9]:
# t-statistic of the auxiliary OLS coefficient (displayed as the cell output).
aux_olsr_results.tvalues

BB_LAMBDA    2.221362
dtype: float64

In [10]:
# Fit the negative binomial GLM on the training split, with the family's
# `alpha` fixed to the coefficient estimated by the auxiliary OLS regression.
# The coefficient is selected by its label: integer indexing such as
# `params[0]` on a label-indexed Series relies on a positional fallback that
# pandas has deprecated.
nb2_alpha = aux_olsr_results.params['BB_LAMBDA']
nb2_training_results = sm.GLM(y_train,
                              X_train,
                              family=sm.families.NegativeBinomial(alpha=nb2_alpha)).fit()

In [11]:
# Print the summary table of the fitted negative binomial model.
print(nb2_training_results.summary())

                 Generalized Linear Model Regression Results                  
Dep. Variable:           n_oscars_nom   No. Observations:                 9979
Model:                            GLM   Df Residuals:                     9966
Model Family:        NegativeBinomial   Df Model:                           12
Link Function:                    log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -1926.3
Date:                Thu, 23 Jan 2020   Deviance:                       2285.4
Time:                        15:38:27   Pearson chi2:                 1.68e+04
No. Iterations:                     9   Covariance Type:             nonrobust
                         coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------------------------------------------------
gr_low                 6.3300      2.023      3.129      0.002       2.365      10.295
gr_high                1.966

In [12]:
# Score every row of the full dataset with the fitted negative binomial model.
# df_ (the frame with the constant column) supplies the regressors.
nb2_prediction = nb2_training_results.get_prediction(df_[features])

# Attach the predictions by index label rather than by position: df is rebound
# to a sorted frame below while df_ keeps its original order, so a positional
# assignment would pair scores with the wrong rows if this cell is run again.
df['oscar_appeal'] = pd.Series(np.asarray(nb2_prediction.predicted_mean), index=df_.index)

# Highest predicted value first.
df = df.sort_values('oscar_appeal', ascending=False)

In [13]:
# For each year 1990..2019 build a lookup {oscar_appeal score -> 1-based rank},
# where rank 1 is the lowest score of that year. Equal scores share one key,
# so a tied score ends up with the largest rank among the ties.
ap_results = {}
for year in range(1990, 2020):
    yearly_scores = df.loc[df['year'] == year, 'oscar_appeal']
    ap_results[year] = {
        score: rank for rank, score in enumerate(sorted(yearly_scores), start=1)
    }

In [14]:
# Percentile rating of each row within its year, on a (0, 100] scale:
#     ascending rank of oscar_appeal within the year / rows in that year * 100
# method='max' gives tied scores the highest rank among the ties, which is what
# the ap_results lookup produced. Doing this with a grouped rank avoids
# re-filtering the whole frame for every row, avoids looking ranks up by float
# key, and works for any year present in the data.
df['rating'] = df.groupby('year')['oscar_appeal'].rank(method='max', pct=True) * 100

In [15]:
# Write the resulting frame to disk; index=False leaves the row index out of the file.
df.to_csv('predicted.csv', index=False)