In [10]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [11]:
# the review data is available in the github repo
df_reviews = pd.read_csv('https://tinyurl.com/moviereviewsdata')

# hold out 25% of the rows for testing; the fixed random_state keeps
# the split identical from run to run
df_train, df_test = train_test_split(df_reviews, random_state = 123, test_size = 0.25)

## Regression Metrics

In [12]:
# predictor columns for the regression; 'features' is reused later on
features = [
    "review_year_0",
    "release_year_0",
    "age_sc",
    "length_minutes_sc",
    "total_reviews_sc",
    "word_count_sc",
    "genre",
]

# formula string: rating on the left, the features joined by ' + ' on the right
model = f"rating ~ {' + '.join(features)}"

# ordinary least squares fit on the training split only
model_lr_train = smf.ols(data = df_train, formula = model).fit()

In [13]:
# predictions for the held-out test rows from the model fit on the training split
predictions = model_lr_train.predict(exog = df_test)

### R2

In [14]:
# R2 by hand: one minus the ratio of the residual to the total sum of squares
residual_ss = ((df_test.rating - predictions)**2).sum()
total_ss = ((df_test.rating - df_test.rating.mean())**2).sum()

# show the manual value next to scikit-learn's for comparison
(
    1 - residual_ss / total_ss,
    r2_score(y_true = df_test.rating, y_pred = predictions),
)

(0.508431158347433, 0.508431158347433)

In [15]:
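# conceptually identical to R2, but the value differs slightly on test data:
# the squared correlation ignores any bias or scaling error in the
# predictions, while r2_score does not (calculation not shown)
# np.corrcoef(df_test.rating, predictions)[0, 1]**2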

### RMSE

In [16]:
# MSE by hand: the average squared prediction error on the test set
mse = np.mean((df_test.rating - predictions)**2)
mse

# the same quantity from scikit-learn
mean_squared_error(df_test.rating, predictions)

# RMSE by hand: the square root of the MSE
np.sqrt(mse)

# RMSE via scikit-learn. The 'squared = False' argument was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the root of the MSE instead;
# this form works on every scikit-learn version.
np.sqrt(mean_squared_error(df_test.rating, predictions))



0.4560513738102493

### MAE

In [17]:
# MAE by hand: the average absolute prediction error on the test set
(df_test.rating - predictions).abs().mean()

# the same quantity from scikit-learn (this is the value the cell displays)
mean_absolute_error(y_true = df_test.rating, y_pred = predictions)

0.3704072983307527

### MAPE

In [18]:
# MAPE by hand: absolute error relative to the observed rating, as a percentage
# NOTE(review): this divides by the raw rating rather than its absolute value,
# which is equivalent only when every rating is positive -- confirm for this data
100 * (abs(df_test.rating - predictions) / df_test.rating).mean()

# scikit-learn's value, scaled by 100 to match the hand calculation
# (this is the value the cell displays)
100 * mean_absolute_percentage_error(df_test.rating, predictions)

13.464399850975898

## Classification Metrics