In [None]:
# core processing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# model
import statsmodels.api as sm
import statsmodels.formula.api as smf



## Main Model

In [None]:
# Movie reviews data (all data found on github repo).
reviews_url = 'https://tinyurl.com/moviereviewsdata'
df_reviews = pd.read_csv(reviews_url)

# Simple linear regression: rating modeled by word_count alone.
# The model is specified first and fit in a separate step.
spec_lr_rating = smf.ols(formula = 'rating ~ word_count', data = df_reviews)
model_lr_rating = spec_lr_rating.fit()

# Last expression of the cell, so the slim summary table is displayed.
model_lr_rating.summary(slim = True)

In [None]:
# A single hypothetical review with a word count of 5.
df_prediction = pd.DataFrame([{'word_count': 5}])

# predict() with no arguments is evaluated on the data used to fit the model;
# passing a frame evaluates the model on those rows instead.
all_predictions = model_lr_rating.predict()
single_prediction = model_lr_rating.predict(exog = df_prediction)

In [None]:
# Interval estimates for the single new observation at alpha = 0.05
# (i.e. 95% intervals).
pred_intervals = (
    model_lr_rating
    .get_prediction(df_prediction)
    .summary_frame(alpha = 0.05)
)

# summary_frame() already returns a DataFrame, so it is displayed directly
# rather than being copied through pd.DataFrame() first.
pred_intervals

In [None]:
# Compare observed ratings against the model's predictions for the same rows.
predictions = model_lr_rating.predict()
y = df_reviews.rating

# Explicit figure/axes with labels so the plot can be read on its own;
# the trailing semicolon keeps the return value's repr out of the cell output.
fig, ax = plt.subplots()
ax.scatter(y, predictions)
ax.set(
    xlabel = 'Observed rating',
    ylabel = 'Predicted rating',
    title = 'Observed vs. predicted rating',
);

In [None]:
# Square root of the model's estimated residual variance (`scale`),
# reported here as the RMSE.
# NOTE(review): for OLS, statsmodels documents `scale` as SSR / residual df,
# so this is the residual standard error rather than sqrt(SSR / n) — confirm
# that is the intended quantity.
residual_variance = model_lr_rating.scale
np.sqrt(residual_variance)

## Multiple Features

In [None]:
# Predictors for the multi-feature model, in the order they enter the formula.
features_extra = [
    'word_count',
    'age',
    'review_year',
    'release_year',
    'length_minutes',
    'children_in_home',
    'total_reviews',
]

# The formula is assembled by joining the terms. Backslash continuations
# inside a string literal pull the source indentation into the formula text
# and turn into a syntax error if anything (even a space) follows a backslash.
model_lr_rating_extra = smf.ols(
    formula = 'rating ~ ' + ' + '.join(features_extra),
    data = df_reviews
).fit()

# Last expression of the cell, so the slim summary table is displayed.
model_lr_rating_extra.summary(slim = True)

In [None]:
# Feature values for one hypothetical review, kept as a single record so the
# column order of the resulting frame follows the order written here.
new_observation_values = {
    'word_count': 12,
    'age': 30,
    'children_in_home': 1,
    'review_year': 2020,
    'release_year': 2015,
    'length_minutes': 100,
    'total_reviews': 10000,
}

# One-row frame labelled 'new_observation'.
predict_observation = pd.DataFrame(
    [new_observation_values],
    index = ['new_observation'],
)

# Predicted rating from the multi-feature model for the single hypothetical
# observation; as the cell's last expression, the result is displayed.
model_lr_rating_extra.predict(predict_observation)

In [None]:
# Model with word_count plus season as predictors of rating.
# NOTE(review): season is presumably a categorical (string) column that the
# formula interface expands into indicator terms — confirm against the data.
spec_lr_cat = smf.ols("rating ~ word_count + season", data = df_reviews)
model_lr_cat = spec_lr_cat.fit()

# Last expression of the cell, so the slim summary table is displayed.
model_lr_cat.summary(slim = True)

In [None]:
# ANOVA table for the word_count + season model fitted above.
# NOTE(review): no `typ` argument is passed, so statsmodels' default ANOVA
# type applies — presumably Type I (sequential); confirm this is intended.
sm.stats.anova_lm(model_lr_cat)