In [23]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import scipy.stats as stats
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/MLDS/Aula 6/TMDB_all_movies.csv')
df.shape

(906547, 25)

In [62]:
# Mudar original_language em uma variável binária (se é ou não inglês)
df['is_en'] =  df['original_language'].apply(lambda x: 1 if x == 'en' else 0)

# Verificar se país de produção contém Estados Unidos, então marca 1, senão, marca 0
df['is_usa'] = df['production_countries'].apply(lambda x: 1 if isinstance(x, str) and 'United States of America' in x else 0)

df['revenue_linear'] = df['revenue'].apply(lambda x: np.log(x) if x > 0 else 0)
df['budget_linear'] = df['budget'].apply(lambda x: np.log(x) if x > 0 else 0)

# verificar se ano do lançamento é maior que 2000
df['year'] = pd.to_datetime(df['release_date']).apply(lambda x:1 if x.year > 2000 else 0)

# missing: trato missing removendo a observação
df1 = df
df1 = df1[df1['vote_count'] > 500]
df1 = df1[df1['revenue'] > 0]
df1 = df1[df1['budget'] > 0]

In [44]:

df1.isnull().sum()
df1 = df1.dropna()


id                           0
title                        0
vote_average                 0
vote_count                   0
status                       0
release_date                 0
revenue                      0
runtime                      0
budget                       0
imdb_id                      0
original_language            0
original_title               0
overview                     0
popularity                   0
tagline                    260
genres                       0
production_companies         0
production_countries         0
spoken_languages             1
cast                       216
director                   215
director_of_photography    464
writers                    219
producers                  250
music_composer             981
is_en                        0
is_usa                       0
revenue_linear               0
budget_linear                0
year                         0
dtype: int64

In [66]:
# separar amostras de treino e amostras de teste
df_train, df_test = train_test_split(df1, test_size=0.5)

# criei minhas amostras de teste com 10%
samples = [df_test.sample(frac=0.1) for _ in range(20)]


In [69]:

X_train = df_train[['revenue_linear', 'budget_linear', 'is_en', 'is_usa', 'year']].to_numpy()
Y_train = df_train['vote_average'] # quem eu quero explicar!!!

linear_model = LinearRegression()
linear_model.fit(X_train, Y_train)
linear_score = linear_model.score(X_train, Y_train)

# linear: 1 beta para cada X
# polinomial: Eu tenho N betas para cada X, tal que N = grau da regressão polinomial

# equação regressão linear
# ŷ = Beta0 + Beta1 * X1 + Beta2 * X2 + BetaN * XN + E

# equação da regressão polinomial grau 2
# ŷ = Beta0 + Beta11 * X1 + Beta12 * X1 + Beta21 * X2 + Beta22 * X2 + BetaN1 * XN + BetaN2 * XN + E

poly_features = PolynomialFeatures(degree=2)
X_poly_train = poly_features.fit_transform(X_train)
model_poly = LinearRegression()
model_poly.fit(X_poly_train, Y_train)
poly_score = model_poly.score(X_poly_train, Y_train)

test_score_poly = []
test_score_linear = []
for sample in samples:
  X_test = sample[['revenue_linear', 'budget_linear', 'is_en', 'is_usa', 'year']].to_numpy()
  Y_test = sample['vote_average']
  test_score_linear.append(linear_model.score(X_test, Y_test))

  X_poly_test = poly_features.fit_transform(X_test)
  test_score_poly.append(model_poly.score(X_poly_test, Y_test))

alpha = 0.1
t_test, p_value = stats.ttest_1samp(test_score_linear, popmean=linear_score)
if p_value > alpha:
  print("Aceito H0 para linear")
else:
  print("Aceito H1 para linear")
print("p-valor linear: ", p_value)
print("Linear score: ", linear_score)
print()

t_test, p_value = stats.ttest_1samp(test_score_poly, popmean=poly_score)
if p_value > alpha:
  print("Aceito H0 para poly")
else:
  print("Aceito H1 para poly")
print("p-valor poly: ", p_value)
print("Poly score: ", poly_score)

Aceito H0 para linear
p-valor linear:  0.2535187059570082
Linear score:  0.0621681478695395

Aceito H0 para poly
p-valor poly:  0.38913648512091636
Poly score:  0.06790268805679767


Aceito H0 para linear
p-valor linear:  0.20799141505854793
Linear score:  0.0623370865363061

Aceito H0 para poly
p-valor poly:  0.8888027641294789
Poly score:  0.06636034316410555