# Análisis de regresión

In [45]:
import numpy as np
import pandas as pd
import cufflinks as cf
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.metrics import accuracy_score, roc_auc_score, mean_absolute_error, r2_score

cf.go_offline()
pd.set_option("display.max_columns", 50)
pd.set_option('display.float_format', lambda x: "{:,.2f}".format(x))

## Funciones relevantes

In [46]:
def normalize_content(texto):
    if texto in ['G', 'TV-G']:
        return "Publico General"
    if texto in ['R', 'NC-17', 'X']:
        return "Restricted"
    if texto in ['M', 'PG', 'GP', 'PG-13','TV-PG','TV-14'] :
        return "Parental Guiadance"
    if texto in ['Approved', 'Passed']:
        return "Approved"
    if texto in ['Not Rated', 'Unrated']:
        return "Not Rated"

In [47]:
def plot_histogram(df, feature):
    return df[[feature]].iplot(kind="hist", title = f"{feature} histogram", colors=["#296EAA"])

## Carga de datos

In [48]:
df = pd.read_csv("../data/movies.csv")

In [49]:
df.shape

(5043, 16)

In [50]:
df.sample(5)

Unnamed: 0,movie_title,movie_imdb_link,color,genre_4,duration,gross,genre_1,genre_2,genre_3,num_voted_users,facenumber_in_poster,language,country,content_rating,title_year,imdb_score
2949,Despicable Me,http://www.imdb.com/title/tt1323594/?ref_=fn_t...,Color,,87.0,,Animation,Comedy,Family,385943,0,English,USA,PG,2010.0,7.7
2502,The Pirate,http://www.imdb.com/title/tt0040694/?ref_=fn_t...,Color,Romance,102.0,2956000.0,Adventure,Comedy,Musical,3258,0,English,USA,Approved,1948.0,7.1
922,Fright Night,http://www.imdb.com/title/tt1438176/?ref_=fn_t...,Color,,106.0,,Comedy,Horror,,85024,1,English,USA,R,2011.0,6.4
3800,The Story of Us,http://www.imdb.com/title/tt0160916/?ref_=fn_t...,Color,,95.0,,Comedy,Drama,Romance,18404,1,English,USA,R,1999.0,5.9
2661,Things to Do in Denver When You're Dead,http://www.imdb.com/title/tt0114660/?ref_=fn_t...,Color,,115.0,529766.0,Crime,Drama,,22345,2,English,USA,R,1995.0,6.8


## Análisis exploratorio

In [51]:
ls_disc = ["color", "genre_4", "genre_1", "genre_2", "genre_3", "language", "country", "content_rating"]
ls_cont = ["duration", "gross", "num_voted_users", "facenumber_in_poster", "title_year"]
target = "imdb_score"

In [52]:
plot_histogram(df=df, feature=target)

In [53]:
df[ls_cont+[target]].describe(percentiles = [0.1, 0.5, 0.95, 0.99])

Unnamed: 0,duration,gross,title_year,imdb_score
count,5028.0,1505.0,4935.0,5043.0
mean,107.2,47523599.37,2042.59,6.46
std,25.2,70034507.15,2818.52,1.71
min,7.0,721.0,1916.0,1.6
10%,86.0,335377.2,1988.0,5.0
50%,103.0,24792061.0,2005.0,6.6
95%,146.0,177159421.4,2015.0,8.1
99%,189.0,336530303.0,2016.0,8.5
max,511.0,760505847.0,200000.0,98.0


In [54]:
df[df["gross"] >= 7.6*10**8]

Unnamed: 0,movie_title,movie_imdb_link,color,genre_4,duration,gross,genre_1,genre_2,genre_3,num_voted_users,facenumber_in_poster,language,country,content_rating,title_year,imdb_score
4425,Avatar,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,Color,Sci-Fi,178.0,760505847.0,Action,Adventure,Fantasy,886204,0,English,USA,PG-13,2009.0,7.9


In [55]:
df[df[target] == 98]


Unnamed: 0,movie_title,movie_imdb_link,color,genre_4,duration,gross,genre_1,genre_2,genre_3,num_voted_users,facenumber_in_poster,language,country,content_rating,title_year,imdb_score
3319,Sanctuary,nudity|party|pirate|swimsuit|three word title,Color,,82.0,,Comedy,Horror,Thriller,Quite a Conundrum,John Lucas,8,English,USA,200000.0,98.0


In [56]:
df = df[df[target] != 98].reset_index(drop = True)

In [57]:
df["num_voted_users"] = df["num_voted_users"].astype(float)

In [58]:
df[ls_cont+[target]].describe(percentiles = [0.1, 0.5, 0.95, 0.99])

Unnamed: 0,duration,gross,num_voted_users,title_year,imdb_score
count,5027.0,1505.0,5042.0,4934.0,5042.0
mean,107.21,47523599.37,83684.73,2002.47,6.44
std,25.2,70034507.15,138493.99,12.48,1.13
min,7.0,721.0,5.0,1916.0,1.6
10%,86.0,335377.2,1649.7,1988.0,5.0
50%,103.0,24792061.0,34371.0,2005.0,6.6
95%,146.0,177159421.4,332265.45,2015.0,8.09
99%,189.0,336530303.0,681112.44,2016.0,8.5
max,511.0,760505847.0,1689764.0,2016.0,9.5


In [59]:
for col in ls_disc:
    display(df[col].value_counts(True).reset_index())

Unnamed: 0,index,color
0,Color,0.96
1,Black and White,0.04


Unnamed: 0,index,genre_4
0,Thriller,0.34
1,Romance,0.13
2,Family,0.1
3,Sci-Fi,0.08
4,Fantasy,0.07
5,Mystery,0.05
6,War,0.04
7,Drama,0.04
8,Sport,0.03
9,History,0.03


Unnamed: 0,index,genre_1
0,Comedy,0.26
1,Action,0.23
2,Drama,0.19
3,Adventure,0.09
4,Crime,0.07
5,Biography,0.05
6,Horror,0.05
7,Documentary,0.02
8,Animation,0.01
9,Fantasy,0.01


Unnamed: 0,index,genre_2
0,Drama,0.27
1,Adventure,0.11
2,Crime,0.09
3,Romance,0.09
4,Comedy,0.08
5,Horror,0.05
6,Thriller,0.05
7,Mystery,0.05
8,Family,0.04
9,Fantasy,0.04


Unnamed: 0,index,genre_3
0,Thriller,0.18
1,Romance,0.15
2,Drama,0.12
3,Sci-Fi,0.09
4,Fantasy,0.07
5,Family,0.06
6,Comedy,0.06
7,Mystery,0.06
8,Crime,0.04
9,Horror,0.03


Unnamed: 0,index,language
0,English,0.93
1,French,0.01
2,Spanish,0.01
3,Hindi,0.01
4,Mandarin,0.01
5,German,0.0
6,Japanese,0.0
7,Italian,0.0
8,Russian,0.0
9,Cantonese,0.0


Unnamed: 0,index,country
0,USA,0.76
1,UK,0.09
2,France,0.03
3,Canada,0.03
4,Germany,0.02
...,...,...
60,Kenya,0.00
61,Bahamas,0.00
62,Aruba,0.00
63,Egypt,0.00


Unnamed: 0,index,content_rating
0,R,0.45
1,PG-13,0.31
2,PG,0.15
3,Not Rated,0.02
4,G,0.02
5,Unrated,0.01
6,Approved,0.01
7,TV-14,0.01
8,TV-MA,0.0
9,TV-PG,0.0


In [60]:
df[ls_cont+ls_disc].isnull().sum() / len(df)

duration               0.00
gross                  0.70
num_voted_users        0.00
facenumber_in_poster   0.00
title_year             0.02
color                  0.00
genre_4                0.72
genre_1                0.00
genre_2                0.13
genre_3                0.39
language               0.00
country                0.00
content_rating         0.06
dtype: float64

In [61]:
ls_drop = ["gross", "genre_4", "genre_3", "title_year"]


In [62]:
ls_cont = [x for x in ls_cont if x not in ls_drop]
ls_disc = [x for x in ls_disc if x not in ls_drop]

In [63]:
df = df.dropna(subset = ls_cont+ls_disc).reset_index(drop = True)

In [64]:
df.shape

(4168, 16)

In [65]:
df.sample(10)

Unnamed: 0,movie_title,movie_imdb_link,color,genre_4,duration,gross,genre_1,genre_2,genre_3,num_voted_users,facenumber_in_poster,language,country,content_rating,title_year,imdb_score
1882,Lord of War,http://www.imdb.com/title/tt0399295/?ref_=fn_t...,Color,,122.0,,Crime,Drama,Thriller,248123.0,2,English,USA,R,2005.0,7.6
3364,Galaxina,http://www.imdb.com/title/tt0080771/?ref_=fn_t...,Color,,95.0,,Comedy,Sci-Fi,,1955.0,0,English,USA,R,1980.0,3.4
1692,Money Monster,http://www.imdb.com/title/tt2241351/?ref_=fn_t...,Color,,98.0,,Crime,Drama,Thriller,19611.0,1,English,USA,R,2016.0,6.7
1051,The Matrix Reloaded,http://www.imdb.com/title/tt0234215/?ref_=fn_t...,Color,,138.0,,Action,Sci-Fi,,421818.0,0,English,USA,R,2003.0,7.2
948,Red Cliff,http://www.imdb.com/title/tt0425637/?ref_=fn_t...,Color,History,150.0,,Action,Adventure,Drama,36894.0,4,Mandarin,China,R,2008.0,7.4
2086,Chasing Amy,http://www.imdb.com/title/tt0118842/?ref_=fn_t...,Color,,113.0,12006514.0,Comedy,Drama,Romance,114070.0,5,English,USA,R,1997.0,7.3
28,Jack Reacher,http://www.imdb.com/title/tt0790724/?ref_=fn_t...,Color,Thriller,130.0,,Action,Crime,Mystery,226583.0,2,English,USA,PG-13,2012.0,7.0
2381,The Mighty Ducks,http://www.imdb.com/title/tt0104868/?ref_=fn_t...,Color,Sport,100.0,,Comedy,Drama,Family,44502.0,8,English,USA,PG,1992.0,6.4
944,Unbroken,http://www.imdb.com/title/tt1809398/?ref_=fn_t...,Color,War,137.0,115603980.0,Biography,Drama,Sport,103589.0,0,English,USA,PG-13,2014.0,7.2
2155,Timber Falls,http://www.imdb.com/title/tt0857295/?ref_=fn_t...,Color,,100.0,,Horror,Thriller,,5012.0,1,English,USA,R,2007.0,5.3


In [66]:
df["content_rating"] = df["content_rating"].map(normalize_content)

## Modelado

### Preparación de sets

In [67]:
X = df.drop(columns=ls_disc)[ls_cont].join(pd.get_dummies(df[ls_disc]))
y = df[target]

In [68]:
X.shape

(4168, 134)

In [69]:
kb = SelectKBest(k = 5, score_func=f_regression)


In [70]:
kb.fit(X, y)

SelectKBest(k=5, score_func=<function f_regression at 0x7fdeb279ec80>)

In [71]:
ls_best = [x for x, y in zip(X.columns, kb.get_support()) if y]

In [72]:
ls_best

['duration',
 'num_voted_users',
 'color_Color',
 'genre_1_Biography',
 'genre_2_Drama']

In [73]:
X = pd.DataFrame(kb.transform(X), columns = ls_best)

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Modelos

In [75]:
linreg = LinearRegression()

In [76]:
linreg.fit(X_train, y_train)

LinearRegression()

In [77]:
linreg.predict(X_test)

array([6.57245058, 6.59749946, 6.50483826, ..., 5.9938558 , 6.13375507,
       5.68030575])

In [78]:
r2_score(y_pred=linreg.predict(X_test), y_true=y_test)

0.2879342533987256

In [79]:
linreg.coef_

array([ 4.99323784e-03,  2.95173089e-06, -6.48621090e-01,  4.85084328e-01,
        3.47833170e-01])

In [80]:
linreg.coef_.round(3)

array([ 0.005,  0.   , -0.649,  0.485,  0.348])

In [81]:
mean_absolute_error(y_pred=linreg.predict(X_test), y_true=y_test)

0.6781526953364576

In [39]:
yc_train = (y_train > y_train.mean())*1
yc_test = (y_test > y_train.mean())*1

In [89]:
y_train

951    7.00
3531   5.50
3770   6.50
2768   6.70
68     7.70
       ... 
3026   4.10
2608   5.00
148    7.40
2835   5.40
228    6.70
Name: imdb_score, Length: 3126, dtype: float64

In [82]:
yc_train

3751    0
2492    0
1278    1
3117    1
3606    0
       ..
3226    1
3329    0
1942    1
1458    0
2166    0
Name: imdb_score, Length: 3126, dtype: int64

In [83]:
logreg = LogisticRegression()

In [87]:
logreg.fit(X_train, yc_train)

LogisticRegression()

In [92]:
roc_auc_score(y_score=logreg.predict_proba(X_test)[:,1],y_true=yc_test )


0.5031405156427589

In [86]:
X_train[ls_best]

Unnamed: 0,duration,num_voted_users,color_Color,genre_1_Biography,genre_2_Drama
951,113.00,83560.00,1,0,0
3531,87.00,6265.00,1,0,0
3770,117.00,57674.00,1,0,0
2768,98.00,25450.00,1,0,1
68,109.00,181025.00,1,0,1
...,...,...,...,...,...
3026,97.00,13048.00,1,0,0
2608,85.00,16300.00,1,0,0
148,135.00,90932.00,1,0,1
2835,105.00,39782.00,1,0,1
