# Random Forest e Bagging

### Imports gerais

In [1]:
import pandas as pd
import numpy as np 
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Imports modelos

In [2]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import BaggingRegressor,BaggingClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.ensemble import AdaBoostRegressor, AdaBoostClassifier
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

Download Dataset: [movies_multilinear_reg.csv](https://s3.amazonaws.com/caelum-online-public/machine-learning-aprendizado-supervisionado/movies_multilinear_reg.csv)

#### Carregando o dataset

In [3]:
# Load the movies dataset; the relative path means the CSV must sit in the
# notebook's working directory.
filmes = pd.read_csv(filepath_or_buffer='movies_multilinear_reg.csv')

In [4]:
# Preview the first five rows to sanity-check the load.
filmes.head(n=5)

Unnamed: 0,movieId,Titulo,Documentary,Sci-Fi,Mystery,Horror,Romance,Thriller,Crime,Fantasy,Comedy,Animation,Children,Drama,Adventure,Duracao,Investimento,Bilheteria
0,1,Toy Story (1995),0,0,0,0,0,0,0,1,1,1,1,0,1,103.46831,11.048216,5623234.602
1,2,Jumanji (1995),0,0,0,0,0,0,0,1,0,0,1,0,1,112.337916,14.927678,5714951.757
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,1,0,0,0,0,116.245732,27.114597,9524339.124
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,0,1,0,0,1,0,120.317732,4.994242,6331568.779
4,5,Father of the Bride Part II (1995),0,0,0,0,0,0,0,0,1,0,0,0,0,110.023572,19.142246,6409617.277


In [5]:
# (rows, columns) of the loaded movies frame.
filmes.shape

(9125, 18)

#### Separando o dataset

In [6]:
# List the column names so the feature/label selection below can be checked.
filmes.columns

Index(['movieId', 'Titulo', 'Documentary', 'Sci-Fi', 'Mystery', 'Horror',
       'Romance', 'Thriller', 'Crime', 'Fantasy', 'Comedy', 'Animation',
       'Children', 'Drama', 'Adventure', 'Duracao', 'Investimento',
       'Bilheteria'],
      dtype='object')

In [7]:
# Feature matrix: the 0/1 genre columns plus duration and investment.
# 'movieId' and 'Titulo' are left out on purpose.
filmes_caract = filmes.loc[:, [
    'Documentary', 'Sci-Fi', 'Mystery', 'Horror', 'Romance',
    'Thriller', 'Crime', 'Fantasy', 'Comedy', 'Animation',
    'Children', 'Drama', 'Adventure',
    'Duracao', 'Investimento',
]]
# Label vector: box-office revenue.
filmes_labels = filmes.loc[:, 'Bilheteria']

#### Split dos datasets em treino e teste

In [8]:
# Hold out the default 25% of the rows for testing. The seed is fixed so that
# "Restart Kernel -> Run All" yields the same partition (and therefore the
# same scores) every time; without it each run shuffles differently.
treino, teste, treino_labels, teste_labels = train_test_split(
    filmes_caract, filmes_labels, random_state=42
)
print('Shape do treino {}, Shape do treino_labels {}'.format(treino.shape, treino_labels.shape))
print('Shape do teste {}, Shape do teste_labels {}'.format(teste.shape, teste_labels.shape))
print('Proporção Treino / Teste: {0:.2f}% / {1:.2f}%'.format(len(treino)/len(filmes_caract)*100, len(teste)/len(filmes_caract)*100))

Shape do treino (6843, 15), Shape do treino_labels (6843,)
Shape do teste (2282, 15), Shape do teste_labels (2282,)
Proporção Treino / Teste: 74.99% / 25.01%


# Usando modelo BaggingRegressor

In [9]:
# Bagging resamples the training set at random; pin the seed so the fitted
# ensemble (and its scores) are reproducible across runs.
modelo_br = BaggingRegressor(random_state=42)
modelo_br.fit(treino, treino_labels)

BaggingRegressor()

In [10]:
# Score the bagging model on the data it was fit on and on the held-out rows.
# NOTE(review): for scikit-learn regressors `score` is the R² coefficient,
# not a hit rate, so the "Acertamos" wording in the message is loose.
score_treino, score_teste = (
    modelo_br.score(treino, treino_labels),
    modelo_br.score(teste, teste_labels),
)
print(
    'BaggingRegressor: Acertamos {0:.2f}% no treino, Acertamos {1:.2f}% no teste'
    .format(100 * score_treino, 100 * score_teste)
)

BaggingRegressor: Acertamos 96.36% no treino, Acertamos 78.90% no teste


### Para comparação usando modelo LinearRegression

In [11]:
# Plain least-squares baseline to compare the ensemble against.
modelo_reg = LinearRegression()
modelo_reg.fit(X=treino, y=treino_labels)

LinearRegression()

In [12]:
# Train vs. held-out score for the linear baseline.
# NOTE(review): `score` on a scikit-learn regressor is R², not accuracy.
score_treino, score_teste = (
    modelo_reg.score(treino, treino_labels),
    modelo_reg.score(teste, teste_labels),
)
print(
    'LinearRegression: Acertamos {0:.2f}% no treino, Acertamos {1:.2f}% no teste'
    .format(100 * score_treino, 100 * score_teste)
)

LinearRegression: Acertamos 82.79% no treino, Acertamos 82.66% no teste


### Usando BaggingRegressor(n_estimators=20)

In [13]:
# Same bagging regressor, but with 20 base estimators.
# The seed is fixed so the bootstrap draws, and hence the printed scores,
# are reproducible across runs.
modelo_br_20 = BaggingRegressor(n_estimators=20, random_state=42)
modelo_br_20.fit(treino, treino_labels)
# NOTE(review): `score` on a scikit-learn regressor is R², not a hit rate.
score_treino = modelo_br_20.score(treino, treino_labels)
score_teste = modelo_br_20.score(teste, teste_labels)
print('BaggingRegressor(n_estimators=20): Acertamos {0:.2f}% no treino, Acertamos {1:.2f}% no teste'.
      format(score_treino * 100, score_teste * 100))

BaggingRegressor(n_estimators=20): Acertamos 96.86% no treino, Acertamos 79.72% no teste


# Classificação com BaggingClassifier
Download Dataset: [avaliacoes_usuario.csv](https://s3.amazonaws.com/caelum-online-public/machine-learning-aprendizado-supervisionado/avaliacoes_usuario.csv)

#### Carregando Dataset das avaliações do usuário (gostos)

In [14]:
# Load the user-ratings dataset (relative path: the CSV must be in the
# working directory) and preview its first five rows.
gostos = pd.read_csv(filepath_or_buffer='avaliacoes_usuario.csv')
gostos.head(n=5)

Unnamed: 0,Titulo,Documentary,Sci-Fi,Mystery,Horror,Romance,Thriller,Crime,Fantasy,Comedy,Animation,Children,Drama,Adventure,Duracao,Investimento,Gostou
0,Nixon (1995),0,0,0,0,0,0,0,0,0,0,0,1,0,114.496547,7.930748,1
1,Leaving Las Vegas (1995),0,0,0,0,1,0,0,0,0,0,0,1,0,110.140191,18.276555,1
2,Persuasion (1995),0,0,0,0,1,0,0,0,0,0,0,1,0,105.747597,16.582232,1
3,Babe (1995),0,0,0,0,0,0,0,0,0,0,1,1,0,126.131978,13.004553,1
4,Carrington (1995),0,0,0,0,1,0,0,0,0,0,0,1,0,85.025469,14.41812,1


#### Separando dataset

In [15]:
# List the column names so the feature/label selection below can be checked.
gostos.columns

Index(['Titulo', 'Documentary', 'Sci-Fi', 'Mystery', 'Horror', 'Romance',
       'Thriller', 'Crime', 'Fantasy', 'Comedy', 'Animation', 'Children',
       'Drama', 'Adventure', 'Duracao', 'Investimento', 'Gostou'],
      dtype='object')

In [16]:
# Feature matrix: the 0/1 genre columns plus duration and investment
# ('Titulo' is left out).
caract = gostos.loc[:, [
    'Documentary', 'Sci-Fi', 'Mystery', 'Horror', 'Romance',
    'Thriller', 'Crime', 'Fantasy', 'Comedy', 'Animation',
    'Children', 'Drama', 'Adventure',
    'Duracao', 'Investimento',
]]
# Label vector: the 'Gostou' column (liked / not liked).
labels = gostos.loc[:, 'Gostou']

### Montar treino

In [17]:
# Train/test partition of the ratings data. The seed is fixed so the split,
# and every accuracy computed from it, is reproducible on a fresh kernel.
treino, teste, treino_labels, teste_labels = train_test_split(
    caract, labels, random_state=42
)

#### Reshaping do treino e teste com numpy

#### Usando BaggingClassifier

In [18]:
# Bagging resamples the training set at random; pin the seed so the fitted
# classifier gives the same predictions on every run.
modelo_bc = BaggingClassifier(random_state=42)
modelo_bc.fit(treino, treino_labels)

BaggingClassifier()

In [19]:
# Predict on the held-out rows and report the fraction classified correctly.
previsoes = modelo_bc.predict(teste)
acuracia = accuracy_score(y_true=teste_labels, y_pred=previsoes)
print(
    'Acuracia com BaggingClassifier no teste: {0:.2f}%'
    .format(100 * acuracia)
)

Acuracia com BaggingClassifier no teste: 72.94%


## Usando RandomForestRegressor

Download Dataset: [movies_multilinear_reg.csv](https://s3.amazonaws.com/caelum-online-public/machine-learning-aprendizado-supervisionado/movies_multilinear_reg.csv)

#### Carregando o dataset

In [20]:
# Reload the movies dataset for the random-forest section and preview it.
filmes = pd.read_csv(filepath_or_buffer='movies_multilinear_reg.csv')
filmes.head(n=5)

Unnamed: 0,movieId,Titulo,Documentary,Sci-Fi,Mystery,Horror,Romance,Thriller,Crime,Fantasy,Comedy,Animation,Children,Drama,Adventure,Duracao,Investimento,Bilheteria
0,1,Toy Story (1995),0,0,0,0,0,0,0,1,1,1,1,0,1,103.46831,11.048216,5623234.602
1,2,Jumanji (1995),0,0,0,0,0,0,0,1,0,0,1,0,1,112.337916,14.927678,5714951.757
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,1,0,0,0,0,116.245732,27.114597,9524339.124
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,0,1,0,0,1,0,120.317732,4.994242,6331568.779
4,5,Father of the Bride Part II (1995),0,0,0,0,0,0,0,0,1,0,0,0,0,110.023572,19.142246,6409617.277


#### Separando o dataset

In [21]:
# Feature matrix: the 0/1 genre columns plus duration and investment.
filmes_caract = filmes.loc[:, [
    'Documentary', 'Sci-Fi', 'Mystery', 'Horror', 'Romance',
    'Thriller', 'Crime', 'Fantasy', 'Comedy', 'Animation',
    'Children', 'Drama', 'Adventure',
    'Duracao', 'Investimento',
]]
# Label vector: box-office revenue.
filmes_labels = filmes.loc[:, 'Bilheteria']

#### Split dos datasets em treino e teste

In [22]:
# Hold out the default 25% of the rows for testing. A fixed seed makes the
# partition, and the scores computed from it, reproducible on a fresh kernel.
treino, teste, treino_labels, teste_labels = train_test_split(
    filmes_caract, filmes_labels, random_state=42
)
print('Shape do treino {}, Shape do treino_labels {}'.format(treino.shape, treino_labels.shape))
print('Shape do teste {}, Shape do teste_labels {}'.format(teste.shape, teste_labels.shape))
print('Proporção Treino / Teste: {0:.2f}% / {1:.2f}%'.format(len(treino)/len(filmes_caract)*100, len(teste)/len(filmes_caract)*100))

Shape do treino (6843, 15), Shape do treino_labels (6843,)
Shape do teste (2282, 15), Shape do teste_labels (2282,)
Proporção Treino / Teste: 74.99% / 25.01%


#### Usando modelo RandomForestRegressor

In [23]:
# Random forests randomize both the rows and the features seen by each tree;
# pin the seed so the fitted forest is the same on every run.
modelo_rfr = RandomForestRegressor(random_state=42)
modelo_rfr.fit(treino, treino_labels)

RandomForestRegressor()

In [24]:
# Train vs. held-out score for the random forest.
# NOTE(review): `score` on a scikit-learn regressor is R², not accuracy.
score_treino, score_teste = (
    modelo_rfr.score(treino, treino_labels),
    modelo_rfr.score(teste, teste_labels),
)
print(
    'RandomForestRegressor: Acertamos {0:.2f}% no treino, Acertamos {1:.2f}% no teste'
    .format(100 * score_treino, 100 * score_teste)
)

RandomForestRegressor: Acertamos 97.28% no treino, Acertamos 80.42% no teste


## Classificação com RandomForestClassifier

#### Carregando Dataset das avaliações do usuário (gostos)

In [25]:
# Reload the user-ratings dataset for the random-forest classifier section.
gostos = pd.read_csv(filepath_or_buffer='avaliacoes_usuario.csv')

In [26]:
# Preview the first five rows (the default for `head`).
gostos.head(n=5)

Unnamed: 0,Titulo,Documentary,Sci-Fi,Mystery,Horror,Romance,Thriller,Crime,Fantasy,Comedy,Animation,Children,Drama,Adventure,Duracao,Investimento,Gostou
0,Nixon (1995),0,0,0,0,0,0,0,0,0,0,0,1,0,114.496547,7.930748,1
1,Leaving Las Vegas (1995),0,0,0,0,1,0,0,0,0,0,0,1,0,110.140191,18.276555,1
2,Persuasion (1995),0,0,0,0,1,0,0,0,0,0,0,1,0,105.747597,16.582232,1
3,Babe (1995),0,0,0,0,0,0,0,0,0,0,1,1,0,126.131978,13.004553,1
4,Carrington (1995),0,0,0,0,1,0,0,0,0,0,0,1,0,85.025469,14.41812,1


#### Separando dataset

In [27]:
# List the column names so the feature/label selection below can be checked.
gostos.columns

Index(['Titulo', 'Documentary', 'Sci-Fi', 'Mystery', 'Horror', 'Romance',
       'Thriller', 'Crime', 'Fantasy', 'Comedy', 'Animation', 'Children',
       'Drama', 'Adventure', 'Duracao', 'Investimento', 'Gostou'],
      dtype='object')

In [28]:
# Feature matrix: the 0/1 genre columns plus duration and investment
# ('Titulo' is left out).
caract = gostos.loc[:, [
    'Documentary', 'Sci-Fi', 'Mystery', 'Horror', 'Romance',
    'Thriller', 'Crime', 'Fantasy', 'Comedy', 'Animation',
    'Children', 'Drama', 'Adventure',
    'Duracao', 'Investimento',
]]
# Label vector: the 'Gostou' column (liked / not liked).
labels = gostos.loc[:, 'Gostou']

#### Montar o treino

In [29]:
# Train/test partition of the ratings data. The seed is fixed so the split,
# and every accuracy computed from it, is reproducible on a fresh kernel.
treino, teste, treino_labels, teste_labels = train_test_split(
    caract, labels, random_state=42
)

## Usando RandomForestClassifier

In [30]:
# Random forests randomize both the rows and the features seen by each tree;
# pin the seed so the fitted classifier is the same on every run.
modelo_rfc = RandomForestClassifier(random_state=42)
modelo_rfc.fit(treino, treino_labels)

RandomForestClassifier()

In [31]:
# Predict on the held-out rows and report the fraction classified correctly.
previsoes = modelo_rfc.predict(teste)
acuracia = accuracy_score(y_true=teste_labels, y_pred=previsoes)
print(
    'Acuracia com RandomForestClassifier no teste: {0:.2f}%'
    .format(100 * acuracia)
)

Acuracia com RandomForestClassifier no teste: 77.06%


## Usando RandomForestClassifier(max_features=5, max_depth=5)

In [32]:
# Constrained forest: at most 5 candidate features per split and trees no
# deeper than 5 levels. The seed is fixed so the reported accuracy is
# reproducible instead of changing on every run.
modelo_rfc_max5 = RandomForestClassifier(max_features=5, max_depth=5, random_state=42)
modelo_rfc_max5.fit(treino, treino_labels)
previsoes = modelo_rfc_max5.predict(teste)
acuracia = accuracy_score(teste_labels, previsoes)
print('Acuracia com RandomForestClassifier no teste: {0:.2f}%'.format(acuracia * 100))

Acuracia com RandomForestClassifier no teste: 78.82%


# Boosting

Download Dataset: [movies_multilinear_reg.csv](https://s3.amazonaws.com/caelum-online-public/machine-learning-aprendizado-supervisionado/movies_multilinear_reg.csv)

#### Carregando o dataset

In [33]:
# Reload the movies dataset for the boosting section and preview the first
# five rows (the default for `head`).
filmes = pd.read_csv(filepath_or_buffer='movies_multilinear_reg.csv')
filmes.head(n=5)

Unnamed: 0,movieId,Titulo,Documentary,Sci-Fi,Mystery,Horror,Romance,Thriller,Crime,Fantasy,Comedy,Animation,Children,Drama,Adventure,Duracao,Investimento,Bilheteria
0,1,Toy Story (1995),0,0,0,0,0,0,0,1,1,1,1,0,1,103.46831,11.048216,5623234.602
1,2,Jumanji (1995),0,0,0,0,0,0,0,1,0,0,1,0,1,112.337916,14.927678,5714951.757
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,1,0,0,0,0,116.245732,27.114597,9524339.124
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,0,1,0,0,1,0,120.317732,4.994242,6331568.779
4,5,Father of the Bride Part II (1995),0,0,0,0,0,0,0,0,1,0,0,0,0,110.023572,19.142246,6409617.277


#### Separando o dataset

In [34]:
# Feature matrix: the 0/1 genre columns plus duration and investment.
filmes_caract = filmes.loc[:, [
    'Documentary', 'Sci-Fi', 'Mystery', 'Horror', 'Romance',
    'Thriller', 'Crime', 'Fantasy', 'Comedy', 'Animation',
    'Children', 'Drama', 'Adventure',
    'Duracao', 'Investimento',
]]
# Label vector: box-office revenue.
filmes_labels = filmes.loc[:, 'Bilheteria']

#### Split dos dados treino e teste

In [35]:
# Hold out the default 25% of the rows for testing. A fixed seed makes the
# partition, and the scores computed from it, reproducible on a fresh kernel.
treino, teste, treino_labels, teste_labels = train_test_split(
    filmes_caract, filmes_labels, random_state=42
)
print('Shape do treino {}, Shape do treino_labels {}'.format(treino.shape, treino_labels.shape))
print('Shape do teste {}, Shape do teste_labels {}'.format(teste.shape, teste_labels.shape))
print('Proporção Treino / Teste: {0:.2f}% / {1:.2f}%'.format(len(treino)/len(filmes_caract)*100, len(teste)/len(filmes_caract)*100))

Shape do treino (6843, 15), Shape do treino_labels (6843,)
Shape do teste (2282, 15), Shape do teste_labels (2282,)
Proporção Treino / Teste: 74.99% / 25.01%


## Usando AdaBoostRegressor

In [36]:
# AdaBoost's regressor uses randomness while fitting; pin the seed so the
# fitted model (and its scores) are the same on every run.
modelo_ada = AdaBoostRegressor(random_state=42)
modelo_ada.fit(treino, treino_labels)

AdaBoostRegressor()

In [37]:
# Train vs. held-out score for AdaBoost.
# NOTE(review): `score` on a scikit-learn regressor is R², not accuracy.
score_treino, score_teste = (
    modelo_ada.score(treino, treino_labels),
    modelo_ada.score(teste, teste_labels),
)
print(
    'AdaBoostRegressor: Acertamos {0:.2f}% no treino, Acertamos {1:.2f}% no teste'
    .format(100 * score_treino, 100 * score_teste)
)

AdaBoostRegressor: Acertamos 79.64% no treino, Acertamos 79.26% no teste


## Usando GradientBoostingRegressor 

#### Treinando o modelo

In [38]:
# Fixed seed so repeated runs yield the same fitted gradient-boosting model.
modelo_gbr = GradientBoostingRegressor(random_state=42)
modelo_gbr.fit(treino, treino_labels)

GradientBoostingRegressor()

In [39]:
# Train vs. held-out score for gradient boosting.
# NOTE(review): `score` on a scikit-learn regressor is R², not accuracy.
score_treino, score_teste = (
    modelo_gbr.score(treino, treino_labels),
    modelo_gbr.score(teste, teste_labels),
)
print(
    'GradientBoostingRegressor: Acertamos {0:.2f}% no treino, Acertamos {1:.2f}% no teste'
    .format(100 * score_treino, 100 * score_teste)
)

GradientBoostingRegressor: Acertamos 84.29% no treino, Acertamos 82.44% no teste


## Usando AdaBoostClassifier 

#### Carregando Dataset das avaliações do usuário (gostos)

In [40]:
# Reload the user-ratings dataset for the boosting classifiers and preview
# the first five rows (the default for `head`).
gostos = pd.read_csv(filepath_or_buffer='avaliacoes_usuario.csv')
gostos.head(n=5)

Unnamed: 0,Titulo,Documentary,Sci-Fi,Mystery,Horror,Romance,Thriller,Crime,Fantasy,Comedy,Animation,Children,Drama,Adventure,Duracao,Investimento,Gostou
0,Nixon (1995),0,0,0,0,0,0,0,0,0,0,0,1,0,114.496547,7.930748,1
1,Leaving Las Vegas (1995),0,0,0,0,1,0,0,0,0,0,0,1,0,110.140191,18.276555,1
2,Persuasion (1995),0,0,0,0,1,0,0,0,0,0,0,1,0,105.747597,16.582232,1
3,Babe (1995),0,0,0,0,0,0,0,0,0,0,1,1,0,126.131978,13.004553,1
4,Carrington (1995),0,0,0,0,1,0,0,0,0,0,0,1,0,85.025469,14.41812,1


#### Separando dataset

In [41]:
# Feature matrix: the 0/1 genre columns plus duration and investment
# ('Titulo' is left out).
caract = gostos.loc[:, [
    'Documentary', 'Sci-Fi', 'Mystery', 'Horror', 'Romance',
    'Thriller', 'Crime', 'Fantasy', 'Comedy', 'Animation',
    'Children', 'Drama', 'Adventure',
    'Duracao', 'Investimento',
]]
# Label vector: the 'Gostou' column (liked / not liked).
labels = gostos.loc[:, 'Gostou']

In [42]:
# Train/test partition of the ratings data. The seed is fixed so the split,
# and every accuracy computed from it, is reproducible on a fresh kernel.
treino, teste, treino_labels, teste_labels = train_test_split(
    caract, labels, random_state=42
)

In [43]:
# Fit AdaBoost on the training rows and measure accuracy on the held-out
# rows. The seed is fixed so the reported accuracy is reproducible.
modelo_ada_cl = AdaBoostClassifier(random_state=42)
modelo_ada_cl.fit(treino, treino_labels)
previsoes = modelo_ada_cl.predict(teste)
acuracia = accuracy_score(teste_labels, previsoes)
print('Acuracia com AdaBoostClassifier no teste: {0:.2f}%'.format(acuracia * 100))

Acuracia com AdaBoostClassifier no teste: 80.00%


## Usando GradientBoostingClassifier

In [44]:
# Fit gradient boosting on the training rows and measure accuracy on the
# held-out rows. The seed is fixed so the reported accuracy is reproducible.
modelo_gb_cl = GradientBoostingClassifier(random_state=42)
modelo_gb_cl.fit(treino, treino_labels)
previsoes = modelo_gb_cl.predict(teste)
acuracia = accuracy_score(teste_labels, previsoes)
print('Acuracia com GradientBoostingClassifier no teste: {0:.2f}%'.format(acuracia * 100))

Acuracia com GradientBoostingClassifier no teste: 77.65%
