Méthodes d'ensemble
===================

**Author:** Laurent Siksous




# Table des matières







## Preamble



### Imports



In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error

import pickle

# Out[69]:

    # Out[2]:



### Functions



In [1]:
# Display all
def display_all(df, max_rows=100, max_columns=100):
    """Display ``df`` with pandas' row/column display limits temporarily raised.

    Parameters
    ----------
    df : object
        Object passed to ``display`` (typically a DataFrame).
    max_rows : int, optional
        Value applied to ``display.max_rows`` during the call (default 100).
    max_columns : int, optional
        Value applied to ``display.max_columns`` during the call (default 100).

    The pandas options are only changed inside the ``with`` block, so the
    previous settings are back in effect once the call returns.
    """
    with pd.option_context("display.max_rows", max_rows,
                           "display.max_columns", max_columns):
        # `display` is the rich-output helper available in IPython sessions.
        display(df)

# Out[70]:

### Org



In [1]:
# Org-mode table formatter
import IPython
import tabulate

class OrgFormatter(IPython.core.formatters.BaseFormatter):
    """IPython display formatter that produces a ``text/org`` representation."""
    # MIME-type key this formatter's output is published under.
    format_type = IPython.core.formatters.Unicode('text/org')
    # Name of the method looked up on objects to obtain their Org representation.
    print_method = IPython.core.formatters.ObjectName('_repr_org_')

def pd_dataframe_to_org(df):
    """Return ``df`` rendered as an Org-mode table string by ``tabulate``.

    The table is produced with the ``orgtbl`` format, ``headers='keys'`` and
    the index always shown.
    """
    org_options = {
        'headers': 'keys',
        'tablefmt': 'orgtbl',
        'showindex': 'always',
    }
    return tabulate.tabulate(df, **org_options)

# Handle on the running IPython shell.
ip = get_ipython()

# Create the Org formatter and register it under the 'text/org' MIME key.
f = OrgFormatter()
ip.display_formatter.formatters['text/org'] = f

# Render pandas DataFrames through pd_dataframe_to_org; the type is named by
# module path and class name rather than passed as a class object.
f.for_type_by_name('pandas.core.frame', 'DataFrame', pd_dataframe_to_org)

# Out[71]:

## Model Selection



### Load Data



In [1]:
# Read the train and test sets from their CSV files.
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

# Summary statistics of the training set, transposed to one row per column.
display_all(train.describe(include='all').transpose())

#+begin_example
# Out[72]:
|            |   count |       mean |         std |         min |        25% |        50% |        75% |         max |
|------------+---------+------------+-------------+-------------+------------+------------+------------+-------------|
| MedInc     |   16512 |    3.88075 |    1.90429  |    0.4999   |    2.5667  |    3.5458  |    4.77318 |    15.0001  |
| HouseAge   |   16512 |   28.6083  |   12.6025   |    1        |   18       |   29       |   37       |    52       |
| AveRooms   |   16512 |    5.43524 |    2.38737  |    0.888889 |    4.45205 |    5.23587 |    6.06104 |   141.909   |
| AveBedrms  |   16512 |    1.09668 |    0.433215 |    0.333333 |    1.00651 |    1.04929 |    1.10035 |    25.6364  |
| Population |   16512 | 1426.45    | 1137.06     |    3        |  789       | 1167       | 1726       | 35682       |
| AveOccup   |   16512 |    3.09696 |   11.5787   |    0.692308 |    2.4288  |    2.81724 |    3.28    |  1243.33    |
| Latitude   |   1651

### Normalizing



In [1]:
# Targets: the 'AvePrice' column of each split, as NumPy arrays.
y = train['AvePrice'].to_numpy()
y_test = test['AvePrice'].to_numpy()

# Features: every other column, as NumPy arrays.
X = train.drop(columns='AvePrice').to_numpy()
X_test = test.drop(columns='AvePrice').to_numpy()

# Column labels of the test frame (target column included).
features = test.columns

# Out[73]:

In [1]:
# The scaler is fitted on the training features only and then reused,
# unchanged, on the test features.
scaler = MaxAbsScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# The rebuilt frame holds the scaled features followed by the target, so the
# labels are assembled in that same order. Using `features` as-is would
# mislabel the columns whenever 'AvePrice' is not the last column of the CSV.
scaled_columns = [col for col in features if col != 'AvePrice'] + ['AvePrice']
test = pd.DataFrame(data=np.c_[X_test_scaled, y_test],
                    columns=scaled_columns)

# NOTE(review): this overwrites the file `test` is loaded from, so running
# the notebook again from the load step scales already-scaled data. Consider
# writing to a separate path — confirm what downstream consumers read first.
test.to_csv('data/test.csv', index=False)

# Out[52]:

## Training



In [1]:
rf = RandomForestRegressor(random_state=42)

param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8]
}

gm = GridSearchCV(estimator=rf, param_grid=param_grid, cv= 5)
gm.fit(X, y)
gm.best_params_

# Out[74]:
: {'max_depth': 8, 'max_features': 'log2', 'n_estimators': 500}

In [1]:
gm.score(X, y)
#gm.cv_results_

# Out[78]:
: 0.7980230776655566

### Save model



In [1]:
# Serialize the best estimator found by the grid search to disk.
with open('data/RF_california.pkl', 'wb') as model_file:
    pickle.dump(gm.best_estimator_, model_file)

# Out[79]:

## Bibliography



### References




[california.bib](california.bib)

