## Regression template

### 1. Import libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
from tqdm.notebook import tqdm

In [2]:
# Fix the seeds of NumPy's and Python's global RNGs so reruns are repeatable.
np.random.seed(42)
random.seed(42)

# Raise pandas' display limits so wide/long frames are not truncated.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)

# Apply seaborn's default plot theme.
sns.set()

# Register tqdm's progress-bar variants of the pandas apply methods.
tqdm.pandas()

In [None]:
from sklearn.model_selection import train_test_split, cross_validate, RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
import xgboost as xgb

In [None]:
from helper_funcs import *

### 2. Reading data

In [None]:
# TODO: fill in the path to the dataset CSV (left empty in this template).
# index_col=0 makes the first CSV column the DataFrame index.
df = pd.read_csv(filepath_or_buffer='', index_col=0)

### 3. Exploring data

### 4. Feature engineering

### 5. Preparing X,y

In [None]:
# Split the frame into the feature matrix and the regression target.
# NOTE(review): `feats` and `target` are not defined in the visible cells;
# presumably they are set in the feature-engineering section - confirm.
X, y = df[feats], df[target]

In [None]:
# One-hot encode the categorical feature columns.
X_hot = pd.get_dummies(data=X)

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_hot,
    y,
    test_size=0.20,
    random_state=42,
)

### 6. Models with default parameters

#### 6.1 Random forest

In [None]:
# Baseline: random forest with scikit-learn's default hyperparameters,
# fitted on the training split.
rf = RandomForestRegressor()
rf.fit(X=X_train, y=y_train)

In [None]:
# Score the baseline forest on both splits.
# NOTE(review): `score_model` is not defined in the visible cells; presumably
# it comes from the `helper_funcs` star import - confirm its return value there.
rf_scores = score_model(
    rf,
    X_train,
    y_train,
    X_test,
    y_test,
)

In [None]:
# 5-fold cross-validation of a 100-tree forest on the training split,
# collecting both R^2 and (negated) mean absolute error per fold.
# NOTE: this rebinds `rf` to a new, unfitted estimator; cross_validate fits
# clones and leaves the object passed in untouched.
rf = RandomForestRegressor(n_estimators=100)
cv_results = cross_validate(rf, X_train, y_train, cv=5, scoring=['neg_mean_absolute_error','r2'])

print('test scores r2 mean:',cv_results['test_r2'].mean(), 'test scores r2 std:',cv_results['test_r2'].std())

# scikit-learn reports MAE negated (so that higher is always better);
# flip the sign back to report the error itself.
cv_mae = -cv_results['test_neg_mean_absolute_error']
print('test scores mae mean:', cv_mae.mean(), 'test scores mae std:', cv_mae.std())

### 7. Hyperparameter search (randomized)

#### 7.1 Random forest

In [None]:
# Candidate values for each random forest hyperparameter.
n_estimators = [1, 5, 10, 50, 100, 200, 500]
max_depth = [1, 5, 10, 20, 30, 50]

# 1.0 means "consider all features at every split". It replaces the former
# 'auto' option, which meant the same for regressors but was deprecated in
# scikit-learn 1.1 and removed in 1.3, where it makes the fit fail.
max_features = [1.0, 'sqrt']
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]

# Search space keyed by RandomForestRegressor constructor argument names.
random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

%%time
#neg_mean_absolute_error, neg_mean_squared_error
rf_grid = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator=rf_grid, param_distributions=random_grid, scoring='neg_mean_absolute_error',n_iter=100, cv=5, verbose=2, random_state=42, n_jobs=-1)
rf_random.fit(X_train, y_train)

# Parameter combination that achieved the best mean cross-validation score.
rf_random.best_params_

# Take the winning estimator from the search and score it the same way as
# the baseline model, labelled 'best_rf'.
rf_best = rf_random.best_estimator_
rf_best_scores = score_model(
    rf_best,
    X_train,
    y_train,
    X_test,
    y_test,
    name='best_rf',
)