# 1. Import packages

In [1]:
import os
import numpy as np 
import pandas as pd

import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
from sklearn import set_config
set_config(display='diagram')

KeyboardInterrupt: 

In [None]:
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# 2. Load data

In [None]:
# Read the training CSV, using its `id` column as the row index.
train_csv_path = "Data/30-days-of-ml/train.csv"
train_data = pd.read_csv(train_csv_path, index_col="id")

# Preview the first five rows.
display(train_data.head(n=5))

In [None]:
# Report the (rows, columns) dimensions of the training frame.
train_shape = train_data.shape
print("train data shape:", train_shape)

In [None]:
# Count missing cells: NaN count per column first, then summed over all columns.
nan_per_column = train_data.isna().sum()
print("total NaN values train data: ", nan_per_column.sum())

In [None]:
# Flag rows that duplicate another row of the frame, then count the flags.
duplicate_mask = train_data.duplicated()
print("total duplicated values in train data:", duplicate_mask.sum())

# 3. EDA

In [None]:
# Descriptive statistics of the train data, transposed so that each
# described column becomes one row of the displayed table.
train_data.describe().transpose()

## 3.1 Target variable

In [None]:
# Histogram of the raw target (bars normalised to percent) with a box plot
# drawn in the margin.
histogram_options = dict(
    x="target",
    marginal='box',
    histnorm="percent",
    nbins=20,
    width=800,
    height=600,
)
fig = px.histogram(train_data, **histogram_options)
fig.show()

In [None]:
# Drop target outliers: keep only rows whose target lies within 1.5 * IQR
# below the first quartile and 1.5 * IQR above the third quartile
# (both bounds inclusive).
Q1, Q3 = train_data.target.quantile(0.25), train_data.target.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

inside_fences = train_data.target.between(lower_fence, upper_fence)
train_data_filter = train_data[inside_fences]
train_data_filter.head()

In [None]:
# Dimensions of the frame after outlier removal, shown as (rows, columns).
n_rows, n_cols = train_data_filter.shape
(n_rows, n_cols)

In [None]:
# Descriptive statistics of the outlier-filtered data, one row per described column.
train_data_filter.describe().transpose()

In [None]:
# Same target histogram as before, now drawn from the outlier-filtered frame
# so the effect of the filtering step can be compared visually.
histogram_options = dict(
    x="target",
    marginal='box',
    histnorm="percent",
    nbins=20,
    width=800,
    height=600,
)
fig = px.histogram(train_data_filter, **histogram_options)
fig.show()

## 3.2 Other variables

In [None]:
# Numerical columns: every column whose dtype is anything other than `object`.
# The selector itself is kept because it is applied again to the feature frame.
numerical_selector = make_column_selector(dtype_exclude=object)
numerical_columns = numerical_selector(train_data_filter)

In [None]:
# One panel per numerical column on a 5 x 3 grid (7 x 7 inches per panel).
fig, axes = plt.subplots(nrows=5, ncols=3, figsize=(21, 35))

for col_name, ax in zip(numerical_columns, axes.ravel()):
    if col_name != "target":
        # Marker-only plot of the values against the row index; tiny,
        # mostly transparent markers keep dense regions readable.
        ax.plot(
            train_data_filter[col_name],
            linestyle="None",
            marker="o",
            markersize=1,
            alpha=0.1,
        )
        ax.set_title(col_name)
        continue
    # The target has its own section above, so its panel is removed.
    fig.delaxes(ax)

In [None]:
# Categorical columns: the columns stored with the `object` dtype.
# The selector itself is kept because it is applied again to the feature frame.
categorical_selector = make_column_selector(dtype_include=object)
categorical_columns = categorical_selector(train_data_filter)

# Number of categorical columns found.
len(categorical_columns)

In [None]:
# One bar chart per categorical column on a 2 x 5 grid.
fig, axes = plt.subplots(nrows=2, ncols=5, figsize=(25, 10))

n_filtered_rows = len(train_data_filter)
for col_name, ax in zip(categorical_columns, axes.ravel()):
    # Occurrences of each category, ordered by category label.
    value_counts = train_data_filter[col_name].value_counts().sort_index()
    x = value_counts.index.to_list()
    # Convert raw counts to a percentage of all filtered rows.
    y = value_counts.to_numpy() * 100 / n_filtered_rows
    ax.bar(x=x, height=y)
    ax.set_title(col_name)

# 4. Model

In [None]:
# Separate the predictors from the regression target.
features = train_data_filter.drop("target", axis=1)
target   = train_data_filter.target

# Re-apply the selectors to the feature frame, which no longer contains the
# `target` column.
numerical_columns   = numerical_selector(features)
categorical_columns = categorical_selector(features)

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Min-max scale the numerical columns and integer-encode the categorical ones.
ct = make_column_transformer(
    (MinMaxScaler(), numerical_columns),
    (OrdinalEncoder(), categorical_columns)
)

# `criterion` is intentionally left at its default, which is the squared-error
# criterion in every scikit-learn release. The explicit "mse" spelling was
# deprecated in scikit-learn 1.0 and removed in 1.2 (where it raises), while
# its replacement "squared_error" is rejected by releases older than 1.0.
pipe = Pipeline([
    ('scaler', ct),
    ('rf', RandomForestRegressor(random_state=42))
])
pipe

In [None]:
# Candidate forest sizes. The `rf__` prefix routes the parameter to the
# pipeline step named 'rf'.
parameters = {
    'rf__n_estimators' : [100, 250, 500, 1000],
}

# `scoring` must be a scorer name or a callable with the signature
# scorer(estimator, X, y). A raw metric such as `mean_squared_error` expects
# (y_true, y_pred) and therefore cannot be passed directly. The built-in
# "neg_mean_squared_error" scorer is used instead: GridSearchCV always
# maximises the score, so the error is negated to make "higher is better"
# select the lowest MSE.
model = GridSearchCV(
    pipe,
    parameters,
    cv=5,
    scoring="neg_mean_squared_error",
    n_jobs=-1,
    verbose=1,
)
model

In [None]:
# Run the cross-validated grid search on the training split only; the
# held-out test split is not touched here.
model.fit(X=X_train, y=y_train)

# 5. Final predictions

In [None]:
# test_data  = pd.read_csv("Data/30-days-of-ml/test.csv", index_col="id")
# display(test_data.head())