In [None]:
# Fetch the competition archive with the Kaggle CLI.
# NOTE(review): presumably requires the CLI to be installed and authenticated — confirm.
!kaggle competitions download -c playground-series-s5e2
# Unpack every zip in the current directory; -u only extracts new or updated files,
# so re-running this cell does not overwrite unchanged data.
!unzip -u *.zip

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import missingno

from sklearn import set_config
set_config(transform_output = "pandas")
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import ShuffleSplit, KFold
from sklearn.model_selection import cross_validate

from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.neural_network import MLPRegressor


# Toggle between a Kaggle kernel (read-only competition input folder) and a
# local run (data files expected in the current working directory).
KAGGLE_RUN = False
working_dir = (
    Path('/kaggle/input/playground-series-s5e2')
    if KAGGLE_RUN
    else Path().cwd()
)


In [None]:
# Load the training data: the main training file and the extra training file
# are stacked into a single frame, both indexed by the `id` column.
train_df = pd.concat([
    pd.read_csv(working_dir/'train.csv', index_col='id'),
    pd.read_csv(working_dir/'training_extra.csv', index_col='id')
    ])

# Index the test set by `id` exactly like the training data. Without this,
# `id` stays an ordinary column and the frame gets a positional RangeIndex,
# so anything built from `test.index` (e.g. the submission) loses the real ids.
test_df = pd.read_csv(working_dir/'test.csv', index_col='id')

train_df

In [None]:
# Print a summary of the training frame (columns, dtypes, non-null counts).
train_df.info()

In [None]:
# Column groups used throughout the notebook.
# Features that are one-hot encoded by the model pipeline.
CATEGORIC_COLUMNS = [
    'Brand',
    'Material',
    'Size',
    'Compartments',
    'Laptop Compartment',
    'Waterproof',
    'Style',
    'Color',
]
# Features that are scaled as continuous values.
NUMERIC_COLUMNS = ['Weight Capacity (kg)']
# Regression target, kept as a one-element list.
TARGET_COLUMN = ['Price']

In [None]:
def get_unique_vals(df:pd.DataFrame, column:str) -> None:
    """Print a short cardinality report for one column of ``df``.

    The report consists of the number of distinct entries returned by
    ``Series.unique()``, followed by the ``value_counts()`` table of the
    column and a separator line.

    Args:
        df: Frame containing the column to inspect.
        column: Name of the column to summarise.
    """
    values = df[column]
    n_distinct = len(values.unique())
    frequency_table = values.value_counts()
    print(
        f'{column} has the following unique entries {n_distinct}',
        f'{frequency_table}',
        '-----------------------------------',
        sep='\n',
    )


# Print the unique-value report for every column listed in CATEGORIC_COLUMNS.
for column in CATEGORIC_COLUMNS:
    get_unique_vals(train_df, column)


In [None]:
# Summary statistics, transposed so that each described column becomes a row.
train_df.describe().T

In [None]:
# Visualise where values are missing across the training frame (missingno matrix plot).
missingno.matrix(train_df)

In [None]:
# missingno heatmap of the training frame.
# NOTE(review): presumably shows how strongly missingness in one column
# correlates with missingness in another — confirm against the missingno docs.
missingno.heatmap(train_df)

In [None]:
# Per-column overview of missing data: absolute count and share of all rows.
null_counts = train_df.isnull().sum().values
null_share_percent = null_counts / len(train_df) * 100

missing_values_train = pd.DataFrame({
    'Feature': train_df.columns,
    'No. of Missing Values': null_counts,
    '% of Missing Values': null_share_percent,
})
missing_values_train

In [None]:
# Count rows whose column values exactly repeat an earlier row
# (the `id` index is not part of the comparison).
train_df.duplicated().sum()

In [None]:
# Distribution overview: one column of panels per feature, with a histogram
# in the top row and a boxplot of the same feature directly below it.
fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(18,10))

for panel_col, feature in enumerate(['Compartments', 'Weight Capacity (kg)', 'Price']):
    sns.histplot(data=train_df, x=feature, ax=ax[0, panel_col])
    sns.boxplot(data=train_df, x=feature, ax=ax[1, panel_col])

plt.show()

In [None]:
# One count plot per categorical column, stacked vertically in a single figure.
fig, ax = plt.subplots(nrows=len(CATEGORIC_COLUMNS), ncols=1, figsize=(10,18))

for axis, category in zip(ax, CATEGORIC_COLUMNS):
    # Only the plotted column is handed to seaborn.
    single_column = train_df[[category]]
    sns.countplot(data=single_column, x=category, ax=axis)

In [None]:
# Pairwise correlation between the numeric feature(s) and 'Compartments'.
correlation_features = [*NUMERIC_COLUMNS, 'Compartments']
correlation = train_df[correlation_features].corr()

# Boolean mask covering the diagonal and upper triangle, so each pair is
# drawn only once in the heatmap.
mask = np.triu(np.ones_like(correlation, dtype=bool))
cmap = sns.diverging_palette(230, 20, as_cmap=True)

fig, ax = plt.subplots(figsize=(9,9))
sns.heatmap(correlation, mask=mask, cmap=cmap, annot=True, square=True, ax=ax)

In [None]:
# Missing-value handling.
# For now every row containing at least one NaN is dropped; other strategies
# are to be re-investigated later.

# Per-column NaN counts before dropping ...
nan_counts_before = train_df.isna().sum().values
print(nan_counts_before)

train_df = train_df.dropna()

# ... and after dropping.
nan_counts_after = train_df.isna().sum().values
print(nan_counts_after)

In [None]:
# Feature engineering (TODO, nothing implemented yet):
# - add combinations of categories
# - or transform numeric features into categories/bins


In [None]:

# Split the cleaned training frame into the regression target and the features.
# The target is selected by its column name (not by the one-element list) so
# that it is a 1-D Series: passing an (n, 1) DataFrame as `y` makes scikit-learn
# regressors emit a column-vector conversion warning on every fit.
target = train_df[TARGET_COLUMN[0]]
train = train_df.drop(columns=TARGET_COLUMN)
# The test frame is used as loaded; `test` is only an alias for `test_df`.
test = test_df

In [None]:
# Display the feature frame (training data without the target column).
train

In [None]:
# Preprocessing: one-hot encode the categorical columns and min-max scale the
# numeric column(s); any column not listed is passed through unchanged.
transformer = ColumnTransformer(
    transformers=[
        # handle_unknown='ignore': the encoder is fitted on rows without NaNs
        # (they are dropped beforehand), so a missing or otherwise unseen
        # category at predict time would raise with the default 'error'.
        # Unknown values are encoded as an all-zero group instead.
        # NOTE(review): assumes the test set can contain such values — confirm.
        (
            'categories',
            OneHotEncoder(sparse_output=False, handle_unknown='ignore'),
            CATEGORIC_COLUMNS,
        ),
        ('weight', MinMaxScaler(), NUMERIC_COLUMNS),
    ], remainder='passthrough'
)

# Full model: preprocessing followed by a random forest. The forest is seeded
# so that repeated runs of the notebook give the same scores, matching the
# seeded KFold splitter used for cross-validation.
pipe = Pipeline(
    steps=[
        ('transform_columns', transformer),
        ('regression', RandomForestRegressor(random_state=42))
        ]
        )


In [None]:
# Estimate the generalisation error of the pipeline with a shuffled, seeded
# 10-fold cross-validation, scored by (negated) root mean squared error.
cv_results_tree_regressor = cross_validate(
    pipe,
    train,
    target,
    cv=KFold(n_splits=10, shuffle=True, random_state=42),
    scoring="neg_root_mean_squared_error",
    n_jobs=6
)

# The scorer returns negative RMSE, so flip the sign to get positive errors.
# The series is labelled after the estimator actually used in the pipeline
# (a random forest, not a single decision tree).
errors_tree_regressor = pd.Series(
    -cv_results_tree_regressor["test_score"], name="Random forest regressor"
)
errors_tree_regressor.describe()

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the 'regression' step of the pipeline: the number
# of trees is held fixed while the split criterion is varied.
param_grid = {
    'regression__n_estimators': [100],
    'regression__criterion': ['squared_error', 'friedman_mse', 'poisson'],
}

# Exhaustive search over the grid, scored by negated RMSE.
cv_search = GridSearchCV(
    pipe,
    param_grid,
    scoring="neg_root_mean_squared_error",
    n_jobs=3,
)

# fit() returns the fitted search object itself.
search_results = cv_search.fit(train, target)

In [None]:
# Build the submission frame: one predicted price per test row, keyed by `id`.
# The ids are taken from the `id` column when the test frame still carries it
# as a regular column, otherwise from the frame's index; either way the
# resulting index is named 'id' so it is written as a proper CSV column.
submission_ids = test['id'] if 'id' in test.columns else test.index

sub_df = pd.DataFrame(
    index=pd.Index(submission_ids, name='id'),
    data={
        # The prediction column must carry the target's name ('Price').
        TARGET_COLUMN[0]: cv_search.predict(test)
    }
)
sub_df

In [None]:
# Only write the submission file when running on Kaggle (KAGGLE_RUN flag).
if KAGGLE_RUN:
    # to_csv writes the frame's index as the first CSV column by default.
    sub_df.to_csv("/kaggle/working/submission.csv")
    # Show the first lines of the written file as a quick sanity check.
    !head /kaggle/working/submission.csv