In [None]:
from typing import List
import pandas as pd
import scipy as sp
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

# Seed passed as `random_state` to the data splits in this notebook.
random_state = 20

# Name of the target column.
response = 'sales'

# Columns the Preprocessor one-hot encodes by default.
categorical = ['productID', 'brandID', 'weekday', 'attribute1']

# Columns the Preprocessor casts to float and scales by default.
# The response column ('sales') is intentionally not part of this list.
numerical = [
    'attribute2', 'clickVolume',
    'avgOriginalUnitPrice', 'avgFinalUnitPrice',
    'ma14SalesVolume',
    'meanAge', 'gender', 'meanEducation', 'maritalStatus', 'plus',
    'meanPurchasePower', 'meanUserLevel', 'meanCityLevel',
]

class Preprocessor:
    """Optional one-hot encoding of categorical columns and scaling of numerical columns.

    The encoder and scaler are always fitted in ``fit``; the two constructor
    flags only control which of them ``transform`` actually applies.

    Parameters
    ----------
    encode_categorical : bool
        If true, ``transform`` replaces the categorical columns with one-hot
        dummy columns (first level of each column dropped).
    scale_numeric : bool
        If true, ``transform`` scales the numerical columns with the fitted
        ``StandardScaler`` (unit variance, no centering).
    """

    def __init__(self, encode_categorical, scale_numeric):
        self.encode_categorical = encode_categorical
        self.scale_numeric = scale_numeric
        # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and later
        # removed; try the new keyword first and fall back for older releases.
        try:
            self.encoder = OneHotEncoder(drop='first', sparse_output=False)
        except TypeError:
            self.encoder = OneHotEncoder(drop='first', sparse=False)
        self.sscaler = StandardScaler(with_mean=False, with_std=True)

    def fit(
        self,
        data: pd.DataFrame,
        categorical: List[str] = categorical,
        numerical: List[str] = numerical,
    ):
        """Fit the encoder on ``data[categorical]`` (cast to int) and the scaler on ``data[numerical]``.

        Returns ``self`` so calls can be chained.
        """
        self.encoder.fit(data[categorical].astype('int'))
        self.sscaler.fit(data[numerical])
        return self

    def transform(
        self,
        data: pd.DataFrame,
        categorical: List[str] = categorical,
        numerical: List[str] = numerical,
    ):
        """Return a preprocessed copy of ``data``; the input frame is not modified.

        Numerical columns are cast to float. When ``encode_categorical`` is set,
        the categorical columns are dropped and their one-hot dummy columns are
        appended at the end; otherwise they are cast to the ``category`` dtype.
        When ``scale_numeric`` is set, the numerical columns are scaled.

        NOTE: the encoder is created without an explicit ``handle_unknown``
        setting, so a category value that was not present during ``fit`` is
        expected to raise here.
        """
        # To be safe
        data = data.copy()

        data[numerical] = data[numerical].astype('float')

        if self.encode_categorical:
            # Cast exactly as in fit() so the encoder sees the same value type
            # it learned its categories from.
            encoded = self.encoder.transform(data[categorical].astype('int'))
            # `get_feature_names` was replaced by `get_feature_names_out` in
            # newer scikit-learn releases; support both.
            if hasattr(self.encoder, 'get_feature_names_out'):
                columns = self.encoder.get_feature_names_out(categorical)
            else:
                columns = self.encoder.get_feature_names(categorical)
            # Build the dummy block on the same index and attach it in one
            # concat instead of inserting the dummy columns one at a time.
            dummies = pd.DataFrame(encoded, columns=list(columns), index=data.index)
            data = pd.concat([data.drop(columns=categorical), dummies], axis=1)
        else:
            data[categorical] = data[categorical].astype('category')

        # handle numerical data
        if self.scale_numeric:
            data[numerical] = self.sscaler.transform(data[numerical])

        return data
        
def load_datasets(train_file, test_file):
    """Read the train and test CSVs and return preprocessed (train, validation, test) frames.

    Both files are read with their first column as the index. The training file
    is split 80/20 into train and validation parts using the module-level
    ``random_state``. A ``Preprocessor`` (encoding and scaling both enabled) is
    fitted on the train part only and then applied to all three parts.
    """
    labelled = pd.read_csv(train_file, index_col=0)
    held_out = pd.read_csv(test_file, index_col=0)

    fit_part, val_part = train_test_split(
        labelled,
        test_size=0.2,
        random_state=random_state,
    )

    # Statistics and categories come from the train part alone.
    pipeline = Preprocessor(encode_categorical=True, scale_numeric=True).fit(fit_part)

    # Same fitted pipeline for every split, in (train, validation, test) order.
    return tuple(pipeline.transform(part) for part in (fit_part, val_part, held_out))

# Input files used for every load below
TRAIN_FILE = '../input/datatrain/Data-train.csv'
TEST_FILE = '../input/datatest/Data-test.csv'

# Module exports
train, validation, test = load_datasets(
    train_file=TRAIN_FILE,
    test_file=TEST_FILE,
)

# Check that preprocessing steps are reproducible: a second, independent load
# must give exactly the same training frame.
foo = train
bar, *_ = load_datasets(
    train_file=TRAIN_FILE,
    test_file=TEST_FILE,
)

# `all(foo == bar)` would iterate over the column labels of the comparison
# frame (non-empty strings, always truthy) and could never fail, so compare
# the frames themselves.
assert foo.equals(bar), 'Preprocessing not reproducible!'

y = response
X = train.columns.drop(response)

# Further divide the train dataset 80-20 so validation is not touched until the end.
# Separate response and features.
DTy = train[response]
DTX = train.drop(columns=[response])

# Split into a fitting part and a held-out part of the training data.
DTX_train, DTX_test, DTy_train, DTy_test = train_test_split(
    DTX, DTy, test_size=0.2, random_state=random_state
)


In [None]:
# Baseline random forest regressor with default hyperparameters
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse

# Seeded with the notebook-wide random_state so the fit is repeatable.
RF = RandomForestRegressor(random_state=random_state)
RF.fit(DTX_train,DTy_train)

In [None]:
#Grid Search
from sklearn.model_selection import GridSearchCV

# NOTE: 'mse' / 'mae' are the criterion names of the scikit-learn release this
# notebook was written for; newer releases name them 'squared_error' /
# 'absolute_error'.
parameters = {
    'n_estimators': (10, 30, 50, 70, 90, 100),
    'criterion': ('mse', 'mae'),
    'max_depth': (3, 5, 7, 9, 10),
    # 'auto' removed: for a regressor it means "all features", the same as None,
    # so it only repeated a third of the grid.
    'max_features': (None, 'sqrt'),
    'min_samples_split': (2, 4, 6),
    'ccp_alpha': (0.01, 0.02, 0.03, 0.04),
}

RF_grid = GridSearchCV(
    # Seeded so that score differences come from the parameters, not from
    # run-to-run randomness of the forest.
    RandomForestRegressor(random_state=random_state),
    scoring='neg_mean_squared_error',
    param_grid=parameters,
    cv=5,
    n_jobs=-1,  # candidate fits are independent; use all cores
)

RF_grid_model = RF_grid.fit(DTX_train, DTy_train)

In [None]:
# Show the estimator with the best parameter combination found by the grid search
RF_grid_model.best_estimator_

In [None]:
# Best cross-validated score; the search uses 'neg_mean_squared_error', so this is a negated MSE
RF_grid_model.best_score_

In [None]:
# Final model with fixed hyperparameters (presumably copied from the grid search result -- confirm).
# Seeded with the notebook-wide random_state so the reported error is repeatable.
RF = RandomForestRegressor(max_depth=10, min_samples_split=4, n_estimators=50, random_state=random_state)
# RF = RandomForestRegressor(max_depth=7, min_samples_split=4, n_estimators=50,n_jobs=-1)
RF.fit(DTX_train,DTy_train)

In [None]:
#Evaluation time
# DTX_test / DTy_test are the 20% held out from the training frame, not the
# separate test file.

y_pred = RF.predict(DTX_test)
# Arguments in the documented (y_true, y_pred) order; the value is unchanged
# because squared error is symmetric.
print(f'Test : {mse(DTy_test, y_pred):.3f}')
# print(f'Train : {mse(DTy_train, RF.predict(DTX_train)):.3f}')