# Import modules

In [100]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# 1. Get the data

## 1.1 Download the data
In order to obtain the data I use the kaggle API, in the following link there is a guide to initial the kaggle API - [kaggle API](https://github.com/Kaggle/kaggle-api).


In [6]:

"""
Get the data with kaggle API
"""

filename: str = 'tabular-playground-series-aug-2021'
    
def get_data_from_kaggle_with_API(filename):
    """
    this function checks if you have downloaded the data to the current path.
    If yes it checks if the data is unzipped and the 'test.csv' file exists.
    If no it unzips the downloaded zip data and then delete the zipped file from the current path 
    """
    if not os.path.isfile(filename + '.zip') and not os.path.isfile('test.csv'):
        os.system("kaggle competitions download -c " + filename)

    if not os.path.isfile('test.csv') and os.path.isfile(filename + '.zip'):
        os.system('unzip ' + filename + '.zip')
        os.system('rm tabular-playground-series-aug-2021.zip')

    if os.path.isfile('test.csv') and os.path.isfile(filename + '.zip'):
        os.system('rm tabular-playground-series-aug-2021.zip')

get_data_from_kaggle_with_API(filename)

## 1.2 Load the data to pandas dataframe

In [7]:
df_train: pd.DataFrame = pd.read_csv('train.csv')  # read data

## 1.3 Split to features(X) and lable (y)

In [87]:
data_train: np.ndarray = df_train.values
X: np.ndarray = data_train[:, 1:-1]
y: np.ndarray = data_train[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=217)

# 2. EDA

## 2.1 features type

In [88]:
df_train.info(verbose = True, null_counts = False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 102 columns):
 #   Column  Dtype  
---  ------  -----  
 0   id      int64  
 1   f0      float64
 2   f1      int64  
 3   f2      float64
 4   f3      float64
 5   f4      float64
 6   f5      float64
 7   f6      float64
 8   f7      float64
 9   f8      float64
 10  f9      float64
 11  f10     float64
 12  f11     float64
 13  f12     float64
 14  f13     float64
 15  f14     float64
 16  f15     float64
 17  f16     int64  
 18  f17     float64
 19  f18     float64
 20  f19     float64
 21  f20     float64
 22  f21     float64
 23  f22     float64
 24  f23     float64
 25  f24     float64
 26  f25     float64
 27  f26     float64
 28  f27     int64  
 29  f28     float64
 30  f29     float64
 31  f30     float64
 32  f31     float64
 33  f32     float64
 34  f33     float64
 35  f34     float64
 36  f35     float64
 37  f36     float64
 38  f37     float64
 39  f38     float64
 40  f

All the features are numeric

## 2.2

In [89]:
df_train.isna().sum().sum()

0

That is unpopular way to check NaN values but it work fast.

There aren't any NaN Values

## 2.3 Correlation

All the features are continues so I checked pearson correaltion 

The pearson correlation between $X_i$ and $X_j$ is:

$ \rho_{i,j} = \frac{\sum(X_i-\bar{X_i})(X_j-\bar{X_j})}{\sqrt{\sum(X_i-\bar{X_i})^2}\sqrt{\sum(X_j-\bar{X_j})^2}}$

In [90]:
corr = df_train.iloc[:,1:-1].corr(method='pearson')

any(abs(corr.where(corr > 0.2)).sum() > 1)



False

There aren't any two features s.t $ |\rho_{i,j}| > 0.2$ 

# 3. Models

In this section I'll addapt 2 machine learning algorithms:

1) Random forest

2) Ridge regression

## 3.1 Random forest

In this sub section I'll create a grid of hyperparameters and use $4Fold-CV$ to optimize/tune the hyperparameters

In [92]:

"""
Random Forest Model
"""

# Number of features to consider at every split
max_features = [int(x) for x in np.linspace(start=3, stop=99, num=10)]
# Minimum number of samples required to split a node
min_samples_split = [5, 10]



# Create the grid
grid = {'max_features': max_features,
        'min_samples_split': min_samples_split,
        }

# define model
rf = RandomForestRegressor()
# define search
grid_search_rf = GridSearchCV(estimator=rf,
                            param_grid=grid,
                            scoring='neg_root_mean_squared_error',
                            cv=3,
                            verbose=2,
                            n_jobs=-1)
# perform the search
results = grid_search_rf.fit(X_train, y_train)
"""
 Get the paramters that gives the best RMSE
"""

best_model_rf = grid_search_rf.best_estimator_
best_model_rf.fit(X_train, y_train)

loss_rf = best_model_rf.predict(X_test)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


# 3.2 Ridge regression

In [106]:
ridge = Ridge()
grid = dict()
grid['alpha'] = [x for x in np.linspace(start=0, stop=4, num=100)]

grid_search_ridge = GridSearchCV(estimator=ridge,
                            param_grid=grid,
                            scoring='neg_root_mean_squared_error',
                            cv=3,
                            verbose=2,
                            n_jobs=-1)

results = grid_search_ridge.fit(X_train, y_train)
best_model_ridge = grid_search_ridge.best_estimator_
best_model_ridge.fit(X_train, y_train)

loss_ridge = best_model_ridge.predict(X_test)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


  overwrite_a=True).T
  overwrite_a=True).T


# 4. Models evaluation
In this section I will evaluate each model with some different scores.

And then I'll choose the best model

In [107]:
MSE_rf =  mean_squared_error(loss_rf,y_test)
R2_rf = r2_score(loss_rf,y_test)

MSE_ridge = mean_squared_error(loss_ridge,y_test)
R2_ridge = r2_score(loss_ridge,y_test)

print("Random Forest MSE: ", MSE_rf)
print("Random Forest R2: ", R2_rf)

print("ridge MSE: ", MSE_ridge)
print("ridge R2: ", R2_ridge)

Random Forest MSE:  63.065875557639544
Random Forest R2:  -71.3535399282651
ridge MSE:  62.35700825673176
ridge R2:  -79.38128478119711


# 5. Provide predictions with the best model

In [109]:
"""
Predictions to test set
"""


df_test: pd.DataFrame = pd.read_csv('test.csv')  # read data

data_test: np.ndarray = df_test.values
X_test: np.ndarray = data_test[:, 1:]  
    
loss_ridge = best_model_ridge.predict(X_test)
prediction: pd.DataFrame = pd.DataFrame({
    'id': df_test.values[:, 0].astype(int),
    'loss': loss_ridge
})

prediction.to_csv('prediction.csv', index=False)

In [110]:
prediction.head()

Unnamed: 0,id,loss
0,250000,7.353264
1,250001,5.366966
2,250002,7.727362
3,250003,6.62115
4,250004,7.402665
