In [None]:
!pip install pycaret

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', 999)
import matplotlib.pyplot as plt
import seaborn as sns

from pycaret.regression import setup, compare_models, blend_models, finalize_model, predict_model, plot_model

import time

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
input_paths = (
    os.path.join(root, fname)
    for root, _, fnames in os.walk('/kaggle/input')
    for fname in fnames
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition training set and preview its first rows
train_df = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2021/train.csv')
train_df.head()

In [None]:
# Load the competition test set and preview its first rows
test_df = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2021/test.csv')
test_df.head()

In [None]:
# Load the sample submission; its `loss` column is overwritten with predictions later on
sub = pd.read_csv('/kaggle/input/tabular-playground-series-aug-2021/sample_submission.csv')
sub.head()

In [None]:
# Random peek at the data; seeded so the displayed rows are the same on every run
train_df.sample(5, random_state=42)

In [None]:
# Use `id` as the index so it is not treated as a feature.
# The guards make this cell safe to re-run: once `id` is the index it is no
# longer a column, and an unguarded set_index('id') would raise a KeyError.
if 'id' in train_df.columns:
    train_df = train_df.set_index('id')
X = train_df.drop(columns=['loss'])  # feature matrix: every column except the target
y = train_df[['loss']]  # target, kept as a single-column DataFrame

if 'id' in test_df.columns:
    test_df = test_df.set_index('id')


In [None]:
# Share of rows with and without a loss. The values are percentages,
# so the labels say "%" rather than "# of records".
no_loss_pct = (train_df.loss == 0).mean() * 100
loss_pct = (train_df.loss != 0).mean() * 100
print("% of records where no loss occurred", no_loss_pct)
print("% of records where loss occurred", loss_pct)

In [None]:
# Frequency of each distinct loss value (dropna=False also counts missing values, if any)
train_df.loss.value_counts(dropna=False)

In [None]:
# The notes below describe `loss` as integer-valued, so use one bin per integer
# (discrete=True) instead of automatic bin edges, and label the figure so it stands alone.
ax_loss = sns.histplot(data=y, x='loss', discrete=True)
ax_loss.set(title='Distribution of loss', xlabel='loss', ylabel='Number of records');

**y variable**
* 25% of the records incurred no losses, while 75% did
* This might be something we could explicitly add to our model, as a binary variable in order to enhance the differentiation between loss & no-loss
* Loss is also right-skewed (most values sit near zero, with a long tail toward larger losses); we could address this distribution as well
* our y variable is also an integer, so loss represents a specific number between 0 and 42

In [None]:
# Histogram of every feature on a 4-column grid, filled column by column
# (first column top to bottom, then the second column, and so on).
target_cols = X.columns
n_grid_cols = 4
# Ceiling division: a feature count that is not a multiple of 4 still fits on the grid
n_grid_rows = max(1, -(-len(target_cols) // n_grid_cols))
# squeeze=False keeps `ax` two-dimensional even when the grid has a single row
fig, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize=(12, 48), squeeze=False)

for idx, col_name in enumerate(target_cols):
    grid_col, grid_row = divmod(idx, n_grid_rows)
    sns.histplot(X[col_name], ax=ax[grid_row, grid_col])

# Hide the cells at the end of the grid that received no feature
for idx in range(len(target_cols), n_grid_rows * n_grid_cols):
    grid_col, grid_row = divmod(idx, n_grid_rows)
    ax[grid_row, grid_col].set_visible(False)

fig.tight_layout()
plt.show()


**Feature Distributions**
* Several features are clearly not normally distributed.
* This includes skewed data as well as multiple-peak distributions.
* Standardizing the data is not as helpful when distributions are not gaussian/normal.

In [None]:
# Hold out 33% of the rows for evaluation; the fixed random_state makes the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [None]:
# Creating a baseline model - linear regression
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train, y_train)
# Note: this score is computed on the training split, not on the held-out split
reg.score(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error

# Predict on the held-out split and round to match the integer loss format
y_pred_base = reg.predict(X_test)
y_pred_base_rnd = y_pred_base.round()
print(y_pred_base_rnd)

# RMSE as sqrt(MSE). The `squared=False` keyword was deprecated in scikit-learn 1.4
# and removed in later releases; this form works on old and new versions alike.
print(np.sqrt(mean_squared_error(y_test, y_pred_base_rnd)))

**Baseline Model**
* The R^2 is very low, at 0.01, meaning our model doesn't do a great job at capturing the relationship between our X variables and y
* This is expected given the previous distribution analysis, since we're looking at non-linear relationships
* RMSE - Root Mean Squared Error for our baseline model is 7.9 - this will be our baseline evaluation metric moving forward


In [None]:
# from scipy import stats

# shapiro_scores = {}
# for var in target_cols:
#     score = stats.shapiro(X[var])
#     shapiro_scores[var] = score[1]  # p-value (score[0] is the W statistic, not the p-value)
    
# shapiro_df = pd.DataFrame.from_dict(shapiro_scores, orient='index').reset_index()
# shapiro_df.columns = ['feature', 'shapiro_p_value']

# shapiro_df['test_result'] = np.where(shapiro_df.shapiro_p_value <= 0.05, 'Non-Normal', 'Normal')  # p <= 0.05 rejects normality

# shapiro_df.test_result.value_counts(dropna=False) # none of the features are normally distributed

In [None]:
def pycaret_model(train, target, test, n_select, fold, opt, session_id=42):
    """Compare, blend and finalize PyCaret regressors, then predict on ``test``.

    Steps: run ``setup`` on ``train``, rank candidate models with
    ``compare_models`` (xgboost excluded), plot feature importance of the top
    model, blend the selected models, finalize the blend and predict on ``test``.

    Parameters
    ----------
    train
        Training data passed to ``setup`` as ``data``; it must contain the
        ``target`` column.
    target
        Name of the target column.
    test
        Data passed to ``predict_model`` for the final predictions.
    n_select
        Number of top-ranked models to keep from ``compare_models``.
    fold
        Fold setting forwarded to ``compare_models`` and ``blend_models``.
    opt
        Metric name used both to sort the comparison and to optimize the blend.
    session_id
        Seed forwarded to ``setup`` so repeated runs give the same results.
        Pass ``None`` to let PyCaret pick a random seed.

    Returns
    -------
    The ``Label`` column of the frame returned by ``predict_model`` for ``test``.
    """
    print('Setup Your Data....')
    setup(data=train,
          target=target,
          numeric_imputation='mean',
          session_id=session_id,
          silent=True)

    print('Comparing Models....')
    best = compare_models(sort=opt, n_select=n_select, fold=fold, exclude=['xgboost'])
    # compare_models returns a bare estimator rather than a list when n_select == 1;
    # normalize so indexing and blending below work for every n_select.
    top_models = best if isinstance(best, list) else [best]

    print('Here is Best Model Feature Importances!')
    plot_model(estimator=top_models[0], plot='feature')

    print('Blending Models....')
    blended = blend_models(estimator_list=top_models, fold=fold, optimize=opt)
    # Called without data, so PyCaret scores the blend on its hold-out split;
    # only the displayed metrics are of interest, the returned frame is not needed.
    predict_model(blended)

    print('Finallizing Models....')
    final_model = finalize_model(blended)
    print('Done...!!!')

    pred_esb = predict_model(final_model, test)
    return pred_esb['Label']

In [None]:
# Train the blended model and predict `loss` for the test set.
# The model is fit on the raw `loss` column (no log transform is applied anywhere
# in this notebook), so predictions are already on the original scale; passing them
# through exp(x) - 1 would inflate every value.
test_pred = pycaret_model(train_df, 'loss', test_df, 5, 3, 'RMSLE')

# Assign by position rather than by index: `sub` has a default integer index, and a
# Series carrying a different index would be aligned on labels and could yield NaN.
# NOTE(review): assumes sample_submission.csv lists ids in the same order as test.csv - confirm.
assert len(test_pred) == len(sub), "prediction count does not match the sample submission"
sub['loss'] = np.asarray(test_pred)

In [None]:
# Write the submission file to the working directory, without the DataFrame index
sub.to_csv('submission.csv', index=False)