# Problem: Predict the lowest price
Competition: Hackerearth<br>
Problem Statement: https://www.hackerearth.com/problem/machine-learning/predict-the-lowest-price-8-9ffabe00/<br>
Author: Pinaki Brahma<br>
> Performance: 99.83% accuracy on final submission (Top 5 %ile)<br>
> Methodology: Python-based solution. A fastai approach to tabular data, powered by GPU

# Set Up Environment

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# List every file found under the Kaggle input directory, one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Load necessary packages

In [None]:
from fastai.tabular import *
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides pandas/fastai deprecation warnings — confirm that is intended.
warnings.filterwarnings("ignore")

from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one; cells with several bare expressions rely on this.
InteractiveShell.ast_node_interactivity = 'all'

### Load datasets

In [None]:
# Training data (contains the target column Low_Cap_Price) and the test data to predict on.
df = pd.read_csv('/kaggle/input/Train.csv')
X_test = pd.read_csv('/kaggle/input/Test.csv')
# Show the last 10 training rows as a quick sanity check.
df.tail(10)

In [None]:
#### Separate the dependent & the independent variables

In [None]:
# Features: every column of the training frame except the target.
X = df.drop(columns=['Low_Cap_Price'])
# Target: kept as a one-column DataFrame (it is joined back onto X further down).
y = df['Low_Cap_Price'].to_frame()

# Row/column counts of both pieces.
X.shape, y.shape

#### Feature Design Section

In [None]:
# Date related features are added to the dataset.
# The categorical column list defined below expects Month, Week, Day, Dayofweek
# and Dayofyear to exist after these calls. drop=False keeps the original
# "Date" column as well. The same call is applied to train and test features
# so both frames end up with the same derived columns.
add_datepart(X, "Date", drop=False)
add_datepart(X_test, "Date", drop=False)

In [None]:
# Inspect the column names to confirm the derived date columns were added.
X.columns

### Categorical & Continuous variables

In [None]:
# Column roles for the tabular model.

# Regression target.
dep_var = ['Low_Cap_Price']

# Automatic split, kept for reference:
# cont_names, cat_names = cont_cat_split(df=X, max_card=6, dep_var=dep_var)

# Numeric inputs, fed to the network as continuous values.
cont_names = [
    'Demand',
    'High_Cap_Price',
]

# Inputs treated as categories: the original categorical columns followed by
# the date parts derived from "Date".
cat_names = (
    ['State_of_Country', 'Market_Category', 'Product_Category', 'Grade']
    + ['Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear']
)

dep_var, cont_names, cat_names

### Explicitly Scaling features | if necessary

In [None]:
# from sklearn.preprocessing import StandardScaler

## scale continuous features
# X_cont = X[cont_names]
# print(X_cont.describe())

# sc_X = StandardScaler()
# X_cont = pd.DataFrame(sc_X.fit_transform(X_cont))
# X_test_cont = pd.DataFrame(sc_X.transform(X_test[cont_names]))
# X_cont.columns = cont_names
# X_test_cont.columns = cont_names

# sc_y = StandardScaler()
# y_sc = pd.DataFrame(sc_y.fit_transform(y))
# y_sc.columns = dep_var

# print(X_cont.head())
# print(y_sc[0:5])
# print(X_test_cont.head())

## --- merge categorical features back ---
# X_all = pd.concat([X[cat_names], X_cont], axis = 1)
# X_test_all = pd.concat([X_test[cat_names], X_test_cont], axis = 1)
# print(X_all.head())
# print(X_test_all.head())
# X_all.shape, X_test_all.shape

## --- alternate way to scale only dependent variable ---
# max_price =max(df['Low_Cap_Price'])
# min_price =min(df['Low_Cap_Price'])
# df['Low_Cap_Price'] = df['Low_Cap_Price'].apply(lambda x: (x-min_price)/(max_price-min_price))

### Declare how you want to 
* Handle Missing Values
* Handle Categorical Features
* Normalize Continuous Features<br>

fastai does this for us; we only need to list the preprocessing steps (`procs`).

In [None]:
# Earlier variable choices, kept for reference (the active definitions are set in the cell above):
# dep_var = 'Low_Cap_Price'
# cat_names = ['State_of_Country', 'Market_Category', 'Product_Category', 'Grade','Month', 'Dayofweek']
# cont_names = ['Demand', 'High_Cap_Price']

# Preprocessing steps handed to fastai when the data is built: handle missing
# values, encode categorical features, normalize continuous features.
procs = [FillMissing, Categorify, Normalize]

In [None]:
# Variant that used the explicitly scaled features (see the commented-out scaling cell):
#df_all = pd.concat([X_all, y], axis = 1)
# Join the target column back onto the features, side by side, so that
# features and label live in one DataFrame.
df_all = pd.concat([X, y], axis = 1)
# Plain alias, not a copy: X_test_all and X_test refer to the same DataFrame.
X_test_all = X_test
print(df_all.shape)
df_all.head()

### Creating databunch
#### This includes both independent features as well as dependent features
#### This also takes care of pre-processing the validation & the test inputs if included

In [None]:
# Test items, built with the same column roles as the training data. They are
# attached to the databunch below through add_test().
test = TabularList.from_df(X_test_all, 
                           cat_names = cat_names, 
                           cont_names = cont_names, procs = procs)

# Training / validation data:
#   * 10% of the rows are held out at random for validation. The split is
#     seeded so that every run validates on the same rows, which keeps the
#     validation metric comparable between runs and hyper-parameter changes.
#   * The label is a float; log = True means the model is trained on the log
#     of Low_Cap_Price, so predictions must go through exp() afterwards.
data = (TabularList.from_df(df_all, cat_names = cat_names, cont_names = cont_names, procs = procs)
                           .split_by_rand_pct(0.10, seed = 42) # .split_none()
                           .label_from_df(cols = dep_var, label_cls = FloatList, log = True)
                           .add_test(test)
                           .databunch()) #bs = 1024

In [None]:
# Sanity check: number of items in the test set and in the training set of the databunch.
len(data.test_ds)
len(data.train_ds)

In [None]:
# Explore the data: display 5 rows from a batch of the databunch.
data.show_batch(rows = 5)

### Some Tweaks before Model Training
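In regression problems the network output is bounded by `y_range`, so we set its upper bound (`max_y`) somewhat above the largest target value seen in training (here 20% higher, in log space because the target is log-transformed).<br>
Otherwise, y_pred would always lie within the existing range of y.<br>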

In [None]:
# Upper bound for predictions, in log space: log of 120% of the highest training price.
max_y = np.log(1.2 * df_all['Low_Cap_Price'].max())
# [lower, upper] bounds passed to the learner, created on fastai's default device.
y_range = torch.tensor([0, max_y], device=defaults.device)
y_range

### Initializing Tabular_Learner Model
#### Set necessary parameters including layers, nodes, regularization, etc.

In [None]:
# Build the tabular network: two hidden layers (1000 and 500 units), a small
# dropout on each layer and on the embeddings, and outputs limited to y_range.
# The metric and the graph callback are passed through to fastai unchanged.
learn = tabular_learner(
    data,
    layers=[1000, 500],
    ps=[0.001, 0.01],
    emb_drop=0.04,
    y_range=y_range,
    metrics=exp_rmspe,
    callback_fns=ShowGraph,
)

In [None]:
# Run the learning-rate finder, then plot its recorded results;
# suggestion=True asks the plot to include a suggested learning rate.
learn.lr_find()
learn.recorder.plot(suggestion=True)

In [None]:
# NOTE(review): lr and mod_name are not used by any active code visible here
# (lr appears only in commented-out fit calls; the model is saved as '1').
lr =1e-3
mod_name="stage1"
# Train for 12 epochs with the one-cycle policy, max_lr = slice(1e-2) and weight decay 0.2.
# NOTE(review): the previous comment here read "smaller rate with smaller steps",
# but 1e-2 is larger than lr above — confirm which was intended.
learn.fit_one_cycle(12, max_lr = slice(1e-2), wd = 0.2)

In [None]:
#  higher learn-rate 
#learn.fit_one_cycle(4, lr)

# smaller rate 
#learn.fit_one_cycle(6, lr/20)

# Plot the losses recorded by the learner during the training run above.
learn.recorder.plot_losses()

In [None]:
# Save the current state of the learner under the name '1' so it can be reloaded later.
learn.save('1')

In [None]:
# smaller rate 
#learn.fit_one_cycle(6, lr/20)

In [None]:
#df_final = pd.DataFrame(columns=['Item_Id', 'Low_Cap_Price'])

#for index, row in X_test.iterrows():
 # df_final.loc[index] =[row['Item_Id'],float(learn.predict(row)[1])]


### Using the above model to predict on the test cases
#### Post-processing of the results (e.g. `exp(results)`) is applied when the model was trained on log(y)
#### submission output file is updated

In [None]:
# Predict on the test set; only the first element returned by get_preds
# (the predictions) is used.
test_predictions = learn.get_preds(ds_type=DatasetType.Test)[0]
# Each prediction row holds a single value; collect those values into a
# one-column DataFrame named after the target.
test_predictions = pd.DataFrame(
    {'Low_Cap_Price': [row[0] for row in test_predictions.tolist()]}
)
# The labels were created with log = True, so undo the log to get prices.
test_predictions = np.exp(test_predictions)
test_predictions.head()
#predictions.to_excel("Fast_ai_solution.xlsx", index = False)

#preds, _ = learn.get_preds(ds_type=DatasetType.Test) 
#labels = np.argmax(preds, 1)
#test_predictions_direct = [data.classes[int(x)] for x in labels]

In [None]:
# Re-read the raw test file as the basis for the submission; X_test has had
# date-derived columns added to it earlier in the notebook.
submission_file = pd.read_csv('/kaggle/input/Test.csv')
submission_file.head()

In [None]:
# Keep only the identifier column; the predicted price column is added in the next step.
submission_file = submission_file[['Item_Id']]
#submission_file.Low_Cap_Price = na
submission_file.head()

In [None]:
# Attach the predicted prices to the identifiers.
# NOTE(review): pandas aligns this assignment on index labels, so it relies on
# both frames sharing the same row index (row i of Test.csv <-> prediction i) — confirm.
submission_file['Low_Cap_Price'] = test_predictions.Low_Cap_Price
submission_file.head()

In [None]:
# Write the submission (Item_Id, Low_Cap_Price) to a CSV in the current directory, without the index column.
submission_file.to_csv("hacker-earth_pricePrediction_op_v2.csv", index = False)