# <font color='#9575cd'>1. Importing Packages and Data</font> 

### <font color="#3396c">Importing Packages</font>

In [None]:
import os
import gc
import warnings

import datetime

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

# Silence every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to subsequent plots.
sns.set_style('whitegrid')

### <font color="#3396c">Notebook Constants</font>

In [5]:
# Directory that holds the competition files read below and receives the
# submission CSV. The ELO_DATA_DIR environment variable overrides it so the
# notebook can run on another machine; the default is the original local path.
folderPath = os.environ.get('ELO_DATA_DIR', 'D:/Competitions/Data/EloMerchant')

In [6]:
# Bind each working name to its own empty DataFrame so the names exist
# before any data is loaded.
train, test, data, hist_transactions = (pd.DataFrame() for _ in range(4))

### <font color="#3396c">Loading Data</font>

In [3]:
# Read the zipped train CSV from the data folder.
train = pd.read_csv(
    os.path.join(folderPath, 'train.csv.zip'),
    compression='zip',
)

In [4]:
# Read the zipped test CSV from the data folder.
test = pd.read_csv(
    os.path.join(folderPath, 'test.csv.zip'),
    compression='zip',
)

In [None]:
# Read the zipped historical transactions CSV from the data folder.
hist_transactions = pd.read_csv(
    os.path.join(folderPath, 'historical_transactions.csv.zip'),
    compression='zip',
)

In [7]:
# Stack train and test into one frame so shared preprocessing is applied once.
# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# the row labels of both inputs are kept and `data` ends up with duplicate
# index labels.
data = pd.concat([train, test], ignore_index=True)

In [8]:
# Rebind both split names to empty frames (dropping the previously bound
# objects) and then trigger a garbage-collection pass. The collect() call is
# kept as the last expression so the cell still displays its return value.
train, test = pd.DataFrame(), pd.DataFrame()
gc.collect()

14

# <font color='#9575cd'>2. Data Exploration and Manipulation</font>

### <font color="#00b8d4">2.1 Missing value treatment</font>

### <font color="#00b8d4">2.2 Outlier Detection and treatment</font>

### <font color="#00b8d4">2.3 Oversampling the minority class</font>

### <font color="#00b8d4">2.4 Data type assignment</font>

##### <font color="#6497b1">Train and Test</font>

In [None]:
# Convert the 'first_active_month' column to datetimes, parsing with the
# explicit year-month format.
data['first_active_month'] = pd.to_datetime(
    data['first_active_month'],
    format='%Y-%m',
)

In [None]:
# Whole days between each card's first active month and the fixed reference
# date 2018-02-01. Subtracting the datetime64 column from a Timestamp yields a
# timedelta64 Series directly, so `.dt.days` is always available; the previous
# form went through `.dt.date` (object dtype), which newer pandas versions may
# not convert back to timedelta64. Missing months (NaT) produce NaN.
data['elapsed_time'] = (pd.Timestamp(2018, 2, 1) - data['first_active_month']).dt.days

In [None]:
# Downcast the three feature columns to 8-bit integers in a single astype call.
data = data.astype(dict.fromkeys(['feature_1', 'feature_2', 'feature_3'], np.int8))

### <font color="#00b8d4">2.5 Label Exploration</font>

# <font color='#9575cd'>3. Feature Engineering</font>

### <font color="#006064">3.1. Creating Features</font>

In [20]:
# `train` and `test` were replaced by empty frames after being merged into
# `data`, and the datetime conversion was applied to `data` only. Rebuild both
# splits from `data` here; otherwise the column lookups below raise KeyError on
# a fresh top-to-bottom run.
# NOTE(review): assumes the test rows are exactly those whose 'target' is NaN
# after the concat (i.e. test.csv carries no 'target' column) — confirm.
is_labelled = data['target'].notna()
train = data[is_labelled].copy()
test = data[~is_labelled].copy()

# Encode the first active month as a YYYYMM number (year * 100 + month).
# The vectorised .dt accessors replace the per-row lambda; NaT becomes NaN.
train['started'] = train['first_active_month'].dt.year * 100 + train['first_active_month'].dt.month
test['started'] = test['first_active_month'].dt.year * 100 + test['first_active_month'].dt.month

In [23]:
# Replace missing 'started' values in test with the largest observed value.
# The result is assigned back to the column: calling fillna(inplace=True) on
# the `test['started']` selection is a chained in-place update, which pandas
# warns about and which does not modify `test` under Copy-on-Write.
test['started'] = test['started'].fillna(test['started'].max())

In [29]:
# Current year-month as a YYYYMM integer. Uses the standard-library datetime
# module (the `pd.datetime` alias no longer exists in pandas 2.x) and reads the
# clock once so year and month always come from the same instant.
now = datetime.datetime.now()
currentdate = now.year * 100 + now.month

# Turn 'started' into "current YYYYMM minus start YYYYMM". Both operands are
# YYYYMM-encoded, so this is an ordering-preserving offset rather than a true
# count of calendar months.
train['started'] = currentdate - train['started']
test['started'] = currentdate - test['started']

### <font color="#006064">3.2. Removing Redundant Columns</font>

In [30]:
# Remove the 'first_active_month' column from both splits.
train = train.drop(columns='first_active_month')
test = test.drop(columns='first_active_month')

# <font color='#9575cd'>4. Machine Learning Modelling</font>

### <font color="#006064">4.1. Data Pre-processing</font>

##### <font color="#0097a7">4.1.1 Imputation</font>

##### <font color="#0097a7">4.1.2 Labelling of Categorical variables</font>

##### <font color="#0097a7">4.1.3 One Hot Encoder</font>

##### <font color="#0097a7">4.1.4 Scaling</font>

### <font color="#006064">4.2 Creating Baseline Model </font>

In [31]:
# Name of the column the model is fitted against.
target = 'target'

# Columns passed to the model as inputs.
feature = [
    'feature_1',
    'feature_2',
    'feature_3',
    'started',
]

In [32]:
from sklearn.ensemble import RandomForestRegressor

In [33]:
# Baseline random forest: 50 trees, depth capped at 10, seeded for repeatable fits.
regr = RandomForestRegressor(
    n_estimators=50,
    max_depth=10,
    random_state=50,
)

In [35]:
# Fit the baseline model on the selected feature columns against the target column.
regr.fit(X=train[feature], y=train[target])

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=1,
           oob_score=False, random_state=50, verbose=0, warm_start=False)

### <font color="#006064">4.3 Model Selection</font>

### <font color="#006064">4.4 Model Optimization</font>

### <font color="#006064">4.5 Implementation</font>

# <font color='#9575cd'>5. Investigating Predictions/Evaluation</font>

# <font color='#9575cd'>6. Submission</font>

In [36]:
# Store the fitted model's predictions for the test features in the target column.
test[target] = regr.predict(X=test[feature])

In [54]:
# Write card_id/target pairs to a timestamped submission CSV in the data folder.
# The timestamp comes from the standard-library datetime module because the
# `pd.datetime` alias no longer exists in pandas 2.x.
timestamp = datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
submission_path = os.path.join(folderPath, f'submission_rfr_{timestamp}.csv')
test.loc[:, ['card_id', 'target']].to_csv(submission_path, index=False)