## Contents

1. [Purpose](#purpose)
2. [EDA](#eda)
3. [Model-building](#model)

<a id='purpose'></a>

### Purpose

The purpose of this notebook is to serve as a template for quick exploratory data analysis (EDA) and model-building.

In [None]:
from __future__ import division, print_function, absolute_import

import logging
import re
import warnings

import dill # to save an instance of your notebook including any built models
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb # used by the pairplot and heatmap cells in the EDA section

# Render floats with three decimals so they are not displayed in scientific
# notation when unnecessary.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# IPython magic: set the Application log level to ERROR.
%config Application.log_level = "ERROR"

# Show each matching warning only the first time it occurs, to stop warnings
# from showing repeatedly.
warnings.filterwarnings(action='once')

def snakify(column_name):
    '''
    Function to convert pandas column names into snake case.

    Handles CamelCase / mixedCase names as well as names that already contain
    separators: runs of whitespace, hyphens and dots become a single
    underscore, surrounding whitespace is stripped, and repeated underscores
    are collapsed (so "Address_Id" gives "address_id", not "address__id").

    column_name: the column label; non-string labels are converted with str().

    Returns the lower-cased snake_case string.
    '''
    name = str(column_name).strip()
    # normalise explicit separators first so they cannot survive into the result
    name = re.sub(r'[\s\-.]+', '_', name)
    # split before a capitalised word: "ResponseCode" -> "Response_Code"
    s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    # split between a lower-case letter/digit and a capital: "customerID" -> "customer_ID"
    s2 = re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1)
    # the first pass inserts "_" even after an existing "_"; collapse the duplicates
    return re.sub('_{2,}', '_', s2).lower()


data = pd.read_csv("data.csv") # create a pandas DataFrame from your data
data.columns = [snakify(col) for col in data.columns]

# Name of the identifier column (e.g. address_id, customer_id), written in its
# snake-cased form. Leave it as '' when the data has no such column.
key = ''
if key:
    # identifiers are labels, not quantities, so store them as object instead of int;
    # only attempted when a key is set, because data[''] would raise a KeyError
    data[key] = data[key].astype("object")

data.head()

<a id='eda'></a>

### Exploratory Data Analysis

In [None]:
# column dtypes, non-null counts and memory usage of the frame
data.info()

In [None]:
# summary statistics (pandas describes the numeric columns by default)
data.describe()

In [None]:
# checking how pervasive nulls are in each of the variables...

# fraction of missing rows per column, computed once for the whole frame
null_share = data.isnull().mean()

# Any column with at least one null is reported. Comparing the raw share with 0
# (rather than a rounded, int-truncated percentage) keeps columns with only a
# handful of nulls from being reported as fully covered.
incomplete = null_share[null_share > 0]

for col, share in incomplete.items():
    print(col, 'has', round((1 - share) * 100, 2), 'per cent coverage.')

counter = len(incomplete) # number of columns that contain nulls

if counter == 0:
    print('All features are at 100% coverage.')

In [None]:
# checking for zero-heavy variables...

for col in data.columns:
    zero_rows = len(data[col][data[col] == 0])
    zero_pct = round(zero_rows / len(data) * 100, 2)
    # int() truncates, so a column is listed only when its rounded share of
    # zeroes reaches 1 per cent
    if int(zero_pct) == 0:
        continue
    print(zero_pct, 'per cent of', col, 'are zeroes')

In [None]:
target = '' # column treated as the target in the cells below; fill in before running

not_used = [] # columns to leave out of the feature list
# every remaining column is a feature, kept in the frame's column order
excluded = set([key, target]).union(not_used)
features = [col for col in data.columns if col not in excluded]

In [None]:
samp = 0.05 # fraction of rows to plot: 5 percent of whole data

# box plots need numbers, so only the numeric features are drawn
for feat in data[features].select_dtypes(include='number').columns:
    # sample(frac=...) takes a fraction; sample(n) only accepts an integer row count.
    # data[[feat]] keeps a one-column DataFrame because boxplot() is a
    # DataFrame method and is not available on a Series.
    data[[feat]].sample(frac=samp).boxplot(figsize=(10, 6))
    plt.show()

In [None]:
# plotting histograms of each of the variables...

# sample(frac=...) draws the given fraction of rows; passing samp * len(data)
# as n fails because n must be an integer
data[features].sample(frac=samp).hist(figsize=(10, 8), bins=50)
plt.show()

In [None]:
# create pairplots of the data...

lst = list(features)
if target not in lst:
    lst.append(target)
corr = data[lst].corr()
# correlation of every other variable with the target; the target's own entry
# is removed by label, so the result does not depend on where it sits in lst
print(corr[target].drop(target).sort_values(ascending=False))

# sample(frac=...) draws the given fraction of rows; n would have to be an integer
sb.pairplot(data[lst].sample(frac=samp))
plt.show()

In [None]:
# plotting a heatmap of variable correlations...

cmap = sb.diverging_palette(250, 10, as_cmap=True)
fig, ax = plt.subplots(figsize=(15, 15))

# annotate every cell and label both axes with the column names of corr
tick_labels = corr.columns
heatmap_options = dict(
    cmap=cmap,
    annot=True,
    xticklabels=tick_labels,
    yticklabels=tick_labels,
    linewidths=.25,
    ax=ax,
)
sb.heatmap(corr, **heatmap_options)

plt.show()

<a id='model'></a>

### Model-Building

In [None]:
# simple linear regression and feature scaling
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler # use robust to better handle outliers
from sklearn.metrics import mean_squared_error, mean_absolute_error

# if you're building a basic neural network...
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, BatchNormalization


def mean_absolute_percentage_error(y_true, y_pred):
    '''
    Mean absolute percentage error (MAPE), expressed in per cent.

    Both inputs are converted to float arrays and squeezed, so a flat target
    vector and an (n, 1) column of predictions are compared element by element
    instead of being silently broadcast into an (n, n) grid.

    y_true: array-like of observed values. Entries equal to zero make the ratio
        undefined and lead to inf or nan in the result.
    y_pred: array-like of predicted values, matching y_true once squeezed.

    Returns the mean of |y_true - y_pred| / |y_true|, multiplied by 100.
    '''
    y_true = np.squeeze(np.asarray(y_true, dtype=float))
    y_pred = np.squeeze(np.asarray(y_pred, dtype=float))
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100


def mean_bias_error(y_true, y_pred):
    '''
    Mean bias error: the average of (y_true - y_pred).

    A positive value means the predictions are, on average, below the
    observed values; a negative value means they are above.
    '''
    residuals = np.array(y_true) - np.array(y_pred)
    return np.mean(residuals)