# Prerequisites

## Loading Libraries

In [3]:
pip install voila



You should consider upgrading via the 'd:\github\personal_nb\scripts\python.exe -m pip install --upgrade pip' command.





In [44]:
!pip install ml-package

Collecting ml-package
  Using cached ml_package-0.0.8.tar.gz (3.6 kB)
Using legacy setup.py install for ml-package, since package 'wheel' is not installed.
Installing collected packages: ml-package
    Running setup.py install for ml-package: started
    Running setup.py install for ml-package: finished with status 'done'
Successfully installed ml-package-0.0.8


You should consider upgrading via the 'd:\github\personal_nb\scripts\python.exe -m pip install --upgrade pip' command.


In [1]:
import ml_package as ml
from ml_package import pd

In [1]:
# # Basic

# import warnings
# warnings.filterwarnings('ignore') # supress warnings

import logging
logger = logging.getLogger() # Used for creating logs outside a notebook
logger.setLevel(logging.CRITICAL)

import random
random.seed(4) # Creating a random seed

from pytz import timezone
india_timezone = timezone('Asia/Kolkata') # Getting countries timezone

import os

# # Markdown

from IPython.display import HTML, display, Markdown, clear_output


# # Data Manipulation and display

# import pandas as pd
# pd.set_option('display.max_columns', None) # Displaying all the columns in the dataset
# pd.set_option('max_colwidth', None) # Displaying entire contents of columns
# pd.set_option("max_rows", 100) # Displaying all rows
# pd.set_option('max_seq_item', None) # Displaying everything in the list in the dataframe
# pd.set_option('precision', 2) # Displaying precision to just 2 decimal places

import numpy as np

# Modelling

from sklearn.ensemble import RandomForestClassifier # Random forest for classification

from boruta import BorutaPy # Boruta for feature selection


# Graph generation

import plotly.graph_objects as go

from plotly.subplots import make_subplots # Subplots are a form of stacked plots one over the other

## Loading dataset

In [26]:
# Loading the iris dataset through the ml_package reader
# NOTE(review): `col`, `row` and `type` presumably select the columns, limit the number of rows
# and set the column dtypes - confirm against the ml_package documentation
dataset = ml.read(path = "datasets/iris.csv",sep = ',', col = ['PetalWidthCm','Species'],
                  row = 100, type = {'PetalWidthCm': str})

In [45]:
# Read the full iris CSV and discard the 'Id' column in a single chained expression
dataset = pd.read_csv("datasets/iris.csv").drop(columns=['Id'])

# Markdown

## Printing basic markdown text with python code

In [None]:
# The Markdown object is the last expression of the cell, so the notebook renders it as the cell output
Markdown('<span style="color:red; font-style: italic; font-size: 25px">Test for <b>Markdown</b> code</span>')

## Clearing cell output

In [47]:
# Display a styled message (the closing tag is now a valid </span>) ...
display(Markdown('<span style="color:red; font-style: italic; font-size: 25px">Test for <b>Markdown</b> code</span>'))
# ... and remove it again right away; NOTE(review): assumes ml.clear_output() clears this cell's output - confirm in ml_package
ml.clear_output()

## Creating a loading icon

In [4]:
# Renders the loader markup shown in the output below: a "loader" div followed by a LOADING heading
ml.load_icon()

<div><div class="loader"></div><h3>&nbsp;LOADING</h3></div>

## Creating a button to hide all codes

In [3]:
# Adds the hide-code button described in the heading above
# NOTE(review): the exact behaviour lives in ml_package - confirm there
ml.hide_code()

## Printing dataset using plotly

In [31]:
def display_data(data_table, column_width=150, height=500):
    """Render a DataFrame as a plotly table figure and show it.

    Parameters
    ----------
    data_table : DataFrame-like
        Object exposing `.columns` and column access via `data_table[name]`.
    column_width : int, default 150
        Width in pixels reserved for each column; the figure width is
        `column_width * number_of_columns`.
    height : int, default 500
        Height of the figure in pixels.

    Returns
    -------
    None
        The figure is displayed through `fig.show()`.
    """
    column_names = list(data_table.columns)
    # One entry per column, in the same order as the header row
    column_values = [data_table[name] for name in column_names]

    table = go.Table(
        header=dict(values=column_names,
                    fill_color='black',
                    align='center',
                    font=dict(color='white', size=15)),
        cells=dict(values=column_values,
                   fill_color='#E3E3E3',
                   align='center',
                   font=dict(color='black', size=10)),
    )
    fig = go.Figure(data=[table])

    # No margins: the table fills the whole figure area
    fig.update_layout(width=column_width * len(column_names),
                      height=height,
                      margin=dict(l=0, r=0, b=0, t=0, pad=0))
    fig.show()

display_data(dataset)

# Data Manipulation

# Graph generation

## Loading Libraries

# Feature Engineering

# Feature Selection

## Boruta - Feature Selection

In [None]:
# NOTE(review): the column names in this cell ('Embarked', 'Survived', ...) belong to the Titanic data,
# while the cells above load iris into `dataset`; point `dataset` at the Titanic data before running it.
ohe_columns = ['Embarked', 'Pclass', 'Parch', 'Cabin', 'Sex', 'SibSp']  # Categorical variables
# Encoding categorical variables
dataset_Boruta = pd.get_dummies(data=dataset,
                             columns=ohe_columns,
                             drop_first=False, # drop_first = False keeps a dummy column for every category
                             dummy_na=True) # dummy_na = True will treat NA as a category of its own

# Selecting training features (identifier, target and free-text columns are excluded)
features = [f for f in dataset_Boruta.columns if f not in ['PassengerId', 'Survived', 'Name', 'Ticket']]

# Replacing NA's with the column mean and clipping extreme values to +/- 1e9
dataset_Boruta[features] = dataset_Boruta[features].fillna(dataset_Boruta[features].mean()).clip(-1e9,1e9)

# Creating final dataset
X = dataset_Boruta[features].values
Y = dataset_Boruta['Survived'].values.ravel()

# Using RandomForestClassifier where max_depth of the tree is advised on the Boruta GitHub page to be between 3 and 7
rf = RandomForestClassifier(n_jobs=-1, class_weight='balanced', max_depth=5)

# Running the feature selector for at most 50 iterations; perc = 90 compares each real feature against the
# 90th percentile of the shadow-feature importances instead of their maximum
boruta_feature_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=4242, max_iter = 50, perc = 90)
boruta_feature_selector.fit(X, Y)

# Filtering dataset for selected features
X_filtered = boruta_feature_selector.transform(X)

# Printing names of final features
# support_ is a boolean mask aligned with `features`; pairing the two directly also copes with the case
# where no feature is selected (iterating an empty index array with np.nditer raises a ValueError)
final_features = [name for name, selected in zip(features, boruta_feature_selector.support_) if selected]
print(final_features)

# Modelling

## Treating Categorical Data

### Label Encoding

In [46]:
# Create the encoder and learn the distinct species labels
# NOTE(review): ml.le() presumably returns a scikit-learn style LabelEncoder - confirm in ml_package
le = ml.le()
le.fit(dataset['Species'])
# Replace the species labels with their encoded values ...
dataset['Species'] = le.transform(dataset['Species'])
# ... and map them straight back, so the cell is a round-trip demonstration of the encoder
dataset['Species'] = le.inverse_transform(dataset['Species'])

### One Hot Encoding

In [41]:
# One-hot encode the 'Species' column
# NOTE(review): this cell's execution count (41) predates the dataset reload (45); if ml.ohe removes
# 'Species', the later `dataset[['Species']]` selection fails on a fresh Run All - confirm the intended order
dataset = ml.ohe(dataset, ['Species'])

## Splitting the dataset into train-test split

In [50]:
# The four measurement columns are the model inputs; 'Species' is kept as a one-column frame for the target
iris_feature_columns = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
x_train_array = dataset[iris_feature_columns]
y_train_array = dataset[['Species']]

In [51]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the partition repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x_train_array,
    y_train_array,
    test_size=0.20,
    random_state=5,
)

## Normalizing the dataset

In [54]:
# Normalize the training features together with the test features passed in the list
# NOTE(review): presumably ml.norm fits the scaling on x_train only and returns the transformed training
# set followed by one transformed frame per list entry - confirm in ml_package
x_train_scaled, x_test_scaled = ml.norm(x_train, [x_test])

## Model Building

### Logistic Regression

Why can't linear regression be used for classification, and why use logistic regression instead?

Linear regression is very sensitive to outliers, which can cause large inaccuracies, so a straight line is a poor classifier; this is where logistic regression comes into the picture. In addition, the "probabilities" predicted by linear regression can fall above 1 or below 0.

Logistic regression solves this problem by limiting the probabilities between 0 and 1 and also reducing the outlier sensitivity by introducing a sigmoid function.

RMSE is not applicable to logistic regression, and the analysis of residuals is not the same as in linear regression.
The model is fit using MLE (Maximum Likelihood Estimation), which finds the solution whose estimated log odds best describe the observed outcomes. Fitting involves a quasi-Newton optimization that iterates over scoring steps (Fisher scoring). The function being optimized is the deviance, $-2\log(L)$, where $L$ is the likelihood.


https://www.youtube.com/watch?v=uFfsSgQgerw&ab_channel=KrishNaik

Wald's test can be used to drop columns that are not useful in the analysis.



**Parameters**

solver = {‘newton-cg’, ‘lbfgs’, ‘liblinear’, ‘sag’, ‘saga’}, default=’lbfgs’

In [None]:
penalty = {‘l1’, ‘l2’, ‘elasticnet’, ‘none’}, default=’l2’


In [100]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Multinomial logistic regression fitted with the lbfgs solver
# NOTE(review): newer scikit-learn releases appear to deprecate the `multi_class` argument - confirm against
# the installed version before upgrading
model = LogisticRegression(multi_class='multinomial',solver = 'lbfgs')

# y_train is a one-column frame; flattening it to 1-D gives the estimator the target shape it expects
# and avoids the column-vector conversion warning
lr_model = model.fit(x_train_scaled, np.ravel(y_train))

# Once the model is trained, it's ready to make predictions
y_pred = lr_model.predict(x_test_scaled)

# Share of correctly classified test rows, expressed as a percentage
accuracy = accuracy_score(y_test, y_pred)*100
accuracy

90.0

In [None]:
XGBoost

https://www.youtube.com/watch?v=GrJP9FLV3FE&ab_channel=StatQuestwithJoshStarmer&t=2632s
    
https://www.youtube.com/watch?v=Nol1hVtLOSg&ab_channel=KrishNaik&t=19s

# Creating an HTML of current notebook

In [21]:
%%javascript
// Ask the Python kernel to store this notebook's file name in the variable `notebook_name`.
// NOTE(review): the `IPython.notebook` object is presumably only provided by the classic Notebook
// front end - confirm before relying on this cell in other front ends.
IPython.notebook.kernel.execute(`notebook_name = '${IPython.notebook.notebook_name}'`);

<IPython.core.display.Javascript object>

In [22]:
# Show a loading indicator while the export runs
display(Markdown('<div><div class="loader"></div><h3>&nbsp;LOADING</h3></div>'))

# `notebook_name` is set in the kernel by the preceding %%javascript cell.
# os.path.join uses the platform's path separator instead of a hard-coded backslash.
notebook_path = os.path.join(os.getcwd(), notebook_name)

# First attempt uses the toc2 template; if that conversion exits with a non-zero status,
# retry with nbconvert's default HTML template.
nbconvert_command = 'jupyter nbconvert "{}" --no-input --no-prompt{} --to=html'
exit_status = os.system(nbconvert_command.format(notebook_path, ' --template toc2'))
if exit_status != 0:
    exit_status = os.system(nbconvert_command.format(notebook_path, ''))

# Replace the loading indicator with the outcome of the export
clear_output()
if exit_status == 0:
    display(Markdown('<span style="color:black; font-size: 15px"><b><i>HTML Report is '+
                     'saved in your current local folder!</i></b></span>'))
else:
    # Both conversions failed: do not claim that a report was written
    display(Markdown('<span style="color:red; font-size: 15px"><b><i>HTML Report could not be '+
                     'created (nbconvert failed).</i></b></span>'))

<span style="color:black; font-size: 15px"><b><i>HTML Report issaved in your current local folder!</i></b></span>