In [None]:
# Dataset Overview
- Importing, Analyzing & Understanding the Data/Importing the input data file(s)
- Initial data frame overview
- Null-value calculation and review
- Duplicate Analysis and review

# Data Cleaning
- Delete high-null features
- Categorical & Numerical segregation
- Categorical/Numerical features null-value treatment

# Exploratory data analysis (EDA)
- Data imbalance analysis
- Categorical/Numerical features Univariate analysis
- Categorical/Numerical features Bivariate analysis
- Output class imbalance analysis (for classification scenarios)

# Data Preparation Techniques
- Outliers analysis and treatment
- Binary categories treatment
- Dummy variables substitution
- Features scaling/normalization
- Feature construction
    - composing
    - decomposing
- Binning
- Log transform
- Grouping operations (based on the granularity level)
- Train-Test splitting

In [2]:
# Data Analysis
import numpy as np
import pandas as pd
from collections import Counter

# Data Visualization
import matplotlib.pyplot as plt
import matplotlib.style as style
%matplotlib inline

import seaborn as sns

# Plot Style
sns.set_context("paper")
style.use('fivethirtyeight')

#Sci-kit learn libraries
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Performance Evaluation/Metrics
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score

In [3]:
# statsmodels libraries
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

In [4]:
import warnings
# Install a global "ignore" filter: every warning raised after this cell is suppressed.
# NOTE(review): this also hides deprecation warnings from the libraries used
# below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [5]:
# Raise both pandas display limits to 80 so wider/longer frames are rendered in full
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, 80)

In [6]:
#================================================================

In [7]:
# Importing the input file
# The path is relative, so it is resolved against the notebook's working directory
data = pd.read_csv("fileName.csv")

In [None]:
# Number of columns (printed explicitly: a bare expression that is not the
# last statement of a cell is never displayed)
print("Number of columns:", len(data.columns))

# Data shape as (rows, columns); uses the `data` frame loaded by read_csv above
print("Data dimension:", data.shape)

In [None]:
# Additional data shape analysis
# `info()` prints its summary of the frame's columns directly
data.info()
# `describe()` returns a frame of summary statistics; as the last expression it is the cell output
data.describe()

In [None]:
#================================================================

In [None]:
# Persisting processed data
# Write to the "_Processed" file so the raw input file is not overwritten and
# the read below loads exactly what was just written. index=False keeps the
# row index from coming back as an extra "Unnamed: 0" column on reload.
data.to_csv("fileName_Processed.csv", index=False)
data_processed = pd.read_csv("fileName_Processed.csv")
data_processed.head()

In [None]:
#================================================================

In [None]:
# Separate the target series from the feature frame
Y = data_processed['targetColumn']
X = data_processed.drop(columns=['targetColumn'])

# 70% train / 30% test, with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    test_size=0.3,
    random_state=100,
)

In [None]:
# Using Recursive Feature Elimination (RFE) to reduce the feature count from 49 to 20
logreg = LogisticRegression()
# n_features_to_select is passed by keyword: recent scikit-learn releases
# reject it as a positional argument
rfe = RFE(estimator=logreg, n_features_to_select=20)
rfe = rfe.fit(X_train, y_train)

In [44]:
# Checking which columns remained after RFE
# `rfe.support_` is used as a boolean mask over the training columns
rfe_col = X_train.columns[rfe.support_]
rfe_col

In [None]:
# Checking which columns were eliminated after RFE
# `~` inverts the selection mask, leaving the columns RFE did not keep
X_train.columns[~rfe.support_]

In [50]:
def build_logistic_model(feature_list):
    """Fit a liblinear logistic regression on a subset of the training columns.

    Parameters
    ----------
    feature_list : list
        Column labels used to subset the notebook-level ``X_train`` frame.

    Returns
    -------
    LogisticRegression
        The estimator fitted on ``X_train[feature_list]`` against ``y_train``.
    """
    estimator = LogisticRegression(solver='liblinear')
    # `fit` returns the estimator itself, so fitting in place and returning it
    # is equivalent to returning the result of the chained call
    estimator.fit(X_train[feature_list], y_train)
    return estimator

In [None]:
# Preparing the features list: a plain Python list of the RFE-selected column labels
features = list(rfe_col)

# Building the regression model on those columns only
log_model = build_logistic_model(features)

In [None]:
# Utilizing the created model to make predictions using the 'predict_proba' functionality for the train set
# 'predict_proba' generates the probabilities for the target in array form
y_train_pred = log_model.predict_proba(X_train[features])
# NOTE(review): this bare expression is not the last statement of the cell, so it is not displayed
y_train_pred
# Indexing with the list [1] keeps column 1 as a 2-D, single-column array
# (presumably the positive-class probability — confirm against log_model.classes_)
y_train_pred[:,[1]]

In [None]:
# Utilizing the created model to make predictions using the 'predict_proba' functionality for the test set
# Uses `log_model`, the only model fitted in this notebook
y_test_pred = log_model.predict_proba(X_test[features])
# Indexing with the list [1] keeps column 1 as a 2-D, single-column array
# (presumably the positive-class probability — confirm against log_model.classes_)
y_test_pred[:,[1]]

In [None]:
# Model's performance metrics & evaluation
from sklearn.metrics import confusion_matrix, classification_report

# confusion_matrix / classification_report compare class labels, so predict
# hard labels here instead of passing the probability array from predict_proba
y_test_labels = log_model.predict(X_test[features])

conf_matrix = confusion_matrix(y_test, y_test_labels)
print(classification_report(y_test, y_test_labels))
print()  # blank line between the report and the matrix
print(conf_matrix)

# scikit-learn layout: rows are actual classes, columns are predicted classes.
# The indices below assume a binary target.
tn = conf_matrix[0,0]
fp = conf_matrix[0,1]
tp = conf_matrix[1,1]
fn = conf_matrix[1,0]

total = tn + fp + tp + fn
accuracy  = (tp + tn) / total # Accuracy Rate
precision = tp / (tp + fp) # Positive Predictive Value
recall    = tp / (tp + fn) # True Positive Rate

# Show the derived metrics so the cell's computations are visible
print("Accuracy:", accuracy, "Precision:", precision, "Recall:", recall)

In [None]:
# Test dataset features: column labels of the test frame restricted to the selected features
X_test[features].columns

In [None]:
#================================================================

In [None]:
# Serializing the fitted model to disk with pickle
import pickle

# The context manager closes the file once the dump has finished
with open("leadScoringModelName.pkl", 'wb') as model_file:
    pickle.dump(log_model, model_file)

print("Model has been pickled. Run /score to score model.")

In [None]:
# Loading and executing the saved model using Pickle
# The file name matches the one written by the dump cell exactly; the previous
# capitalised name fails on case-sensitive filesystems.
# NOTE: only unpickle files you created yourself — pickle.load can execute arbitrary code.
with open("leadScoringModelName.pkl", 'rb') as f:
    lead_scoring_model = pickle.load(f)

# Score the reloaded model on the held-out test split
result = lead_scoring_model.score(X_test[features], y_test)
print(result)

In [76]:
#================================================================

In [83]:
# Importing ONNX-related packages for converting the scikit-learn model into the ONNX model format

# Importing ONNX related packages
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType

# Importing ONNX Runtime related package
import onnxruntime as rt

# skl2onnx.get_latest_tested_opset_version()
# skl2onnx.supported_converters(from_sklearn=False)

In [84]:
# Local (relative) path where the ONNX model file is written and later loaded from
ONNXModelPath = "leadScoringModel.onnx"

In [85]:
# Defining the input features dimension as FloatTensorType
# The width is derived from the feature list the model was trained on, so the
# declared ONNX input always matches the arrays passed at inference time
num_features = len(features)
# None leaves the first (row) dimension unspecified
initial_type = [('feature_input', FloatTensorType([None, num_features]))]

In [87]:
# Converting the scikit-learn model into the ONNX format
# NOTE(review): the variable name `onnx` would shadow the `onnx` package if it were ever imported here
onnx = convert_sklearn(lead_scoring_model, initial_types=initial_type)

# Saving the serialized model bytes on the previously defined local path
with open(ONNXModelPath, "wb") as f:
    f.write(onnx.SerializeToString())

In [90]:
# Creating an inference session from the ONNX file saved at ONNXModelPath
session = rt.InferenceSession(ONNXModelPath)

In [None]:
# Name of the session's first input, used later as the key of the feed dict
input_name = session.get_inputs()[0].name
print(input_name)

In [None]:
# Name of the session's first output
label_name = session.get_outputs()[0].name
print(label_name)

In [118]:
# Executing the session and retrieving the results
# The features are cast to float32 before being fed under `input_name`.
# [1] keeps the second element of the list returned by `run`
# (presumably the per-class probabilities of the converted classifier — TODO confirm)
pred_onnx = session.run(None, {input_name: X_train[features].values.astype(np.float32)})[1]
pred_onnx

In [None]:
# Inspect the first two training rows together; with two separate bare
# expressions only the last one would be displayed as the cell output
X_train[features].values[:2]