### Load Libraries & Datasets

#### Libraries

In [13]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import seaborn as sns
import sklearn.preprocessing as preprocessing
import statsmodels.api as sm
import sys

from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, ADASYN


# Make the project's src/ directory importable from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path each time.
src_dir = os.path.join(os.getcwd(), '..', '..', 'src')
if src_dir not in sys.path:
    sys.path.append(src_dir)

# helper functions 
from d03_processing.feature_engineering import process_data_for_model_building
from d04_modelling.modelling import get_model_pvalue

# Load the "autoreload" extension
%load_ext autoreload

# reload modules so that as you change code in src, it gets loaded
%autoreload

%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Datasets

In [14]:
# Read the two input tables; paths are relative to this notebook's directory.
gasoline_proc = pd.read_csv('../../data/03_processed/gasoline_processed.csv')
astm = pd.read_csv('../../data/01_raw/ASTM_fuel.csv')

# Positionally rename the ASTM columns.
# NOTE(review): 'distillation_50_minC _retail' contains an embedded space. It is
# kept exactly as-is here because code outside this cell may look the column up
# by this name -- confirm before normalising it.
astm.columns = [
    'Date',
    'TN_retailers_seasons',
    'TN_distributor_seasons',
    # retail limits
    'vapor_liquid_minC_retail',
    'distillation_50_minC _retail',
    'distillation_50_maxC_retail',
    'vapor_pressure_maxC_retail',
    # distributor limits
    'vapor_liquid_minC_dist',
    'distillation_50_minC_dist',
    'distillation_50_maxC_dist',
    'vapor_pressure_maxC_dist',
]

#### Process Datasets

The helper functions for cleaning/processing the dataset can be found in the `src/d03_processing` folder.

In [15]:
# Pass both tables through the project helper imported from
# d03_processing.feature_engineering; the result is the frame used for modelling below.
gasoline = process_data_for_model_building(gasoline_proc, astm)

In [16]:
# Inspect the columns available in the processed frame before choosing features
gasoline.columns

Index(['Sample', 'prod', 'datesampled', 'grade', 'supplier', 'facilityname',
       'siteaddress', 'units_dist_50', 'units_vap_pressure',
       'units_vap_liq_pressure', 'method_dist_50', 'method_vap_pressure',
       'method_vap_liq_pressure', 'result_dist_50', 'result_vap_pressure',
       'result_vap_liq_pressure', 'minresults_dist_50',
       'minresults_vap_pressure', 'minresults_vap_liq_pressure',
       'maxresults_dist_50', 'maxresults_vap_pressure',
       'maxresults_vap_liq_pressure', 'compliance_dist_50',
       'compliance_vap_pressure', 'compliance_vap_liq_pressure',
       'siteaddress_city', 'datesampled_month', 'datesampled_day', 'Date',
       'TN_retailers_seasons', 'TN_distributor_seasons',
       'vapor_liquid_minC_retail', 'distillation_50_minC _retail',
       'distillation_50_maxC_retail', 'vapor_pressure_maxC_retail',
       'vapor_liquid_minC_dist', 'distillation_50_minC_dist',
       'distillation_50_maxC_dist', 'vapor_pressure_maxC_dist'],
      dtype='object')

### Build Model

#### Statsmodels (for p-values)

In [17]:
# construct features 
x_feats = ['TN_retailers_seasons', 'grade']
X = pd.get_dummies(gasoline[x_feats], dtype=float)
X = sm.tools.add_constant(X)
# convert target using get_dummies
y = pd.get_dummies(gasoline["compliance_vap_liq_pressure"], dtype=float)

In [18]:
# Hold out 30% of the rows for testing; the first indicator column of y is the
# target, and the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y.iloc[:, 0],
    test_size=0.3,
    random_state=0,
)

In [19]:
# Fixed seed so the synthetic minority samples are reproducible across runs
# (same seed value as the train/test split).
smote = SMOTE(random_state=0)

# Oversample only the training split; the held-out test data is left untouched.
# fit_resample is the current imbalanced-learn API: the old fit_sample alias was
# deprecated and then removed. to_numpy() yields the same 1-D array that
# Series.ravel() used to, without relying on the deprecated Series method.
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train.to_numpy())

def get_model_pvalue(y_train, X_train):
    """Fit a statsmodels Logit model and print the p-values of its parameters.

    Note: this definition shadows the ``get_model_pvalue`` imported from
    ``d04_modelling.modelling`` earlier in the notebook.

    Parameters
    ----------
    y_train : array-like
        Target values, passed to ``sm.Logit`` as the dependent variable.
    X_train : array-like
        Design matrix, passed to ``sm.Logit`` as the explanatory variables.

    Returns
    -------
    None
        The p-values are printed rather than returned.
    """
    # Fit on the arguments. The earlier version ignored both parameters and read
    # the notebook globals y_train_resampled / X_train_resampled instead, so the
    # function always fitted the same data regardless of what was passed in.
    logit_model = sm.Logit(y_train, X_train)
    result = logit_model.fit()
    print('\n PValues for model parameters: \n', result.pvalues)

In [20]:
# Fit the logit model on the SMOTE-resampled training data and print its parameter p-values
get_model_pvalue(y_train_resampled, X_train_resampled)

         Current function value: 0.422578
         Iterations: 35

 PValues for model parameters: 
 [0.99998995 0.97766845 0.99889617 0.97510694 0.97323902 0.99910215
 0.97375732 0.9988923  0.99880831 0.99999982 0.99999938 0.99998775
 0.9999986 ]




### Test Model