In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import sys
import os

import sklearn.preprocessing as preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import statsmodels as sm
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, ADASYN

src_dir = os.path.join(os.getcwd(), '..', '..', 'src')
sys.path.append(src_dir)

# OPTIONAL: Load the "autoreload" extension so that code can change
%load_ext autoreload

# OPTIONAL: always reload modules so that as you change code in src, it gets loaded
%autoreload 2

In [2]:
gasoline_proc = pd.read_csv('../../data/03_processed/gasoline_processed.csv')
astm = pd.read_csv('../../data/01_raw/ASTM_fuel.csv')
astm.columns = ['Date', 'TN_retailers_seasons', 'TN_distributor_seasons',
       'vapor_liquid_minC_retail', 'distillation_50_minC _retail',
       'distillation_50_maxC_retail', 'vapor_pressure_maxC_retail',
       'vapor_liquid_minC_dist', 'distillation_50_minC_dist',
       'distillation_50_maxC_dist', 'vapor_pressure_maxC_dist']

In [3]:
for col in astm.columns: 
    try:
        astm[col] = astm[col].str.strip()
    except AttributeError: 
        pass

In [4]:
gasoline_proc['datesampled'] = pd.to_datetime(gasoline_proc['datesampled'])

In [5]:
gasoline_proc.dropna(subset=['grade'], inplace=True)

In [6]:
gasoline_proc['datesampled_month'] = gasoline_proc.datesampled.dt.month
gasoline_proc['datesampled_day'] = gasoline_proc['datesampled'].dt.day
gasoline_proc['datesampled_month_day'] = gasoline_proc['datesampled_month'].astype('str') + '/' + gasoline_proc['datesampled_day'].astype('str')

In [7]:
gasoline_proc.rename(columns={'datesampled_month_day':'Date'}, inplace=True)

In [8]:
gasoline_proc = gasoline_proc.merge(astm, 
               how='left', 
                on='Date'
               )

In [9]:
gasoline_proc.drop(columns=['zipcode'], inplace=True)

In [10]:
gasoline_proc.dropna(inplace=True)

In [11]:
gasoline_proc.reset_index(inplace=True, drop=True)

### Models - Compliance Distilation 50% 

**Possible Predictor Variables**

1. TN_retailers_seasons
1. TN_distributor_seasons
1. grade
1. supplier

**Possible Target Variable**
1. compliance_dist_50
1. compliance_vap_pressure
1. compliance_vap_liq_pressure

#### **Compliance vapor pressure ~ TN Retailers & Grade*

In [20]:
# construct features 
x_feats = ['TN_distributor_seasons', 'grade']
X = pd.get_dummies(gasoline_proc[x_feats], dtype=float)
X = sm.tools.add_constant(X)
# convert target using get_dummies
y = pd.get_dummies(gasoline_proc["compliance_vap_liq_pressure"], dtype=float)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y.iloc[:,1], test_size=0.3, random_state=0)

In [22]:
print("Number transactions X_train dataset: ", X_train.shape)
print("Number transactions y_train dataset: ", y_train.shape)
print("Number transactions X_test dataset: ", X_test.shape)
print("Number transactions y_test dataset: ", y_test.shape)

Number transactions X_train dataset:  (9081, 12)
Number transactions y_train dataset:  (9081,)
Number transactions X_test dataset:  (3893, 12)
Number transactions y_test dataset:  (3893,)


In [23]:
print("Before OverSampling, counts of label '1': {}".format(sum(y_train==1)))
print("Before OverSampling, counts of label '0': {} \n".format(sum(y_train==0)))

Before OverSampling, counts of label '1': 9055
Before OverSampling, counts of label '0': 26 



In [24]:
smote = SMOTE()

# simple resampling from your previously split data
X_train_resampled, y_train_resampled = smote.fit_sample(X_train, y_train.ravel())

In [25]:
print("After OverSampling, counts of label '1': {}".format(sum(y_train_resampled==1)))
print("After OverSampling, counts of label '0': {} \n".format(sum(y_train_resampled==0)))

After OverSampling, counts of label '1': 9055
After OverSampling, counts of label '0': 9055 



In [26]:
# Your code here
import statsmodels.api as sm
logit_model = sm.Logit(y_train_resampled, X_train_resampled)
result = logit_model.fit()

         Current function value: 0.393109
         Iterations: 35




In [27]:
result.summary()

  bse_ = np.sqrt(np.diag(self.cov_params()))
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


0,1,2,3
Dep. Variable:,y,No. Observations:,18110.0
Model:,Logit,Df Residuals:,18100.0
Method:,MLE,Df Model:,9.0
Date:,"Tue, 30 Jul 2019",Pseudo R-squ.:,0.4329
Time:,15:21:16,Log-Likelihood:,-7119.2
converged:,False,LL-Null:,-12553.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,19.2174,,,,,
x1,10.4383,9.53e+05,1.1e-05,1.000,-1.87e+06,1.87e+06
x2,8.7835,9.29e+05,9.45e-06,1.000,-1.82e+06,1.82e+06
x3,-19.8320,9.24e+05,-2.15e-05,1.000,-1.81e+06,1.81e+06
x4,13.9929,1.03e+06,1.36e-05,1.000,-2.02e+06,2.02e+06
x5,-20.3397,8.97e+05,-2.27e-05,1.000,-1.76e+06,1.76e+06
x6,10.1273,9.27e+05,1.09e-05,1.000,-1.82e+06,1.82e+06
x7,16.0472,2.21e+06,7.27e-06,1.000,-4.32e+06,4.32e+06
x8,1.2476,1.07e+06,1.17e-06,1.000,-2.09e+06,2.09e+06


#### **Supplier TN Distributor Seasons vs. compliance vap pressure**

<font color='red'>WARNING: SINGULAR MATRIX</font>

# construct features 
x_feats = ['TN_distributor_seasons', 'grade']
X = pd.get_dummies(gasoline_proc[x_feats], dtype=float)
# convert target using get_dummies
y = pd.get_dummies(gasoline_proc["compliance_vap_pressure"], dtype=float)

```python
# construct features 
x_feats = ['TN_distributor_seasons', 'supplier']
X = pd.get_dummies(gasoline_proc[x_feats], dtype=float)
# convert target using get_dummies
y = pd.get_dummies(gasoline_proc["compliance_vap_pressure"], dtype=float)
```

```python
X = sm.add_constant(X)
# fit model
logit_model = sm.Logit(y.iloc[:,0], X)
# get results of the fit
result = logit_model.fit()
```

#### **Grade/ TN Retailer Seasons vs. compliance vap pressure**

# construct features 
x_feats = ['TN_retailers_seasons', 'grade']
X = pd.get_dummies(gasoline_proc[x_feats], dtype=float)
# convert target using get_dummies
y = pd.get_dummies(gasoline_proc["compliance_vap_pressure"], dtype=float)

X = sm.add_constant(X)
# fit model
logit_model = sm.Logit(y.iloc[:,0], X)
# get results of the fit
result = logit_model.fit()

result.summary()

#### **Grade TN Distributor Seasons vs. compliance vap-liq pressure**

# construct features 
x_feats = ['TN_distributor_seasons', 'grade']
X = pd.get_dummies(gasoline_proc[x_feats], dtype=float)
# convert target using get_dummies
y = pd.get_dummies(gasoline_proc["compliance_vap_liq_pressure"], dtype=float)

X = sm.add_constant(X)
# fit model
logit_model = sm.Logit(y.iloc[:,0], X)
# get results of the fit
result = logit_model.fit()

result.summary()