In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.outliers_influence \
import variance_inflation_factor as VIF
from statsmodels.stats.anova import anova_lm
from ISLP.models import (ModelSpec as MS,
summarize,
poly)
from ISLP import confusion_table
from sklearn.metrics import matthews_corrcoef


# load data
train_df = pd.read_csv("train.csv")

In [2]:
train_df.columns

Index(['ID', 'mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'year', 'origin', 'name', 'mpg01'],
      dtype='object')

In [3]:
#select variables/features for model
#only using 2 variables, you can use more if you want
feature_columns = ['horsepower', 'weight']

#create model matrix with features
model_matrix = MS(feature_columns)

#fit transform the training df
X_train = model_matrix.fit_transform(train_df)

#set the output target (predicting mpg)
target_column = 'mpg01'
y_train = train_df[target_column]

#define the model, train and target
model1 = sm.GLM(y_train,X_train, family=sm.families.Binomial())

#fit the data to the model defn
fitted_model1 = model1.fit()

#print results
summarize(fitted_model1)

Unnamed: 0,coef,std err,z,P>|z|
intercept,13.3851,1.449,9.238,0.0
horsepower,-0.0415,0.013,-3.285,0.001
weight,-0.0033,0.0,-6.994,0.0


In [4]:
#use trained model to predict
y_train_pred = fitted_model1.predict(X_train)

#if probably is >0.5 then true (high mpg)
train_df["mpg01_pred"] = y_train_pred > 0.5

#show confusion matrix
confusion_table(train_df["mpg01"], train_df["mpg01_pred"])

Truth,False,True
Predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
False,173,24
True,23,177


In [5]:
mcc = matthews_corrcoef(train_df["mpg01"], train_df["mpg01_pred"])
mcc

0.7632113315077836

In [6]:
#Now use the model for testing data for competition

#load the data with no mpg
test_df = pd.read_csv("test.csv")

#transport the input using model matrix
X_test = model_matrix.transform(test_df)

#use fitted model to predict mpg high or low
y_test_pred = fitted_model1.predict(X_test)

In [7]:
#use fitted model to predict mpg high or low
y_test_pred = fitted_model1.predict(X_test)
test_df["mpg01"] = y_test_pred > 0.5

In [8]:
#export csv for kaggle
test_df[["ID","mpg01"]].to_csv('predicted.csv',index=False)