In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import logging
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from ml.data import basic_preprocess
from ml.model import train_model, save_model, load_model, predict_batch
from datetime import datetime
#warnings.filterwarnings('ignore')

# Root-logger setup for the notebook: INFO and above, each record rendered as
# "<timestamp> <LEVEL> <message>" with a second-resolution timestamp.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)

In [2]:
# Read the train and test CSVs (paths are relative to the notebook's working
# directory). Order matters: the first path becomes `df`, the second `df_test`.
logging.info("Loading data")
df, df_test = (
    pd.read_csv(csv_path) for csv_path in ("data/train.csv", "data/test.csv")
)

2022-10-27 06:35:18 INFO Loading data


In [3]:
# Preview the first five rows of the training frame (5 is the pandas default).
df.head(n=5)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# Fit on the training frame with "Transported" as the label column.
# train_model returns a (model, dv) pair; its preprocessing steps are logged
# and the validation metrics are printed in this cell's output.
model, dv = train_model(df, target="Transported")

2022-10-27 06:35:18 INFO Replacing " " and "-" with "_" in and  column names and converting to lower case
2022-10-27 06:35:18 INFO Converting CryoSleep to binary
2022-10-27 06:35:18 INFO Converting VIP to binary
2022-10-27 06:35:18 INFO Splitting "cabin" from `deck/num/side` to only `deck`
2022-10-27 06:35:18 INFO Found 5 categorical columns
2022-10-27 06:35:18 INFO Categorical columns: Index(['passengerid', 'homeplanet', 'cabin', 'destination', 'name'], dtype='object')
2022-10-27 06:35:18 INFO Found 9 numerical columns
2022-10-27 06:35:18 INFO Numerical columns: Index(['cryosleep', 'age', 'vip', 'roomservice', 'foodcourt', 'shoppingmall',
       'spa', 'vrdeck', 'transported'],
      dtype='object')
2022-10-27 06:35:18 INFO Converting str to lower case
2022-10-27 06:35:18 INFO Converting target to binary
2022-10-27 06:35:18 INFO Removing target from cat_cols or num_cols
2022-10-27 06:35:18 INFO Filling categorical columns with mode
2022-10-27 06:35:18 INFO Filling numerical columns wi

***** Validation metrics *****
AUC: 0.860
Precision: 0.811
Recall: 0.758
F1: 0.784
Accuracy: 0.784
******************************


In [5]:
# Persist the fitted (model, dv) pair to disk.
# NOTE(review): save_model is defined in ml.model and is not visible here, so it
# is unknown whether it creates the target directory itself. Creating it up
# front (a no-op when it already exists) keeps a fresh-checkout
# "Restart & Run All" from failing on a missing ./models folder.
os.makedirs(os.path.dirname("./models/model.bin"), exist_ok=True)
save_model(model, dv, "./models/model.bin")



2022-10-27 06:35:19 INFO Saving model to ./models/model.bin


In [6]:
# Restore the (model, dv) pair from the file written by the save step, so the
# predictions below come from the persisted artifacts.
model, dv = load_model("./models/model.bin")

# Score the test frame with the reloaded pair. The same target name used for
# training is passed through to predict_batch.
test_predictions = predict_batch(df_test, model, dv, target="Transported")



2022-10-27 06:35:20 INFO Loading model from ./models/model.bin
2022-10-27 06:35:20 INFO Replacing " " and "-" with "_" in and  column names and converting to lower case
2022-10-27 06:35:20 INFO Converting CryoSleep to binary
2022-10-27 06:35:20 INFO Converting VIP to binary
2022-10-27 06:35:20 INFO Splitting "cabin" from `deck/num/side` to only `deck`
2022-10-27 06:35:20 INFO Found 5 categorical columns
2022-10-27 06:35:20 INFO Categorical columns: Index(['passengerid', 'homeplanet', 'cabin', 'destination', 'name'], dtype='object')
2022-10-27 06:35:20 INFO Found 8 numerical columns
2022-10-27 06:35:20 INFO Numerical columns: Index(['cryosleep', 'age', 'vip', 'roomservice', 'foodcourt', 'shoppingmall',
       'spa', 'vrdeck'],
      dtype='object')
2022-10-27 06:35:20 INFO Converting str to lower case
2022-10-27 06:35:20 INFO Removing target from cat_cols or num_cols
2022-10-27 06:35:20 INFO Filling categorical columns with mode
2022-10-27 06:35:20 INFO Filling numerical columns with me

In [7]:
# Show the frame returned by predict_batch; the notebook renders the cell's
# last expression, and pandas truncates the middle rows of long frames.
test_predictions

Unnamed: 0,passengerid,homeplanet,cryosleep,cabin,destination,age,vip,roomservice,foodcourt,shoppingmall,spa,vrdeck,name,prediction,prediction_probability
0,0013_01,earth,1.0,g,trappist-1e,27.000000,0.0,0.0,0.0,0.0,0.0,0.0,nelly carsoning,True,0.775082
1,0018_01,earth,0.0,f,trappist-1e,19.000000,0.0,0.0,9.0,0.0,2823.0,0.0,lerome peckers,False,0.020000
2,0019_01,europa,1.0,c,55 cancri e,31.000000,0.0,0.0,0.0,0.0,0.0,0.0,sabih unhearfus,True,1.000000
3,0021_01,europa,0.0,c,trappist-1e,38.000000,0.0,0.0,6652.0,0.0,181.0,585.0,meratz caltilter,True,0.910000
4,0023_01,earth,0.0,f,trappist-1e,20.000000,0.0,10.0,0.0,635.0,0.0,0.0,brence harperez,False,0.430000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4272,9266_02,earth,1.0,g,trappist-1e,34.000000,0.0,0.0,0.0,0.0,0.0,0.0,jeron peter,True,0.777356
4273,9269_01,earth,0.0,f,trappist-1e,42.000000,0.0,0.0,847.0,17.0,10.0,144.0,matty scheron,False,0.230000
4274,9271_01,mars,1.0,d,55 cancri e,28.658146,0.0,0.0,0.0,0.0,0.0,0.0,jayrin pore,True,1.000000
4275,9273_01,europa,0.0,d,trappist-1e,28.658146,0.0,0.0,2680.0,0.0,0.0,523.0,kitakan conale,True,0.780000
