In [2]:
%matplotlib inline

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV

In [3]:
# Load data
data = pd.read_csv("./train.csv", true_values=["True"], false_values=["False"])
# data_test = pd.read_csv("./test.csv", true_values=["True"], false_values=["False"])

In [8]:
data.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [5]:
def transform_data(df):

    #Function that fills the nans with 0s. Better way is to fill with the mean of the specific column for the numerical ones
    attributes = df.drop(columns = ["PassengerId", "Name", "Cabin"]) 
    attributes["RoomService"] = attributes["RoomService"].fillna(0)
    attributes["FoodCourt"] = attributes["FoodCourt"].fillna(0)
    attributes["ShoppingMall"] = attributes["ShoppingMall"].fillna(0)
    attributes["Spa"] = attributes["Spa"].fillna(0)
    attributes["VRDeck"] = attributes["VRDeck"].fillna(0)


    attributes["VIP"] = attributes["ShoppingMall"].fillna(False)
    attributes["CryoSleep"] = attributes["CryoSleep"].fillna(False)

    # Fill the nan with the mean of "Age"
    attributes["Age"] = attributes["Age"].fillna(np.nanmean(attributes.Age.to_numpy(np.float32)))

    # Drop remaining missing values which are in "Destination" and "HomePlanet"
    attributes = attributes.dropna()

    # Create labels
    labels = attributes["Transported"]
    
    # One-hot-encode the data
    attributes = attributes.drop(columns=["Transported"])
    attributes = pd.get_dummies(attributes, drop_first=True)

    return attributes, labels


In [7]:
# Call the function
attributes, labels = transform_data(data)

In [None]:
# Scale data
scaler = MinMaxScaler()
scaled_attributes = scaler.fit_transform(attributes)

In [None]:
# Set model and run a grid search
model = LogisticRegression()
tuned_params = [{"C": [0.001, 0.01, 0.1, 1, 10, 100, 1000],
                 "penalty": ["l1", "l2"],
                 "solver": ["lbfgs", "liblinear"],
                 "class_weight": [None, "balanced"]}]

grid = GridSearchCV(model, tuned_params)
grid.fit(attributes, labels)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [None]:
print(grid.best_params_)
print(grid.best_score_)

{'C': 0.01, 'class_weight': None, 'penalty': 'l2', 'solver': 'lbfgs'}
0.7950456346923522


In [None]:
# print(classification_report(labels, model.predict(attributes)))

              precision    recall  f1-score   support

       False       0.53      0.85      0.65      4129
        True       0.64      0.27      0.38      4185

    accuracy                           0.55      8314
   macro avg       0.59      0.56      0.51      8314
weighted avg       0.59      0.55      0.51      8314





In [None]:
# list(zip(attributes.columns.to_list(), model.coef_[0,:]))

[('CryoSleep', 1.7693150420678614),
 ('Age', -0.9719855112149873),
 ('VIP', 3.473487261108779),
 ('RoomService', -8.27199363975797),
 ('FoodCourt', 4.706903678264782),
 ('ShoppingMall', 3.473487261108779),
 ('Spa', -11.45062202855383),
 ('VRDeck', -10.796751937272907),
 ('HomePlanet_Europa', 1.4169590768328153),
 ('HomePlanet_Mars', 0.5158930950537393),
 ('Destination_PSO J318.5-22', -0.35015119064645467),
 ('Destination_TRAPPIST-1e', -0.3516048300271149)]