In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif, chi2, RFE, SelectFromModel, SequentialFeatureSelector
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn import tree
from sklearn import ensemble
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
import ada

In [2]:
model_to_use = "svm"
def model_creator(model_to_use):
    if model_to_use == "cart":
        return tree.DecisionTreeClassifier()
    elif model_to_use == "random_forest":
        return ensemble.RandomForestClassifier()
    elif model_to_use == "xgboost":
        return ensemble.GradientBoostingClassifier()
    elif model_to_use == "nonlinearSVM":
        return svm.SVC(kernel="rbf")
    else:
        return svm.SVC(kernel= 'linear', random_state=42)

In [3]:
base = pd.read_csv("titanic/titanic.csv", index_col=0)
base = pd.get_dummies(base)
keys = base.keys()
base = base.apply(pd.to_numeric, errors='coerce')
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(base)
base = imp.transform(base)
base = pd.DataFrame(base, columns=keys)
base

Unnamed: 0,class,"v1_Abbing, Mr. Anthony","v1_Abbott, Mr. Rossmore Edward","v1_Abbott, Mrs. Stanton (Rosa Hunt)","v1_Abelson, Mr. Samuel","v1_Abelson, Mrs. Samuel (Hannah Wizosky)","v1_Adahl, Mr. Mauritz Nils Martin","v1_Adams, Mr. John","v1_Ahlin, Mrs. Johan (Johanna Persdotter Larsson)","v1_Aks, Mrs. Sam (Leah Rosen)",...,"v1_Yrois, Miss. Henriette (""Mrs Harbeck"")","v1_Zabour, Miss. Hileni","v1_Zabour, Miss. Thamine","v1_Zimmerman, Mr. Leo","v1_de Messemaeker, Mrs. Guillaume Joseph (Emma)","v1_de Mulder, Mr. Theodore","v1_de Pelsmaeker, Mr. Alfons","v1_del Carlo, Mr. Sebastiano","v1_van Billiard, Mr. Austin Blyler","v1_van Melkebeke, Mr. Philemon"
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
887,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
888,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
889,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
X_train, X_test, y_train, y_test = train_test_split(base.drop(columns=['class']), base["class"], test_size=0.3, random_state=42)

In [5]:
%%time
model = model_creator(model_to_use)
model.fit(X_train, y_train)
print('Accuracy train: %.3f' % accuracy_score(y_train, model.predict(X_train)))
print('Accuracy test: %.3f' % accuracy_score(y_test, model.predict(X_test)))

Accuracy train: 1.000
Accuracy test: 0.586
Wall time: 377 ms


In [6]:
%%time
connections = pd.read_csv("titanic/connections.csv")
b = pd.read_csv("titanic/titanic.csv")
for i in range(len(connections)):
    b = b.merge(pd.read_csv(f"titanic/{connections['to_table'][i]}"), left_on=connections["from_key"][i], right_on=connections["to_key"][i], suffixes=('', 'y'))
b = b[b.columns.drop(list(b.filter(regex='_id')))]
b = pd.get_dummies(b)
keys = b.keys()
b = b.apply(pd.to_numeric, errors='coerce')
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp.fit(b)
b = imp.transform(b)
b = pd.DataFrame(b, columns=keys)
X_train, X_test, y_train, y_test = train_test_split(b.drop(columns=['class']), b["class"], test_size=0.3, random_state=42)

Wall time: 205 ms


In [7]:
%%time
clf = model_creator(model_to_use)
clf.fit(X_train, y_train)
print('Accuracy train: %.3f' % accuracy_score(y_train, clf.predict(X_train)))
print('Accuracy test: %.3f' % accuracy_score(y_test, clf.predict(X_test)))
X_train

Accuracy train: 1.000
Accuracy test: 0.784
Wall time: 393 ms


Unnamed: 0,v4,v5,v6,v8,v9,"v1_Allen, Miss. Elisabeth Walton","v1_Allison, Master. Hudson Trevor","v1_Allison, Miss. Helen Loraine","v1_Allison, Mrs. Hudson J C (Bessie Waldo Daniels)","v1_Anderson, Mr. Harry",...,v11_E8,v11_F E69,v11_F G63,v11_F G73,v11_F2,v11_F33,v11_F38,v11_F4,v11_G6,v11_T
15,24.000000,0.0,0.0,8.0500,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,28.000000,0.0,0.0,35.5000,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
234,36.000000,0.0,2.0,71.0000,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
272,21.000000,0.0,0.0,77.9583,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
93,58.000000,0.0,0.0,146.5208,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
188,23.000000,1.0,0.0,113.2750,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
71,33.750547,1.0,1.0,22.3583,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
106,44.000000,2.0,0.0,90.0000,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
270,43.000000,0.0,1.0,211.3375,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
%%time
X_new = SelectKBest(f_classif, k=30).fit_transform(b.drop(columns=['class']), b["class"])
X_train, X_test, y_train, y_test = train_test_split(X_new, b["class"], test_size=0.3, random_state=42)
model = model_creator(model_to_use)
model.fit(X_train, y_train)
print('Accuracy train: %.3f' % accuracy_score(y_train, model.predict(X_train)))
print('Accuracy test: %.3f' % accuracy_score(y_test, model.predict(X_test)))

Accuracy train: 0.830
Accuracy test: 0.755
Wall time: 57.1 ms


In [9]:
np.seterr(divide='ignore', invalid='ignore')
base = pd.read_csv("titanic/titanic.csv", index_col=0)
connections = pd.read_csv("titanic/connections.csv")
tables = []
for i in range(len(connections)):
    tables.append(pd.read_csv(f"titanic/{connections['to_table'][i]}").fillna(method='backfill', axis=1))
distribution = base.groupby("class").count()["v1"].sort_values(ascending=False)
proportion = distribution.iloc[0] / (distribution.iloc[0] + distribution.iloc[1])

In [10]:
%%time
table = ada.ada(base, tables, connections, proportion)
base = pd.get_dummies(table)
base = base.fillna(method='backfill', axis=1)
X_train, X_test, y_train, y_test = train_test_split(base.drop(columns=["class"]), base["class"], test_size=0.3, random_state=42)
model = model_creator(model_to_use)
model.fit(X_train, y_train)
print('Accuracy train: %.3f' % accuracy_score(y_train, model.predict(X_train)))
print('Accuracy test: %.3f' % accuracy_score(y_test, model.predict(X_test)))

Accuracy train: 0.996
Accuracy test: 0.740
Wall time: 1.65 s
