In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import OneClassSVM
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from scipy.stats import uniform
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler


import matplotlib.pyplot as plt

In [2]:
class dataset_builder():
    """Load a Titanic CSV split and expose it as numeric NumPy arrays.

    In training mode the labelled CSV is read, preprocessed and split into
    a stratified 80/20 train/validation partition, available afterwards as
    ``X_train``, ``X_valid``, ``y_train`` and ``y_valid``.

    In test mode the unlabelled CSV is read and preprocessed, available
    afterwards as ``ids`` (PassengerId values) and ``X_test``.
    """

    # Integer codes for the categorical columns; 0 is reserved for
    # missing (or unrecognised) values.
    _SEX_CODES = {"male": 1, "female": 2}
    _EMBARKED_CODES = {"S": 1, "C": 2, "Q": 3}
    _DECK_CODES = {"A": 1, "B": 2, "C": 3, "D": 4,
                   "E": 5, "F": 6, "G": 7, "T": 8}

    def __init__(self, train_mode=True, data_dir="../input/titanic") -> None:
        """Read and preprocess one split of the Titanic data.

        Args:
            train_mode: When True, read ``train.csv`` and create the
                train/validation split; otherwise read ``test.csv``.
            data_dir: Directory that contains ``train.csv`` / ``test.csv``.
        """
        if train_mode:
            data = pd.read_csv(f"{data_dir}/train.csv")
            _, labels, feats = self.preprocess(data, train_mode=True)
            # Stratify on the label so both partitions keep the same class ratio.
            (self.X_train, self.X_valid,
             self.y_train, self.y_valid) = train_test_split(
                feats, labels,
                test_size=0.2, random_state=42, stratify=labels, shuffle=True,
            )
        else:
            data = pd.read_csv(f"{data_dir}/test.csv")
            self.ids, self.X_test = self.preprocess(data, train_mode=False)

    def preprocess(self, data, train_mode=True):
        """Turn a raw Titanic frame into numeric feature arrays.

        The input frame is not modified. ``Sex``, ``Embarked`` and the first
        letter of ``Cabin`` (stored as ``cabin_deck``) are integer-encoded,
        with 0 standing for a missing or unrecognised value. Missing ``Age``
        and ``Fare`` become 0. ``Ticket``, ``Name`` and ``Cabin`` are dropped.

        Args:
            data: DataFrame holding the raw CSV columns.
            train_mode: When True, ``data`` must contain a ``Survived``
                column, which is split off and returned as the labels.

        Returns:
            ``(ids, labels, features)`` when ``train_mode`` is True, else
            ``(ids, features)``. ``features`` is a float32 array whose columns
            follow the order of the remaining frame columns, with
            ``cabin_deck`` last.
        """
        # Work on a copy: chained ``data[col].replace(..., inplace=True)`` is
        # not guaranteed to write back to the frame (pandas Copy-on-Write),
        # and the caller's frame should stay untouched.
        frame = data.copy()

        deck_codes = self._DECK_CODES
        frame["cabin_deck"] = frame["Cabin"].map(
            lambda cabin: deck_codes.get(cabin[0], 0)
            if isinstance(cabin, str) and cabin else 0
        )
        # ``map`` yields NaN for values absent from the mapping (including
        # missing ones); those all fall back to the 0 code.
        frame["Sex"] = frame["Sex"].map(self._SEX_CODES).fillna(0)
        frame["Embarked"] = frame["Embarked"].map(self._EMBARKED_CODES).fillna(0)
        frame["Age"] = frame["Age"].fillna(0)
        frame["Fare"] = frame["Fare"].fillna(0)

        frame = frame.drop(columns=["Ticket", "Name", "Cabin"])
        ids = frame.pop("PassengerId").to_numpy()

        if train_mode:
            labels = frame.pop("Survived").to_numpy()
            features = frame.astype(np.float32).to_numpy()
            return ids, labels, features

        features = frame.astype(np.float32).to_numpy()
        return ids, features
        
        

In [3]:
# Training partition (stratified 80/20 split produced by dataset_builder).
dataset = dataset_builder()
X_train, X_valid = dataset.X_train, dataset.X_valid

# Recode the 0/1 survival labels to -1/+1 so they line up with the
# +1 / -1 values that OneClassSVM.predict returns.
y_train = np.where(dataset.y_train, 1.0, -1.0)
y_valid = np.where(dataset.y_valid, 1.0, -1.0)

# Standardise the features, then fit the one-class SVM.
pipe = make_pipeline(StandardScaler(), OneClassSVM())
print(pipe)

# Search space; keys use the step names that make_pipeline generates.
distributions = dict(
    oneclasssvm__nu=uniform(loc=0.01, scale=0.9),
    oneclasssvm__kernel=['rbf', 'sigmoid'],
    oneclasssvm__gamma=['scale', 'auto'],
)

clf = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=distributions,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    random_state=42,
    return_train_score=True,
)
search = clf.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('oneclasssvm', OneClassSVM())])


In [4]:
# Five best parameter samples, ranked by mean cross-validated test score.
cv_results = pd.DataFrame(search.cv_results_)
cv_results.sort_values(by='mean_test_score', ascending=False).iloc[:5]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_oneclasssvm__gamma,param_oneclasssvm__kernel,param_oneclasssvm__nu,params,split0_test_score,split1_test_score,...,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
2,0.015666,0.000532,0.003841,0.000326,scale,sigmoid,0.150395,"{'oneclasssvm__gamma': 'scale', 'oneclasssvm__...",0.595238,0.511364,...,0.535198,0.036694,1,0.533524,0.545455,0.540311,0.527817,0.545194,0.53846,0.00686
7,0.017149,0.000555,0.004157,0.000232,auto,sigmoid,0.175064,"{'oneclasssvm__gamma': 'auto', 'oneclasssvm__k...",0.564417,0.502924,...,0.535106,0.024556,2,0.537226,0.539683,0.542569,0.534884,0.536657,0.538204,0.002669
6,0.019493,0.001876,0.004369,0.000174,auto,sigmoid,0.201105,"{'oneclasssvm__gamma': 'auto', 'oneclasssvm__k...",0.55,0.503067,...,0.527719,0.028364,3,0.528358,0.539823,0.537666,0.528975,0.531343,0.533233,0.004659
8,0.032528,0.000354,0.00669,0.000345,auto,sigmoid,0.482281,"{'oneclasssvm__gamma': 'auto', 'oneclasssvm__k...",0.48062,0.578512,...,0.47202,0.059116,4,0.457031,0.458252,0.468085,0.494163,0.459725,0.467451,0.013906
9,0.015126,0.000586,0.004869,0.000665,auto,rbf,0.272106,"{'oneclasssvm__gamma': 'auto', 'oneclasssvm__k...",0.510638,0.44586,...,0.452468,0.035151,5,0.440571,0.479751,0.455975,0.462992,0.473016,0.462461,0.013652


In [5]:
# Load the unlabelled test split; `dataset` now refers to the test builder.
dataset = dataset_builder(train_mode=False)
ids, X_test = dataset.ids, dataset.X_test

In [6]:
# Predict on the test features with the best pipeline found by the search.
best_pipeline = search.best_estimator_
pred = best_pipeline.predict(X_test)

In [7]:
# The model was fit against labels recoded to -1/+1, and OneClassSVM predicts
# in that same encoding. The `Survived` column is 0/1 in the training data,
# so map the predictions back before writing the submission file.
out_df = pd.DataFrame({
    'PassengerId': ids,
    'Survived': np.where(pred > 0, 1, 0),
})

out_df.to_csv('submission.csv', index=False)