In [None]:
# Import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier

# Load the datasets

In [None]:
# Directory that holds train.csv and test.csv
DATA_PATH = '../input/titanic'

# Read the training split; the first CSV column becomes the row index
train_csv_path = os.path.join(DATA_PATH, 'train.csv')
data = pd.read_csv(train_csv_path, index_col=0)

# Preview the first rows
data.head()

In [None]:
# Print each column's dtype and non-null count to spot missing values
data.info()

- The `Cabin` feature has too many missing values, so we can consider dropping it.

- The `Name` and `Ticket` features are somewhat difficult to process; for simplicity, we drop them for now.

- We can combine the `SibSp` and `Parch` features into a new feature called `Family` to prevent strong linear relationships between features.

# Process the data

In [None]:
# Drop the features that are not used in this analysis
useless_features = ['Name', 'Ticket', 'Cabin']
data = data.drop(columns=useless_features)

# Combine SibSp and Parch into one family-size feature; the +1 makes a
# passenger with no relatives aboard have Family == 1
data['Family'] = data['SibSp'] + data['Parch'] + 1
data = data.drop(columns=['SibSp', 'Parch'])

# Discretization: flag passengers whose family size is 1
data['Is_alone'] = (data['Family'] == 1).astype(int)

# Fill missing values. The result is assigned back instead of calling
# fillna(inplace=True) on a column selection: that chained in-place form acts
# on a temporary object under pandas copy-on-write and leaves `data` unchanged.
data['Age'] = data['Age'].fillna(data['Age'].mean())
data['Embarked'] = data['Embarked'].fillna('S')

# Encode the categorical columns as integers so that the histogram and
# correlation cells include them
data['Sex'] = data['Sex'].map({'male': 1, 'female': 0})
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2})

data.head()

In [None]:
# Draw one 50-bin histogram per column to inspect the feature distributions
data.hist(bins=50, figsize=(20, 15))
plt.show()

In [None]:
# Rank every column by its correlation coefficient with the target
corr_with_target = data.corr()['Survived']
corr_with_target.sort_values(ascending=False)

In [None]:
# Discretization: cut Age into `qcut` quantile-based bins labelled 0..qcut-1
qcut = 2
age_bins = pd.qcut(data['Age'], q=qcut, labels=range(qcut))
data['Age_stage'] = age_bins.astype(int)

# Re-check the correlations now that Age_stage is available
corr_with_target = data.corr()['Survived']
corr_with_target.sort_values(ascending=False)

In [None]:
# Discretization: cut Fare into `qcut` quantile-based bins labelled 0..qcut-1
qcut = 9
fare_bins = pd.qcut(data['Fare'], q=qcut, labels=range(qcut))
data['Fare_band'] = fare_bins.astype(int)

# Re-check the correlations now that Fare_band is available
corr_with_target = data.corr()['Survived']
corr_with_target.sort_values(ascending=False)

In [None]:
# Cast the integer-coded categorical features to strings so that the one-hot
# encoding step treats them as categories rather than as numbers
cat_features = ['Fare_band', 'Embarked', 'Is_alone', 'Pclass', 'Sex']
data[cat_features] = data[cat_features].astype(str)

In [None]:
# Features kept for modelling
chosen_features = ['Fare_band', 'Embarked', 'Is_alone', 'Pclass', 'Sex']

# One-hot encode the selected columns
selected = data[chosen_features]
dummy_data = pd.get_dummies(selected)
dummy_data.head()

In [None]:
# Attach the target so every dummy column can be correlated with it
dummy_data['Survived'] = data['Survived']
target_corr = dummy_data.corr()['Survived']
target_corr.sort_values(ascending=False)

# Encapsulate the preprocessing as a function

In [None]:
def preprocess_data(data):
    '''Turn a raw passenger frame into a one-hot encoded feature frame.

    The input frame is copied first, so the caller's DataFrame is left
    untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the columns `Pclass`, `Name`, `Sex`, `SibSp`,
        `Parch`, `Ticket`, `Fare`, `Cabin` and `Embarked`. A `Survived`
        column is optional.

    Returns
    -------
    pandas.DataFrame
        One-hot encoded `Fare_band`, `Embarked`, `Is_alone`, `Pclass` and
        `Sex` columns, plus `Survived` when it was present in the input.

    Notes
    -----
    The `Fare_band` quantile edges are computed from the frame passed in, so
    two frames processed separately (e.g. train and test) get bands whose
    edges generally differ.
    '''
    # Work on a copy so that the in-function edits do not leak to the caller
    data = data.copy()

    # Modify to category
    data['Pclass'] = data['Pclass'].astype(str)

    # Fillna: assign the results back. Calling fillna(inplace=True) on
    # `data[numerical_cols]` only fills a temporary copy, which left the
    # missing values (and therefore a missing Fare_band) in place.
    numerical_cols = data.select_dtypes(include=['int64', 'float64']).columns
    data[numerical_cols] = data[numerical_cols].fillna(data[numerical_cols].mean())
    data['Embarked'] = data['Embarked'].fillna('S')

    # Drop features
    useless_features = ['Name', 'Ticket', 'Cabin']
    data = data.drop(columns=useless_features)

    # Combine features: a passenger with no relatives aboard gets Family == 1
    data['Family'] = data['SibSp'] + data['Parch'] + 1
    data = data.drop(columns=['SibSp', 'Parch'])

    # Discretization
    data['Is_alone'] = (data['Family'] == 1).astype(int).astype(str)
    qcut = 9
    data['Fare_band'] = pd.qcut(data['Fare'], qcut, labels=range(qcut))

    # one-hot (the numeric `Survived` column, if any, passes through unchanged)
    chosen_features = ['Fare_band', 'Embarked', 'Is_alone', 'Pclass', 'Sex']
    if 'Survived' in data.columns:
        chosen_features.append('Survived')
    dummy_data = pd.get_dummies(data[chosen_features])

    return dummy_data

In [None]:
# Run both splits through the shared preprocessing function
train_df = preprocess_data(pd.read_csv(os.path.join(DATA_PATH, 'train.csv'), index_col=0))
test_df = preprocess_data(pd.read_csv(os.path.join(DATA_PATH, 'test.csv'), index_col=0))

# Separate the target from the feature columns
y = train_df['Survived'].values
feature_df = train_df.drop('Survived', axis=1)

# The arrays below carry no column names, so force the test frame into the
# training column set and order. A dummy column absent from the test split is
# filled with 0, and a column unseen in training is dropped, instead of the
# features being silently shifted.
test_df = test_df.reindex(columns=feature_df.columns, fill_value=0)

X = feature_df.values
X_test = test_df.values

# Hold out 20% of the training rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

---

# SVC model

In [None]:
def val_score(estimator, X, y):
    '''Print the accuracy and F1 score of a fitted estimator on (X, y).

    The estimator itself is printed first, followed by one line per metric.
    Nothing is returned.
    '''
    y_pred = estimator.predict(X)
    print('model:', estimator)
    # Each metric is computed right before it is printed
    for label, metric in (('accuracy:', accuracy_score), ('f1:', f1_score)):
        print(label, metric(y, y_pred))

In [None]:
# Candidate values for SVC's C parameter: 100 log-spaced points in [1e-5, 10]
params = np.logspace(-5, 1, 100)

train_acc = []
val_acc = []

# Fit one SVC per candidate and record its train / validation accuracy
for param in params:
    clf = SVC(C=param)
    clf.fit(X_train, y_train)
    train_acc.append(accuracy_score(y_train, clf.predict(X_train)))
    val_acc.append(accuracy_score(y_val, clf.predict(X_val)))

# The candidates are log-spaced, so use a logarithmic x-axis; on a linear
# axis almost all of the points collapse next to 0.
plt.plot(params, train_acc, label='Train')
plt.plot(params, val_acc, label='Val')
plt.xscale('log')
plt.xlabel('C')
plt.ylabel('Accuracy')
plt.title('SVC accuracy vs. C')
plt.legend()
plt.show()

In [None]:
# Refit with the C that reached the highest validation accuracy in the sweep
# (the first such C when several tie)
best_idx = int(np.argmax(val_acc))
svm_clf = SVC(C=params[best_idx])
svm_clf.fit(X_train, y_train)

# Report the scores on the same validation split that was used to pick C
val_score(svm_clf, X_val, y_val)

---

SVC(C=4.328761281083062)$\longrightarrow$0.77990

In [None]:
# Predict on the test features and index the predictions like the test frame
test_pred = svm_clf.predict(X_test)
submission = pd.DataFrame({'Survived': test_pred}, index=test_df.index)
submission

In [None]:
# Write the predictions to disk; the frame's index is saved as the first CSV column
submission.to_csv('submission.csv')