In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from matplotlib import pyplot as plt

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# machine learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import (RandomForestClassifier)
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score

# /kaggle/input/titanic/train.csv
# /kaggle/input/titanic/test.csv
# /kaggle/input/titanic/gender_submission.csv
        
# Load the Kaggle Titanic training and test splits.
train_data = pd.read_csv("/kaggle/input/titanic/train.csv")
test_data = pd.read_csv("/kaggle/input/titanic/test.csv")

# Discard the columns that are not used as features. PassengerId is removed
# from the training frame only; the test frame keeps it.
unused_columns = ["Ticket", "Cabin", "Name"]
train_data = train_data.drop(columns=unused_columns + ["PassengerId"])
test_data = test_data.drop(columns=unused_columns)
combined = [train_data, test_data]

# Impute missing ages per (Sex, Pclass) group and store Age as an integer.
# guess_ages[sex, pclass - 1] holds the group's median age rounded to the
# nearest 0.5; the table is recomputed for each dataset before it is used.
guess_ages = np.zeros((2, 3))
for dataset in combined:
    # Encode Sex numerically: female -> 1, male -> 0.
    dataset["Sex"] = dataset["Sex"].map({ "female": 1, "male": 0 }).astype(int)

    # First pass: compute the median known age of every (sex, class) group.
    for sex_code in (0, 1):
        for pclass in (1, 2, 3):
            in_group = (dataset["Sex"] == sex_code) & (dataset["Pclass"] == pclass)
            known_ages = dataset.loc[in_group, "Age"].dropna()
            median_age = known_ages.median()

            # Snap the median to the nearest multiple of 0.5.
            half_steps = int(median_age / 0.5 + 0.5)
            guess_ages[sex_code, pclass - 1] = half_steps * 0.5

    # Second pass: fill each group's missing ages with that group's guess.
    for sex_code in (0, 1):
        for pclass in (1, 2, 3):
            needs_age = (
                dataset["Age"].isnull()
                & (dataset["Sex"] == sex_code)
                & (dataset["Pclass"] == pclass)
            )
            dataset.loc[needs_age, "Age"] = guess_ages[sex_code, pclass - 1]

    # Every age is now known, so the column can be cast to whole years.
    dataset["Age"] = dataset["Age"].astype(int)
    
combined = [train_data, test_data]
# Most frequent embarkation port in the training data (mode of the column);
# used later to fill missing Embarked values.
most_frequent_port = train_data["Embarked"].dropna().mode()[0]

# Fill missing test fares with the median test fare. The filled column is
# assigned back explicitly: calling fillna(inplace=True) on a column selection
# is a chained assignment, which does not update the frame under pandas
# Copy-on-Write and would leave NaN in Fare for the integer cast further down.
test_data["Fare"] = test_data["Fare"].fillna(test_data["Fare"].median())

# Quartile fare bands on the training data, kept only for exploration.
train_data["FareBand"] = pd.qcut(train_data["Fare"], 4)
# Mean survival per fare band, bound to a name so it can be inspected.
fareband_survival = (
    train_data[["FareBand", "Survived"]]
    .groupby(["FareBand"], as_index=False)
    .mean()
    .sort_values(by="FareBand", ascending=True)
)

# Right-inclusive bin edges: a value v gets band k when edges[k] < v <= edges[k + 1].
fare_edges = [-np.inf, 7.91, 14.454, 31, np.inf]
age_edges = [-np.inf, 16, 32, 48, 64, np.inf]
# Integer code for each embarkation port (a label encoding, not one-hot).
port_codes = {"S": 0, "C": 1, "Q": 2}

for dataset in combined:
    # Replace the raw fare with its band index 0..3.
    dataset["Fare"] = pd.cut(dataset["Fare"], bins=fare_edges, labels=False).astype(int)
    # Replace the age with its band index 0..4.
    dataset["Age"] = pd.cut(dataset["Age"], bins=age_edges, labels=False).astype(int)
    # Fill missing ports with the most frequent training port, then encode.
    dataset["Embarked"] = (
        dataset["Embarked"].fillna(most_frequent_port).map(port_codes).astype(int)
    )
    # Family size counts siblings/spouses, parents/children and the passenger.
    dataset["FamilySize"] = dataset["SibSp"] + dataset["Parch"] + 1
    # IsAlone is 1 exactly when the passenger is the only family member aboard.
    dataset["IsAlone"] = (dataset["FamilySize"] == 1).astype(int)
    
# Remove the exploratory FareBand column (training frame only) and Parch.
train_data = train_data.drop(columns=["FareBand", "Parch"])
test_data = test_data.drop(columns=["Parch"])
combined = [train_data, test_data]

# Separate the target from the features, then hold out 20% of the labelled
# rows (fixed seed) as an evaluation split.
train_y = train_data["Survived"]
train_x = train_data.drop(columns="Survived")
test_x = test_data
x_dev, x_eval, y_dev, y_eval = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    random_state=42,
)

# Hyper-parameter grid for the random forest.
# 'sqrt' replaces the former 'auto' value for max_features: 'auto' meant
# sqrt(n_features) for classifiers and was removed in scikit-learn 1.3, so
# keeping it makes the grid search raise on current versions.
paramgrid = {
    "n_estimators":      [100],
    'criterion':         ['gini', 'entropy'],
    'max_features':      ['sqrt', 'log2'],
    'min_samples_leaf':  list(range(2, 8))
}

# 4-fold cross-validated grid search on the development split only.
gs = GridSearchCV(RandomForestClassifier(random_state=77), paramgrid, cv=4)
gs.fit(x_dev, y_dev)

# Best estimator (refit on the development split), its parameters and its
# mean cross-validation score.
clf = gs.best_estimator_
params = gs.best_params_
score = gs.best_score_

# Score the selected model on the held-out split, which it has not seen.
eval_accuracy = accuracy_score(y_eval, clf.predict(x_eval))

# Refit the selected model on every labelled row before predicting.
clf.fit(train_x, train_y)

# PassengerId identifies rows in the submission but is not a feature, so it
# is set aside before prediction.
passIds = test_x["PassengerId"]
test_x = test_x.drop(columns=["PassengerId"])
pred_y = clf.predict(test_x).astype(int)

# Write the submission file: one row per test passenger.
output = pd.DataFrame({ "PassengerId": passIds, "Survived": pred_y })
output.to_csv("submission.csv", index=False)
print("Success!")

Success!
