In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file under the Kaggle input directory.
# The walk lives inside a generator expression so its loop variables stay
# local to it. In particular the throwaway directory list is no longer bound
# to `_` at notebook scope, where IPython keeps the previous cell's output.
for input_path in (
    os.path.join(dirname, filename)
    for dirname, _subdirs, filenames in os.walk('/kaggle/input')
    for filename in filenames
):
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the Titanic training set and preview its first five rows.
X = pd.read_csv(filepath_or_buffer="../input/titanic/train.csv")
X.head(n=5)

In [None]:
# Create the target series.
# `pop` removes 'Survived' from X in place, so the call is guarded: re-running
# this cell after the column has already been removed would otherwise raise
# KeyError. On a re-run the previously extracted `y` is simply kept.
if 'Survived' in X.columns:
    y = X.pop('Survived')
y.head()

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.datasets import fetch_openml
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.base import TransformerMixin, BaseEstimator 
from sklearn.model_selection import RandomizedSearchCV

In [None]:
class CategoricalImputer(TransformerMixin, BaseEstimator):
    """Fill missing values with each column's most frequent value.

    ``fit`` records, for every column of the input DataFrame, the value that
    occurs most often (missing values are not counted). ``transform`` then
    fills missing entries column by column with those recorded values. The
    same most-frequent rule is applied to every column, whatever its dtype.

    Attributes
    ----------
    fill : pandas.Series
        Set by ``fit``. Indexed by column name; holds the fill value for each
        column that contained at least one non-missing value.
    """

    def fit(self, X, y=None):
        """Learn the most frequent value of every column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Data whose per-column most frequent values are recorded.
        y : ignored
            Accepted only so the transformer can be used inside a Pipeline.

        Returns
        -------
        self
        """
        most_frequent = {}
        for column in X.columns:
            counts = X[column].value_counts()
            # A column with no non-missing values yields an empty count table;
            # taking index[0] would raise IndexError. Such a column gets no
            # fill value and is left untouched by transform.
            if len(counts) > 0:
                most_frequent[column] = counts.index[0]

        # Build the empty case with an explicit dtype so pandas does not have
        # to guess one for a Series with no values.
        if most_frequent:
            self.fill = pd.Series(most_frequent)
        else:
            self.fill = pd.Series(dtype=object)
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with missing values filled per column.

        Parameters
        ----------
        X : pandas.DataFrame
            Data to impute; columns are matched to ``self.fill`` by name.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            Result of ``X.fillna(self.fill)``.
        """
        return X.fillna(self.fill)

In [None]:
# Numeric columns: fill gaps with the column median, then standardise.
numeric_features = ['Age', 'Fare']
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. Categories unseen during fit are ignored at predict time.
categorical_features = ['Embarked', 'Sex', 'Pclass']
categorical_transformer = Pipeline(steps=[
    ('imputer', CategoricalImputer()),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Only the columns listed above are routed to a transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)])

# Seed the tree so that repeated runs of the notebook give the same model.
clf = DecisionTreeClassifier(random_state=0)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
pipe = Pipeline([('preprocessor', preprocessor),
                 ('classifier', clf)])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Search space for the tree's hyper-parameters. max_leaf_nodes starts at 2:
# scikit-learn rejects a value of 1, so sampling it only produces failed fits
# and wastes search iterations.
param_grid = {'classifier__max_depth': range(1, 50),
              'classifier__min_samples_split': range(2, 100),
              'classifier__min_samples_leaf': range(1, 100),
              'classifier__max_leaf_nodes': range(2, 100)}

random_search = RandomizedSearchCV(pipe, param_grid, cv=10, n_jobs=2, n_iter=1000, verbose=1,
                                   random_state=0)

random_search.fit(X_train, y_train)

# Mean cross-validated score of the best parameter setting found.
print(random_search.best_score_)
# 0.814612676056338 (recorded from an earlier run with the original search
# space and an unseeded tree; expect a somewhat different figure now)

# Score of the refitted best pipeline on the held-out 20% split.
random_search.score(X_test, y_test)
# 0.8268156424581006 (recorded from the same earlier run)

In [None]:
# Read the Kaggle test set, predict with the tuned pipeline and write the
# two-column submission file (PassengerId, Survived).
test = pd.read_csv("../input/titanic/test.csv")

output = test[['PassengerId']].assign(Survived=random_search.predict(test))
output.to_csv('my_submission2.csv', index=False)