In [90]:
import pandas as pd
import sklearn
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Input CSV locations (relative paths)
TRAIN_FILE_PATH, TEST_FILE_PATH = 'train.csv', 'test.csv'

# Read each split into its own frame, using the PassengerId column as the row index
train_data = pd.read_csv(filepath_or_buffer=TRAIN_FILE_PATH, index_col='PassengerId')
test_data = pd.read_csv(filepath_or_buffer=TEST_FILE_PATH, index_col='PassengerId')

In [91]:
# Show the training frame as the cell's rich output
train_data

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Family_size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,male,22.0,1,0,7.2500,S,1
2,1,1,female,38.0,1,0,71.2833,C,1
3,1,3,female,26.0,0,0,7.9250,S,0
4,1,1,female,35.0,1,0,53.1000,S,1
5,0,3,male,35.0,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...
887,0,2,male,27.0,0,0,13.0000,S,0
888,1,1,female,19.0,0,0,30.0000,S,0
889,0,3,female,28.0,1,2,23.4500,S,3
890,1,1,male,26.0,0,0,30.0000,C,0


In [92]:
# Rows where Sex is 'male' and Embarked is 'S', selected with a boolean mask.
# Same rows as: train_data.query("Sex == 'male' & Embarked == 'S'")
train_data.loc[train_data['Sex'].eq('male') & train_data['Embarked'].eq('S')]

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Family_size
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,0,3,male,22.0,1,0,7.2500,S,1
5,0,3,male,35.0,0,0,8.0500,S,0
7,0,1,male,54.0,0,0,51.8625,S,0
8,0,3,male,2.0,3,1,21.0750,S,4
13,0,3,male,20.0,0,0,8.0500,S,0
...,...,...,...,...,...,...,...,...,...
879,0,3,male,28.0,0,0,7.8958,S,0
882,0,3,male,33.0,0,0,7.8958,S,0
884,0,2,male,28.0,0,0,10.5000,S,0
885,0,3,male,25.0,0,0,7.0500,S,0


In [93]:
# Print per-column dtypes and non-null counts for the training frame
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Survived     891 non-null    int64  
 1   Pclass       891 non-null    int64  
 2   Sex          891 non-null    object 
 3   Age          891 non-null    float64
 4   SibSp        891 non-null    int64  
 5   Parch        891 non-null    int64  
 6   Fare         891 non-null    float64
 7   Embarked     891 non-null    object 
 8   Family_size  891 non-null    int64  
dtypes: float64(2), int64(5), object(2)
memory usage: 69.6+ KB


In [94]:
# Print per-column dtypes and non-null counts for the test frame
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Pclass       418 non-null    int64  
 1   Sex          418 non-null    object 
 2   Age          418 non-null    float64
 3   SibSp        418 non-null    int64  
 4   Parch        418 non-null    int64  
 5   Fare         418 non-null    float64
 6   Embarked     418 non-null    object 
 7   Family_size  418 non-null    int64  
dtypes: float64(2), int64(4), object(2)
memory usage: 29.4+ KB


In [95]:
# Impute missing data
def impute_data(df):
    """Return a copy of ``df`` with missing values filled in, column by column.

    Object-dtype columns are filled with their most frequent value (the first
    mode when several values tie); int64/float64 columns are filled with
    their median. The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain missing values.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``df``. An object
        column that has no non-null value at all is left unchanged, because
        it has no mode to fill with.
    """
    df_copy = df.copy()  # work on a copy so the caller's frame stays untouched

    # Categorical (object) columns: fill with the most frequent value.
    for col in df_copy.select_dtypes(include=['object']).columns:
        if df_copy[col].isnull().any():
            modes = df_copy[col].mode()
            if modes.empty:
                # Every entry is missing, so there is nothing to impute from.
                continue
            # Assign the result back instead of calling fillna(inplace=True) on
            # the selected column: the in-place form acts on an intermediate
            # object and does not update df_copy under pandas Copy-on-Write.
            df_copy[col] = df_copy[col].fillna(modes.iloc[0])

    # Numerical columns: fill with the median of the observed values.
    for col in df_copy.select_dtypes(include=['int64', 'float64']).columns:
        if df_copy[col].isnull().any():
            df_copy[col] = df_copy[col].fillna(df_copy[col].median())

    return df_copy

In [96]:
# Split the training frame: everything except 'Survived' becomes the feature
# table, and the 'Survived' column itself becomes the label vector.
train_data_sep = train_data.drop('Survived', axis='columns')
y_train = train_data.loc[:, 'Survived']

In [97]:
# Show the label Series taken from the 'Survived' column
y_train

PassengerId
1      0
2      1
3      1
4      1
5      0
      ..
887    0
888    1
889    0
890    1
891    0
Name: Survived, Length: 891, dtype: int64

In [98]:
# Perform one-hot encoding of categorical variables
def encode_categorical(df_train, df_test):
    """One-hot encode the categorical columns of the train and test frames together.

    The two frames are stacked before encoding so that both halves end up with
    the same dummy columns, then split back apart by row position.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Training features.
    df_test : pandas.DataFrame
        Test features.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The encoded train rows and the encoded test rows, in that order, each
        keeping its original index.
    """
    # Positional boundary between the two halves of the stacked frame. This
    # must be a row count: the last index label only equals the row count when
    # the index happens to run 1..N, and using it with .iloc otherwise puts
    # rows in the wrong half.
    n_train_rows = len(df_train)

    # Stack train on top of test so get_dummies sees every category once.
    concated_df = pd.concat([df_train, df_test], sort=False)

    # drop_first=True drops the first level of each encoded column.
    encoded_df = pd.get_dummies(concated_df, drop_first=True)

    # Split back into train and test by position.
    df_train_encoded = encoded_df.iloc[:n_train_rows, :]
    df_test_encoded = encoded_df.iloc[n_train_rows:, :]

    return df_train_encoded, df_test_encoded

In [99]:
# One-hot encode the train features and the test features in a single call;
# the function returns the encoded train frame first, then the encoded test frame.
X_train_encoded, X_test_encoded = encode_categorical(train_data_sep, test_data)

In [100]:
# Show the encoded training features as the cell's rich output
X_train_encoded

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Family_size,Sex_male,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1,3,22.0,1,0,7.2500,1,1,0,1
2,1,38.0,1,0,71.2833,1,0,0,0
3,3,26.0,0,0,7.9250,0,0,0,1
4,1,35.0,1,0,53.1000,1,0,0,1
5,3,35.0,0,0,8.0500,0,1,0,1
...,...,...,...,...,...,...,...,...,...
887,2,27.0,0,0,13.0000,0,1,0,1
888,1,19.0,0,0,30.0000,0,0,0,1
889,3,28.0,1,2,23.4500,3,0,0,1
890,1,26.0,0,0,30.0000,0,1,0,0


In [101]:
# Show the encoded test features as the cell's rich output
X_test_encoded

Unnamed: 0_level_0,Pclass,Age,SibSp,Parch,Fare,Family_size,Sex_male,Embarked_Q,Embarked_S
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
892,3,34.5,0,0,7.8292,0,1,1,0
893,3,47.0,1,0,7.0000,1,0,0,1
894,2,62.0,0,0,9.6875,0,1,1,0
895,3,27.0,0,0,8.6625,0,1,0,1
896,3,22.0,1,1,12.2875,2,0,0,1
...,...,...,...,...,...,...,...,...,...
1305,3,27.0,0,0,8.0500,0,1,0,1
1306,1,39.0,0,0,108.9000,0,0,0,0
1307,3,38.5,0,0,7.2500,0,1,0,1
1308,3,27.0,0,0,8.0500,0,1,0,1


In [102]:
# Print per-column dtypes and non-null counts for the encoded training features
X_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Pclass       891 non-null    int64  
 1   Age          891 non-null    float64
 2   SibSp        891 non-null    int64  
 3   Parch        891 non-null    int64  
 4   Fare         891 non-null    float64
 5   Family_size  891 non-null    int64  
 6   Sex_male     891 non-null    uint8  
 7   Embarked_Q   891 non-null    uint8  
 8   Embarked_S   891 non-null    uint8  
dtypes: float64(2), int64(4), uint8(3)
memory usage: 51.3 KB


In [103]:
# Print per-column dtypes and non-null counts for the encoded test features
X_test_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 418 entries, 892 to 1309
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Pclass       418 non-null    int64  
 1   Age          418 non-null    float64
 2   SibSp        418 non-null    int64  
 3   Parch        418 non-null    int64  
 4   Fare         418 non-null    float64
 5   Family_size  418 non-null    int64  
 6   Sex_male     418 non-null    uint8  
 7   Embarked_Q   418 non-null    uint8  
 8   Embarked_S   418 non-null    uint8  
dtypes: float64(2), int64(4), uint8(3)
memory usage: 24.1 KB


In [104]:
# Baseline Decision Tree Classifier with default hyperparameters.
# random_state is pinned because the tree randomly permutes features when
# choosing splits; without a seed, repeated runs can grow different trees.
dt_clf = DecisionTreeClassifier(random_state=42)
dt_clf.fit(X_train_encoded, y_train)  # fit on the encoded train data
pred_dt_simple = dt_clf.predict(X_test_encoded)  # predict on the encoded test data


In [105]:
# Grid Search for hyperparameter tuning
# The estimator is seeded so that every candidate is fitted deterministically
# and the selected parameters are the same on each run.
clf = DecisionTreeClassifier(random_state=42)
parameters = {
    'criterion': ['entropy', 'gini'],
    'max_depth': range(1, 10),
    'min_samples_split': range(2, 10),
    'min_samples_leaf': range(1, 10)
}

# 2 * 9 * 8 * 9 = 1296 candidates, each fitted on 5 folds (6480 fits);
# n_jobs=-1 spreads the fits over all available cores without changing results.
grid_cv = GridSearchCV(clf, parameters, cv=5, n_jobs=-1)
grid_cv.fit(X_train_encoded, y_train)

# best_estimator_ is the model refitted on the whole training set with the
# best-scoring parameter combination.
best_model = grid_cv.best_estimator_
print('Best classifier parameters:', grid_cv.best_params_)

grid_pred = best_model.predict(X_test_encoded)

Best classifier parameters: {'criterion': 'gini', 'max_depth': 7, 'min_samples_leaf': 6, 'min_samples_split': 3}


In [106]:
# Randomized Search for hyperparameter tuning
# Only a random subset of the grid is tried (n_iter defaults to 10), so
# random_state is pinned to make the sampled candidates the same on every run.
random_grid_cv = RandomizedSearchCV(clf, parameters, cv=5, random_state=42)
random_grid_cv.fit(X_train_encoded, y_train)


RandomizedSearchCV(cv=5, estimator=DecisionTreeClassifier(),
                   param_distributions={'criterion': ['entropy', 'gini'],
                                        'max_depth': range(1, 10),
                                        'min_samples_leaf': range(1, 10),
                                        'min_samples_split': range(2, 10)})

In [107]:
# Report the parameter combination the randomized search selected
print('Best classifier parameters:', random_grid_cv.best_params_)

# Take the estimator the search kept for that combination and use it to
# predict on the encoded test features
best_model_rand = random_grid_cv.best_estimator_
rand_grid_pred = best_model_rand.predict(X_test_encoded)

Best classifier parameters: {'min_samples_split': 4, 'min_samples_leaf': 5, 'max_depth': 7, 'criterion': 'gini'}


In [108]:
# Decision Tree Classifier with a hand-picked depth limit
# NOTE(review): max_depth=4 is hard-coded here; it is not read from
# grid_cv.best_params_ or random_grid_cv.best_params_. Confirm whether the
# tuned parameters were meant to be used instead.
# random_state is pinned so that refitting produces the same tree.
dt_best = DecisionTreeClassifier(max_depth=4, random_state=42)
dt_best.fit(X_train_encoded, y_train)
pred_tree_best = dt_best.predict(X_test_encoded)