In [143]:
import pandas as pd
# Load the labelled training set and the unlabelled holdout set from the
# local data/ directory (paths are relative to the notebook's working dir).
train = pd.read_csv('data/train.csv')
holdout = pd.read_csv('data/test.csv')

In [144]:
# Preview the first two raw holdout rows.
holdout.head(2)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S


In [145]:
# %load functions.py
def process_missing(df):
    """Fill the missing values in the Fare and Embarked columns.

    Missing fares are replaced with the mean of the frame's own Fare column
    and missing embarkation ports with "S". The frame passed in is left
    untouched; a new frame with the filled columns is returned.

    Usage
    ------

    holdout = process_missing(holdout)
    """
    # assign() builds a new frame, so the caller's data is never modified
    # behind its back and re-running a cell gives the same result.
    return df.assign(
        Fare=df["Fare"].fillna(df["Fare"].mean()),
        Embarked=df["Embarked"].fillna("S"),
    )

def process_age(df):
    """Bin the Age column into pre-defined age groups.

    Missing ages are first replaced with -0.5 so that they fall into the
    (-1, 0] bin labelled "Missing". The binned result is stored in a new
    Age_categories column. The frame passed in is left untouched; a new
    frame is returned.

    Usage
    ------

    train = process_age(train)
    """
    cut_points = [-1,0,5,12,18,35,60,100]
    label_names = ["Missing","Infant","Child","Teenager","Young Adult","Adult","Senior"]
    filled_age = df["Age"].fillna(-0.5)
    # assign() returns a new frame instead of writing into the caller's one.
    return df.assign(
        Age=filled_age,
        Age_categories=pd.cut(filled_age,cut_points,labels=label_names),
    )

def process_fare(df):
    """Bin the Fare column into pre-defined price bands.

    The band label is stored in a new Fare_categories column. Bins are
    right-inclusive: (-1, 12], (12, 50], (50, 100], (100, 1000]. The frame
    passed in is left untouched; a new frame is returned.

    Usage
    ------

    train = process_fare(train)
    """
    cut_points = [-1,12,50,100,1000]
    label_names = ["0-12","12-50","50-100","100+"]
    fare_bands = pd.cut(df["Fare"],cut_points,labels=label_names)
    # assign() returns a new frame instead of writing into the caller's one.
    return df.assign(Fare_categories=fare_bands)

def process_cabin(df):
    """Reduce the Cabin column to its leading letter.

    The first character of each Cabin value is stored in a new Cabin_type
    column ("Unknown" where Cabin is missing) and the original Cabin column
    is dropped. The frame passed in is left untouched; a new frame is
    returned.

    Usage
    ------

    train = process_cabin(train)
    """
    cabin_type = df["Cabin"].str[0].fillna("Unknown")
    # Drop first, then append: the caller's frame gains no stray Cabin_type
    # column, and Cabin_type still ends up as the last column.
    return df.drop(columns="Cabin").assign(Cabin_type=cabin_type)

def process_titles(df):
    """Extract the title from the Name column and map it to a coarse group.

    The title is the first run of letters that is preceded by a space and
    followed by a period (e.g. "Mr" in "Kelly, Mr. James"). It is mapped to
    one of Mr, Mrs, Miss, Master, Officer or Royalty and stored in a new
    Title column. Titles that are not in the mapping, and names with no
    match, end up as missing values. The frame passed in is left untouched;
    a new frame is returned.

    Usage
    ------

    train = process_titles(train)
    """
    titles = {
        "Mr" :         "Mr",
        "Mme":         "Mrs",
        "Ms":          "Mrs",
        "Mrs" :        "Mrs",
        "Master" :     "Master",
        "Mlle":        "Miss",
        "Miss" :       "Miss",
        "Capt":        "Officer",
        "Col":         "Officer",
        "Major":       "Officer",
        "Dr":          "Officer",
        "Rev":         "Officer",
        "Jonkheer":    "Royalty",
        "Don":         "Royalty",
        "Sir" :        "Royalty",
        "Countess":    "Royalty",
        "Dona":        "Royalty",
        "Lady" :       "Royalty"
    }
    # Raw string: the backslash belongs to the regex, not to a Python string
    # escape (a plain literal here triggers an invalid-escape warning).
    extracted_titles = df["Name"].str.extract(r' ([A-Za-z]+)\.',expand=False)
    # assign() returns a new frame instead of writing into the caller's one.
    return df.assign(Title=extracted_titles.map(titles))

def create_dummies(df,column_name):
    """One-hot encode a single column and append the indicator columns.

    The new columns are named "<column_name>_<value>" and are added to the
    right of the existing ones; the source column itself is kept.

    Usage
    ------

    train = create_dummies(train,"Age")
    """
    indicator_cols = pd.get_dummies(df[column_name],prefix=column_name)
    return pd.concat([df,indicator_cols],axis=1)

In [146]:
from sklearn.preprocessing import minmax_scale
def process_data(df):
    """Run the whole preprocessing pipeline on a raw passenger frame.

    Steps, in order: fill missing values, bin Age and Fare, extract titles,
    reduce Cabin to its leading letter, one-hot encode the categorical
    columns, and add min-max scaled copies of SibSp, Parch and Fare. The
    source categorical and numeric columns, plus Name, Age and Ticket, are
    dropped from the result.

    Note that the scaling is computed from the frame passed in, so each
    call (e.g. train vs. holdout) is scaled against its own min and max.
    """
    categorical_cols=['Age_categories','Fare_categories','Title','Cabin_type','Sex','Embarked','Pclass']
    numeric_cols=['SibSp', 'Parch','Fare']
    unused_cols=['Name','Age','Ticket']

    prepared = process_cabin(
        process_titles(process_fare(process_age(process_missing(df))))
    )
    for name in categorical_cols:
        prepared = create_dummies(prepared,name)
    for name in numeric_cols:
        prepared[name+'_scaled'] = minmax_scale(prepared[name])

    # Only the encoded / scaled columns (and any untouched ones) survive.
    return prepared.drop(columns=categorical_cols+numeric_cols+unused_cols)
    
    

In [147]:
# Replace the raw frames with their processed versions. The names are rebound,
# so re-running this cell without reloading the CSVs feeds already-processed
# frames (raw columns dropped) back into process_data.
train=process_data(train)
holdout=process_data(holdout)

In [148]:
# Preview the first two processed training rows.
train.head(2)

Unnamed: 0,PassengerId,Survived,Age_categories_Missing,Age_categories_Infant,Age_categories_Child,Age_categories_Teenager,Age_categories_Young Adult,Age_categories_Adult,Age_categories_Senior,Fare_categories_0-12,...,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,SibSp_scaled,Parch_scaled,Fare_scaled
0,1,0,0,0,0,0,1,0,0,1,...,1,0,0,1,0,0,1,0.125,0.0,0.014151
1,2,1,0,0,0,0,0,1,0,0,...,0,1,0,0,1,0,0,0.125,0.0,0.139136


In [149]:
# Preview the first two processed holdout rows.
holdout.head(2)

Unnamed: 0,PassengerId,Age_categories_Missing,Age_categories_Infant,Age_categories_Child,Age_categories_Teenager,Age_categories_Young Adult,Age_categories_Adult,Age_categories_Senior,Fare_categories_0-12,Fare_categories_12-50,...,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Pclass_1,Pclass_2,Pclass_3,SibSp_scaled,Parch_scaled,Fare_scaled
0,892,0,0,0,0,1,0,0,1,0,...,1,0,1,0,0,0,1,0.0,0.0,0.015282
1,893,0,0,0,0,0,1,0,1,0,...,0,0,0,1,0,0,1,0.125,0.0,0.013663


In [150]:
# Check column names, dtypes and null counts after preprocessing.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 39 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   PassengerId                 891 non-null    int64  
 1   Survived                    891 non-null    int64  
 2   Age_categories_Missing      891 non-null    uint8  
 3   Age_categories_Infant       891 non-null    uint8  
 4   Age_categories_Child        891 non-null    uint8  
 5   Age_categories_Teenager     891 non-null    uint8  
 6   Age_categories_Young Adult  891 non-null    uint8  
 7   Age_categories_Adult        891 non-null    uint8  
 8   Age_categories_Senior       891 non-null    uint8  
 9   Fare_categories_0-12        891 non-null    uint8  
 10  Fare_categories_12-50       891 non-null    uint8  
 11  Fare_categories_50-100      891 non-null    uint8  
 12  Fare_categories_100+        891 non-null    uint8  
 13  Title_Master                891 non

In [152]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV 

In [153]:
def select_features(df):
    """Pick a feature subset with recursive feature elimination (RFECV).

    Only numeric / boolean columns without nulls are considered;
    PassengerId and Survived are excluded from the candidates and Survived
    is used as the target. A seeded random forest is the ranking estimator
    and 10-fold cross-validation is used.

    The names of the selected columns are printed and returned as a pandas
    Index.
    """
    # 'bool' is included because pandas >= 2.0 makes get_dummies return
    # boolean indicator columns; without it every one-hot feature would be
    # silently filtered out here.
    candidates=df.select_dtypes(include=['int64','float64','uint8','bool'])
    # Non-inplace dropna: avoids mutating a derived frame in place.
    candidates=candidates.dropna(axis=1)
    all_X=candidates.drop(columns=['PassengerId','Survived'])
    all_y=candidates['Survived']
    rf=RandomForestClassifier(random_state=1)
    selector=RFECV(rf,cv=10)
    selector.fit(all_X,all_y)
    optimized_columns=all_X.columns[selector.support_]
    print(optimized_columns)
    return optimized_columns
    

In [154]:
# Run feature elimination on the processed training frame; select_features
# prints the chosen column names itself.
optimized_columns=select_features(train)

Index(['Title_Mr', 'Sex_female', 'Sex_male', 'Pclass_3', 'Fare_scaled'], dtype='object')


In [155]:
# Number of features kept by RFECV.
len(optimized_columns)

5

In [156]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

In [157]:
def select_model(df,features):
    """Grid-search three classifier families on the given feature columns.

    For each of LogisticRegression, KNeighborsClassifier and
    RandomForestClassifier a GridSearchCV with 10-fold cross-validation is
    run over the listed hyperparameters, using df[features] as inputs and
    df['Survived'] as the target. The name of each model is printed as its
    search starts.

    Returns the list of model dictionaries, each extended with
    'best_params', 'best_score' and 'best_estimator' from its search.
    """
    all_X=df[features]
    all_y=df['Survived']

    models_list=[
    {
    "name": "LogisticRegression",
    "estimator": LogisticRegression(),
    "hyperparameters":
        {
            "solver": ["newton-cg", "lbfgs", "liblinear"]
        }
    },
    {
    "name": "KNeighborsClassifier",
    "estimator": KNeighborsClassifier(),
    "hyperparameters":
        {
           "n_neighbors": range(1,20,2),
            "weights": ["distance", "uniform"],
            "algorithm": ["ball_tree", "kd_tree", "brute"],
            "p": [1,2]
        }
    },
    {
    "name": "RandomForestClassifier",
    # Seeded so that repeated searches rank the forest configurations the
    # same way (same seed as the forest used in select_features).
    "estimator": RandomForestClassifier(random_state=1),
    "hyperparameters":
        {
            "n_estimators": [4, 6, 9],
            "criterion": ["entropy", "gini"],
            "max_depth": [2, 5, 10],
            "max_features": ["log2", "sqrt"],
            "min_samples_leaf": [1, 5, 8],
            "min_samples_split": [2, 3, 5]
        }
    }
    ]

    for model in models_list:
        print(model['name'])
        grid=GridSearchCV(model['estimator'],param_grid=model['hyperparameters'],cv=10)
        grid.fit(all_X,all_y)
        # Store the search outcome next to the model's configuration.
        model['best_params']=grid.best_params_
        model['best_score']=grid.best_score_
        model['best_estimator']=grid.best_estimator_

    return models_list

In [158]:
# Grid-search all three model families on the RFECV-selected columns.
models_list=select_model(train,optimized_columns)

LogisticRegression
KNeighborsClassifier
RandomForestClassifier


In [159]:
# Inspect the best parameters / scores recorded for each model family.
models_list

[{'name': 'LogisticRegression',
  'estimator': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                     warm_start=False),
  'hyperparameters': {'solver': ['newton-cg', 'lbfgs', 'liblinear']},
  'best_params': {'solver': 'newton-cg'},
  'best_score': 0.7889762796504369,
  'best_estimator': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                     intercept_scaling=1, l1_ratio=None, max_iter=100,
                     multi_class='auto', n_jobs=None, penalty='l2',
                     random_state=None, solver='newton-cg', tol=0.0001, verbose=0,
                     warm_start=False)},
 {'name': 'KNeighborsClassifier',
  'estimator': KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='

In [160]:
def save_submission_file(model,cols,filename='submissions.csv',data=None):
    """Predict on the holdout data and write the predictions to a CSV file.

    Parameters
    ----------
    model : fitted estimator exposing predict()
    cols : column names to feed to the model
    filename : path of the CSV file to write
    data : frame to predict on; it must contain the columns in `cols` and a
        'PassengerId' column. Defaults to the notebook-level `holdout`
        frame when omitted.

    The file has two columns, 'PassengerId' and 'Survived', and no index.
    """
    if data is None:
        # Backward-compatible default: use the global holdout frame.
        data=holdout
    predictions=model.predict(data[cols])
    # Header spelled 'PassengerId' to match the data column it comes from
    # and the other submission file written in this notebook.
    submissions=pd.DataFrame({'PassengerId':data['PassengerId'],
                             'Survived':predictions})
    submissions.to_csv(filename,index=False)

In [161]:
# Random forest with 100 trees and otherwise default settings.
# The pasted estimator repr was trimmed to the arguments that matter:
# `min_impurity_split` and `max_features='auto'` are rejected by newer
# scikit-learn releases ('sqrt' is what 'auto' meant for classifiers), and a
# fixed seed makes the resulting submission reproducible.
# NOTE(review): despite the name, this is a freshly configured forest, not one
# of the `best_estimator` entries stored in models_list - confirm intent.
best_model=RandomForestClassifier(n_estimators=100, criterion='gini',
                                  max_features='sqrt', random_state=1)

In [162]:
# Fit on the RFECV-selected columns of the full training set, then write the
# holdout predictions to submissions_2.csv.
best_model.fit(train[optimized_columns],train['Survived'])
save_submission_file(best_model,optimized_columns,filename='submissions_2.csv')

In [50]:
# NOTE(review): this cell's execution count (50) is out of sequence with the
# cells around it, and its saved output lists columns that differ from what
# select_features printed above - the output looks stale; re-run or remove.
optimized_columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'isalone'], dtype='object')

In [51]:
# NOTE(review): execution count (51) is out of sequence and the saved output
# shows 44 columns including raw ones (Name, Sex, Ticket, ...), unlike the
# 39-column processed frame shown earlier - the output looks stale.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 44 columns):
 #   Column                      Non-Null Count  Dtype   
---  ------                      --------------  -----   
 0   PassengerId                 891 non-null    int64   
 1   Survived                    891 non-null    int64   
 2   Pclass                      891 non-null    int64   
 3   Name                        891 non-null    object  
 4   Sex                         891 non-null    object  
 5   Age                         891 non-null    float64 
 6   SibSp                       891 non-null    int64   
 7   Parch                       891 non-null    int64   
 8   Ticket                      891 non-null    object  
 9   Fare                        891 non-null    float64 
 10  Embarked                    891 non-null    object  
 11  Age_categories              891 non-null    category
 12  Fare_categories             891 non-null    category
 13  Title               

In [163]:
# Explicit list of all 37 engineered feature columns, used here instead of
# the smaller RFECV-selected subset.
new_optimized_columns=['Age_categories_Missing', 'Age_categories_Infant',
       'Age_categories_Child', 'Age_categories_Teenager',
       'Age_categories_Young Adult', 'Age_categories_Adult',
       'Age_categories_Senior', 'Pclass_1', 'Pclass_2', 'Pclass_3',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S',
       'SibSp_scaled', 'Parch_scaled', 'Fare_scaled', 'Fare_categories_0-12',
       'Fare_categories_12-50', 'Fare_categories_50-100',
       'Fare_categories_100+', 'Title_Master', 'Title_Miss', 'Title_Mr',
       'Title_Mrs', 'Title_Officer', 'Title_Royalty', 'Cabin_type_A',
       'Cabin_type_B', 'Cabin_type_C', 'Cabin_type_D', 'Cabin_type_E',
       'Cabin_type_F', 'Cabin_type_G', 'Cabin_type_T', 'Cabin_type_Unknown']
# Training matrix and target used by the grid search that follows.
all_X=train[new_optimized_columns]
all_y=train['Survived']

In [172]:
# Tune a seeded random forest on the full feature set with 10-fold CV.
rf=RandomForestClassifier(random_state=1)
hyperparameters={'criterion':['entropy','gini'],
                 'max_depth':[5,10],
                 'max_features':['log2','sqrt'],
                 'min_samples_leaf':[1,5],
                 'min_samples_split':[3,5],
                 'n_estimators':[6,9]
                }

grid=GridSearchCV(rf,param_grid=hyperparameters,cv=10)
grid.fit(all_X,all_y)
best_params=grid.best_params_
best_score=grid.best_score_

# GridSearchCV (refit=True by default) has already refitted the winning
# configuration on all of all_X / all_y, so no extra fit call is needed.
best_rf=grid.best_estimator_

# Line the holdout columns up with the training matrix: same columns, same
# order. Columns that never occur in the holdout set (Cabin_type_T) are
# created and filled with 0. Simply appending the missing column would leave
# the features in a different order from the one the model was trained on.
holdout_no_id = holdout.reindex(columns=all_X.columns, fill_value=0)
predictions=best_rf.predict(holdout_no_id)
submission=pd.DataFrame({'PassengerId':holdout['PassengerId'],
                         'Survived':predictions
                        })
submission.to_csv('submission_3.csv',index=False)


In [168]:
# Check the dtypes / null counts of the full training feature matrix.
# NOTE(review): execution count (168) is lower than the preceding cell's (172),
# i.e. the cells were run out of order.
all_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 37 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Age_categories_Missing      891 non-null    uint8  
 1   Age_categories_Infant       891 non-null    uint8  
 2   Age_categories_Child        891 non-null    uint8  
 3   Age_categories_Teenager     891 non-null    uint8  
 4   Age_categories_Young Adult  891 non-null    uint8  
 5   Age_categories_Adult        891 non-null    uint8  
 6   Age_categories_Senior       891 non-null    uint8  
 7   Pclass_1                    891 non-null    uint8  
 8   Pclass_2                    891 non-null    uint8  
 9   Pclass_3                    891 non-null    uint8  
 10  Sex_female                  891 non-null    uint8  
 11  Sex_male                    891 non-null    uint8  
 12  Embarked_C                  891 non-null    uint8  
 13  Embarked_Q                  891 non

In [170]:
# How many training rows carry the Cabin_type_T indicator.
all_X['Cabin_type_T'].value_counts()

0    890
1      1
Name: Cabin_type_T, dtype: int64