In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.feature_selection import mutual_info_regression
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import Pipeline
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

import warnings 
# Suppress every warning for the rest of the session (this also hides
# deprecation warnings from pandas / scikit-learn).
warnings.filterwarnings("ignore")
# Module-level iterative imputer with verbose output enabled.
# NOTE(review): `imp` is not referenced anywhere in the visible code — confirm
# whether it is still needed.
imp = IterativeImputer(verbose=2)

def saveFile(passengerIds, scores):
    """Write the predictions to ``submission.csv`` in the working directory.

    Parameters
    ----------
    passengerIds : object supporting positional ``.iloc`` access
        Passenger identifiers; the i-th identifier is paired with the i-th
        score.
    scores : iterable
        Predicted values, one per passenger.

    Prints summary statistics of the resulting table, writes the CSV without
    the index column, and prints ``Finished``.
    """
    # Pair each prediction with the identifier found at the same position.
    rows = [
        (passengerIds.iloc[position], prediction)
        for position, prediction in enumerate(scores)
    ]
    submission = pd.DataFrame(rows, columns=['PassengerId', 'Survived'])
    print(submission.describe())
    submission.to_csv("submission.csv", index=False)
    print("Finished")
    
def gridSearch(x, y):
    """Grid-search two tree-based classifiers and print the best result.

    A one-step pipeline is searched over two parameter grids: a
    ``RandomForestClassifier`` with varying ``min_samples_split`` and a
    ``DecisionTreeClassifier`` with varying ``max_depth``, using 10-fold
    cross-validation.

    Parameters
    ----------
    x : feature matrix passed to ``GridSearchCV.fit``
    y : target values passed to ``GridSearchCV.fit``

    Prints the best parameter combination and its cross-validated score.
    """
    # The SVC is only a placeholder: both grids replace the 'clf' step.
    pipe = Pipeline([('clf', svm.SVC())])
    search_space = [
        {
            'clf': [RandomForestClassifier()],
            "clf__min_samples_split": range(2, 60, 2),
        },
        {
            'clf': [DecisionTreeClassifier()],
            "clf__max_depth": range(3, 60, 2),
        },
    ]
    # error_score='raise' surfaces fit failures instead of scoring them as NaN.
    search = GridSearchCV(pipe, search_space, cv=10, verbose=1, error_score='raise')
    # Fit on the arguments, not on module-level globals.
    search.fit(x, y)
    print(search.best_params_)
    print(search.best_score_)

def mutual_info(df, target):
    """Return mutual-information scores of each column of ``df`` against ``target``.

    Object-typed columns are replaced by integer codes (``factorize``) on a
    copy, so the caller's frame is left untouched. Columns whose dtype
    compares equal to ``int`` are flagged as discrete for
    ``mutual_info_regression``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table.
    target : array-like
        Target values, aligned with the rows of ``df``.

    Returns
    -------
    pandas.Series
        Scores named ``"MI Scores"``, indexed by column name and sorted in
        descending order.
    """
    # Work on a copy: encoding must not leak back into the caller's data.
    features = df.copy()
    for colname in features.select_dtypes("object"):
        features[colname], _ = features[colname].factorize()

    discrete_features = features.dtypes == int
    mi_scores = mutual_info_regression(
        features, target, discrete_features=discrete_features
    )
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=features.columns)
    return mi_scores.sort_values(ascending=False)

def fillna(df):
    """Impute missing values in the Age, Embarked, Fare and Cabin columns.

    ``Age`` and ``Fare`` are filled with their column mean; ``Embarked`` and
    ``Cabin`` with their most frequent value. The frame is modified in place
    and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Age``, ``Embarked``, ``Fare`` and ``Cabin``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the four columns filled.

    Raises
    ------
    KeyError, IndexError
        If ``Embarked`` (KeyError) or ``Cabin`` (IndexError) holds no
        non-missing value, so no mode exists.
    """
    # Assign the filled column back explicitly: `df.col.fillna(..., inplace=True)`
    # operates on an intermediate Series and does not reliably update `df`.
    df["Age"] = df["Age"].fillna(df["Age"].mean())
    df["Embarked"] = df["Embarked"].fillna(df["Embarked"].mode()[0])
    df["Fare"] = df["Fare"].fillna(df["Fare"].mean())
    df["Cabin"] = df["Cabin"].fillna(df["Cabin"].mode().iat[0])
    return df

def factorize(df):
    """Return ``df`` with its categorical columns one-hot encoded.

    Encoding is delegated to ``pandas.get_dummies`` with ``drop_first=True``,
    so the first level of every encoded column is dropped. The input frame
    itself is not modified.
    """
    encoded = pd.get_dummies(df, drop_first=True)
    return encoded

def prepare(df):
    """Engineer features, impute missing values and one-hot encode ``df``.

    The input frame is modified in place (feature columns are added, missing
    values are filled, ``Name`` is dropped); the returned frame is the new,
    one-hot encoded table produced by ``factorize``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Cabin``, ``Age``, ``SibSp``, ``Parch``,
        ``Name``, ``Ticket``, ``Sex``, ``Fare`` and ``Embarked``.

    Returns
    -------
    pandas.DataFrame
        The encoded feature table.

    Notes
    -----
    Age bin edges are derived from the maximum age of *this* frame, so two
    frames with different maximum ages are binned with different edges.
    """
    # Record missingness before anything is imputed.
    df["Cabin_was_missing"] = df["Cabin"].isnull()
    df["Age_was_missing"] = df["Age"].isnull()
    df["Family_size"] = df["SibSp"] + df["Parch"]
    # First whitespace-separated token after the comma in the name.
    df["Name_Title"] = df["Name"].apply(lambda name: name.split(",")[1].split()[0])
    # Text before the first space for non-numeric tickets; "N/A" for purely
    # numeric ones.
    df["Ticket_code"] = df["Ticket"].apply(
        lambda ticket: "N/A" if ticket.isdigit() else ticket.split(' ')[0]
    )
    # Group means are computed on the raw ages, i.e. before imputation.
    df["MeanAgePerTitle"] = df.groupby("Name_Title")["Age"].transform("mean")
    # Despite its name, this column holds the mean age per Sex.
    df["MeanSexPerTitle"] = df.groupby("Sex")["Age"].transform("mean")
    max_age = df["Age"].max()
    # Clamp the bin width to at least 1: a maximum age below 5 would otherwise
    # yield a zero step, which range() rejects.
    bin_width = max(1, int(max_age / 5))
    ageBins = range(0, int(max_age), bin_width)
    df['Age_binned'] = np.searchsorted(ageBins, df["Age"].values)
    df["MeanFareByAgeBin"] = df.groupby("Age_binned")["Fare"].transform("mean")
    df = fillna(df)
    df["Cabin_FirstLetter"] = df["Cabin"].str[0]
    # Ages are complete after imputation, so the integer cast is safe.
    df["Age"] = df["Age"].astype(int)
    df.drop("Name", axis=1, inplace=True)
    df = factorize(df)
    return df
    

# Load the Kaggle Titanic data. Passing the paths straight to read_csv lets
# pandas handle opening and decoding, instead of relying on the locale-default
# encoding of a text-mode file handle.
titan = "/kaggle/input/titanic/train.csv"
train = pd.read_csv(titan)
test = pd.read_csv("/kaggle/input/titanic/test.csv")

# Separate the identifier and the target from the features.
train.drop("PassengerId", axis=1, inplace=True)
train_y = train.pop("Survived")
testPassengerIds = test.pop("PassengerId")

train = prepare(train)
test = prepare(test)

# Give the test set exactly the training columns: columns only present in
# test are dropped, columns only present in train are added as NaN and then
# filled with False.
test, train = test.align(train, join='right', axis=1)
test.fillna(False, inplace=True)

# Optional exploration steps, disabled by default:
#mutual_info(train,train_y)
#gridSearch(train,train_y)

clf = RandomForestClassifier(min_samples_split=4)
clf.fit(train, train_y)
saveFile(testPassengerIds, clf.predict(test))

       PassengerId    Survived
count   418.000000  418.000000
mean   1100.500000    0.375598
std     120.810458    0.484857
min     892.000000    0.000000
25%     996.250000    0.000000
50%    1100.500000    0.000000
75%    1204.750000    1.000000
max    1309.000000    1.000000
Finished
