In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import cross_val_predict,cross_val_score

In [2]:
df = pd.read_csv(r'C:\Users\Lenovo\Desktop\34\Kaggle\Titanic\train.csv')
df_test = pd.read_csv(r'C:\Users\Lenovo\Desktop\34\Kaggle\Titanic\test.csv')
passId = df_test["PassengerId"]

In [3]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# How to fill age class based on prefix 
def prefix(df):
    tempNew = df["Name"].str.split('.', expand = True)[0]
    df["addPrefix"] = (tempNew.str.split(',', expand = True)[1])
    df["addPrefix"].replace({"Ms":"Miss"}, regex = True, inplace = True)
    return df

In [5]:
df = prefix(df)
df_test = prefix(df_test)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
 12  addPrefix    891 non-null    object 
dtypes: float64(2), int64(5), object(6)
memory usage: 90.6+ KB


In [7]:
df["Cabin"].fillna("N", inplace= True)
df_test["Cabin"].fillna("N", inplace= True)

In [8]:
class FillAgeNaN(BaseEstimator, TransformerMixin):
    def fit(self,X, y = None):
        return self
    def transform(self,X, y = None):
        X["Age"] = X.groupby("addPrefix")["Age"].transform(lambda x: x.fillna(x.mean()))
        return X.drop(["addPrefix"], axis = 1)

In [9]:
age_ix,fare_ix,sex_ix,parch_ix,sibsp_ix = 0, 1, 2, 3, 4
class CombinedAttributesAdderNUMS(BaseEstimator, TransformerMixin):
    def __init__ (self, addAgeModify = True, addFareModify = True, addFarePerPerson = True, addFareAge = True):
        self.addAgeModify = addAgeModify
        self.addFareModify = addFareModify
        self.addFarePerPerson = addFarePerPerson
        self.addFareAge = addFareAge
    def fit(self, X, y=None):
        return self
    def transform(self, X, y = None):
        X_new = X      
        if self.addAgeModify:
            addAgeModify = X[:, age_ix] // 5 * 5
            X_new= np.c_[X_new,addAgeModify]
        if self.addFareModify:
            addFareModify = X[:, fare_ix] // 25 * 5
            X_new= np.c_[X_new, addFareModify]
        if self.addFarePerPerson:
            addFarePerPerson = (X[:, fare_ix]/(X[:, parch_ix]+X[:, sibsp_ix]+1))//25*5
            X_new= np.c_[X_new, addFarePerPerson]
        if self.addFareAge:
            addFareAge = (X[:,fare_ix]*X[:, age_ix])//1500*6
            X_new= np.c_[X_new, addFareAge]
        return X_new

In [10]:
class DropAttributesNums(BaseEstimator, TransformerMixin):
        def fit(self, X, y = None):
            return self
        def transform(self, X, y= None):
            return X[:, 4:]

In [24]:
name_ix, cabin_ix = 0,1
class CombinedAttributesAdderSTR(BaseEstimator, TransformerMixin):
    def __init__(self, addPrefix = True, addCabinSpace = True, addPrefixCabin=True, addRelativesModify = True):
        self.addPrefix = addPrefix
        self.addRelativesModify = addRelativesModify
        self.addCabinSpace = addCabinSpace
        self.addPrefixCabin = addPrefixCabin
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        X_new = X.copy()
        if self.addPrefix:
            tempNew = X["Name"].str.split('.', expand = True)[0]
            X_new["addPrefix"] = (tempNew.str.split(',', expand = True)[1])
            X_new["addPrefix"].replace({"Ms":"Miss"}, regex = True, inplace = True)
        if self.addCabinSpace:
            X_new["CabinSpace"] = X["Cabin"].astype(str).str[0]
        if self.addRelativesModify:
            addRelativesModify = X["SibSp"]+ X["Parch"]
            addRelativesModify[addRelativesModify == 0] = 0
            addRelativesModify[(addRelativesModify > 0 )& (addRelativesModify < 3)] = 1
            addRelativesModify[addRelativesModify == 3] = 2
            addRelativesModify[(addRelativesModify) > 3 & (addRelativesModify < 7)] = 3
            addRelativesModify[addRelativesModify >= 7] = 4 
            X_new= np.c_[X_new, addRelativesModify]

            
        return X_new

In [12]:
class DropAttributesStr(BaseEstimator, TransformerMixin):
        def fit(self, X, y = None):
            return self
        def transform(self, X, y= None):
            return X[:, 2:]

In [13]:
class FrequentImputer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.most_frequent_ = pd.Series([X[c].value_counts().index[0] for c in X],
                                        index=X.columns)
        return self
    def transform(self, X, y=None):
        return X.fillna(self.most_frequent_)

In [14]:
str_pipeline = Pipeline([
    ('imputer', FrequentImputer()),
    ("attrb_adder", CombinedAttributesAdderSTR(addPrefix = True, 
                                               addCabinSpace = True, 
                                               addRelativesModify = True)),
    ("atrrb_drop", DropAttributesStr()),
    ("one_hot_encoder", OneHotEncoder(handle_unknown='ignore', sparse=False))
])

In [25]:
num_pipeline = Pipeline([
    ('fillAge',FillAgeNaN()),
    ('imputer', SimpleImputer(strategy="median")),
    ("attrb_adder", CombinedAttributesAdderNUMS(addAgeModify = True, 
                                                addFareModify = False, 
                                                addFarePerPerson = False, 
                                                addFareAge = False)),
    ("atrrb_drop", DropAttributesNums()),
    ("scale", MinMaxScaler())
    
])

In [26]:
df_nums = df[["Age","Fare","SibSp","Parch","addPrefix"]]
df_str = df[["Name","Cabin","Sex","SibSp","Parch","Pclass","Embarked"]]

full_pipeline = ColumnTransformer([
    ("str", str_pipeline, list(df_str)),
    ("num", num_pipeline, list(df_nums)),
])

In [27]:
X = full_pipeline.fit_transform(df.drop(["Survived"], axis = 1))
Y = df["Survived"]
X_test_real = full_pipeline.transform(df_test)


In [28]:
X_test_real.shape,X.shape

((418, 50), (891, 50))

In [29]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42, test_size = 0.2)

In [30]:
def model_check(clf, X = X_train, Y=Y_train):
    clf.fit(X, Y)
    Y_pred = cross_val_predict(clf, X, Y, cv = 10)
    print(classification_report(Y, Y_pred))
    print("The accuracy score", cross_val_score(clf, X,Y, cv = 10).mean())
    print("F1 Score:" , f1_score(Y, Y_pred))
    print("The precision score", precision_score(Y,Y_pred))
    print("The recall score", recall_score(Y,Y_pred))
    return Y_pred,clf

In [31]:
def model_pred(clf, X = X_test, Y=Y_test):
    Y_pred = cross_val_predict(clf, X, Y, cv = 10)
    print(classification_report(Y, Y_pred))
    print("The accuracy score", cross_val_score(clf, X,Y, cv = 10).mean())
    print("F1 Score:" , f1_score(Y, Y_pred))
    print("The precision score", precision_score(Y,Y_pred))
    print("The recall score", recall_score(Y,Y_pred))
    return Y_pred,clf


In [32]:
#Logistic Regression 
from sklearn.linear_model import LogisticRegression
clf_log = LogisticRegression()
Y_pred, clf_log = model_check(clf_log)
Y_pred_test = model_pred(clf_log)
Y_pred_test = model_check(LogisticRegression(), X, Y)

              precision    recall  f1-score   support

           0       0.84      0.87      0.86       444
           1       0.78      0.72      0.75       268

    accuracy                           0.82       712
   macro avg       0.81      0.80      0.80       712
weighted avg       0.82      0.82      0.82       712

The accuracy score 0.8174295774647888
F1 Score: 0.7490347490347491
The precision score 0.776
The recall score 0.7238805970149254
              precision    recall  f1-score   support

           0       0.84      0.81      0.83       105
           1       0.74      0.78      0.76        74

    accuracy                           0.80       179
   macro avg       0.79      0.80      0.79       179
weighted avg       0.80      0.80      0.80       179

The accuracy score 0.7990196078431373
F1 Score: 0.7631578947368421
The precision score 0.7435897435897436
The recall score 0.7837837837837838
              precision    recall  f1-score   support

           0       0

In [34]:
from sklearn.neural_network import MLPClassifier
clf_mlp = MLPClassifier(max_iter=10000, solver = 'lbfgs')
Y_pred, clf_mlp = model_check(clf_mlp)
Y_pred_test = model_pred(clf_mlp)
Y_pred_test = model_check(clf_mlp, X, Y)

              precision    recall  f1-score   support

           0       0.82      0.87      0.84       444
           1       0.76      0.68      0.71       268

    accuracy                           0.80       712
   macro avg       0.79      0.77      0.78       712
weighted avg       0.79      0.80      0.79       712

The accuracy score 0.7838615023474178
F1 Score: 0.7140039447731756
The precision score 0.7573221757322176
The recall score 0.6753731343283582
              precision    recall  f1-score   support

           0       0.82      0.80      0.81       105
           1       0.72      0.74      0.73        74

    accuracy                           0.78       179
   macro avg       0.77      0.77      0.77       179
weighted avg       0.78      0.78      0.78       179

The accuracy score 0.7549019607843137
F1 Score: 0.7333333333333334
The precision score 0.7236842105263158
The recall score 0.7432432432432432
              precision    recall  f1-score   support

       

In [89]:
clf = LogisticRegression()
clf.fit(X,Y)
Y_pred_test = clf.predict(X_test_real)

In [90]:
output = pd.DataFrame({'PassengerId': passId,'Survived': Y_pred_test})
output.to_csv('submission.csv', index=False)
output.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,1
2,894,0
3,895,0
4,896,1
