In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import xgboost
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.model_selection import RandomizedSearchCV


In [None]:
# Load the Titanic training set from the Kaggle input directory.
# (A stray "/" line that followed this statement was removed: it is a SyntaxError.)
train = pd.read_csv('/kaggle/input/titanic/train.csv')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Count missing values per column.
train.isna().sum()

In [None]:
# Kernel density estimate of the raw Age column.
sns.kdeplot(train['Age'])

The age distribution is approximately normal, so we will replace the missing values with the median; alternatively, we can use a KNN imputer to do so.

In [None]:
# Density of Age after filling missing values with the column median
# (plot only; `train` itself is not modified here).
sns.kdeplot(train['Age'].fillna(train['Age'].median()))

In [None]:
# Density of Age after KNN imputation (plot only; `train` is not modified).
# NOTE(review): the imputer only sees the single Age column, so rows with a
# missing Age have no observed feature to measure distance on; KNNImputer
# presumably falls back to the column mean in that case -- confirm.
imputer = KNNImputer(n_neighbors=2)
age_2d = train[['Age']].to_numpy()  # shape (n_rows, 1), as the imputer expects 2-D input
sns.kdeplot(imputer.fit_transform(age_2d))


There is not much difference between KNN imputation and median imputation, so we can choose either of the two.

In [None]:
# Replace missing ages with the median age of the training set.
age_median = train['Age'].median()
train['Age'] = train['Age'].fillna(age_median)

In [None]:
# Columns the author treats as having no impact on the `Survived` target;
# `dropcol` is reused later to drop the same columns from the test frame.
dropcol = {'PassengerId', 'Name', 'Ticket', 'Cabin'}
# `columns=` already selects the axis, so no separate `axis` argument is needed.
train = train.drop(columns=dropcol)

In [None]:
# Frequency of each embarkation port (NaN excluded by default).
train['Embarked'].value_counts()

For categorical imputation, we will replace NaN values with the most frequent category.

In [None]:
# Fill missing Embarked values with 'S' (per the narrative, the most frequent category).
train['Embarked']=train['Embarked'].fillna('S')

In [None]:
# Re-check missing values per column after imputation.
train.isna().sum()

In [None]:
# Summary statistics of the numeric columns.
train.describe()

In [None]:
# Class balance of the target variable.
train['Survived'].value_counts()

We will encode the categorical variables.

In [None]:
# Integer codes for the two categorical columns; both mappings are reused
# later to encode the test frame the same way.
mapp = dict(male=1, female=2)
mape = dict(S=1, C=2, Q=3)
train = train.assign(
    Sex=train['Sex'].map(mapp),
    Embarked=train['Embarked'].map(mape),
)

Split the data into independent and dependent variables.

In [None]:
# Target vector and feature matrix (every column except the target).
y = train['Survived']
x = train.drop(columns=['Survived'])

In [None]:
# Hold out 35% of the rows for evaluation; random_state fixes the shuffle.
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.35,random_state=42)

In [None]:
from sklearn.feature_selection import mutual_info_classif

# Estimate the mutual information between each training feature and the target.
# random_state is fixed because the estimator is randomized; without it the
# ranking changes between runs.
# The index comes from x_train (the frame actually scored) so labels and
# scores cannot drift apart.
mutual_info = pd.Series(
    mutual_info_classif(x_train, y_train, random_state=42),
    index=x_train.columns,
)
# Display features from most to least informative.
mutual_info.sort_values(ascending=False)

We will scale the features because the Fare values are the largest.

In [None]:
# Keep only the selected feature columns in both splits; `col` is reused
# later to select the same columns from the test frame.
col = ['Sex', 'Fare', 'Pclass', 'Parch']
x_train, x_test = x_train[col], x_test[col]

In [None]:
# Fit the scaler on the training split only, then standardize that split.
# The result is wrapped in a new DataFrame with the original column names
# (and a fresh default index).
sc = StandardScaler()
sc.fit(x_train)
x_train_res = pd.DataFrame(sc.transform(x_train), columns=x_train.columns)

In [None]:
# Apply the scaler fitted on the training split to the held-out split (no refit).
x_test_res=pd.DataFrame(sc.transform(x_test),columns=x_test.columns)

In [None]:
# Baseline model: logistic regression with default settings on the scaled features.
lr=LogisticRegression()
lr.fit(x_train_res,y_train)

In [None]:
from sklearn.metrics import classification_report,accuracy_score

In [None]:
# Evaluate the logistic-regression baseline on the held-out split.
# scikit-learn metrics take (y_true, y_pred) in that order; passing them
# swapped transposes the confusion matrix and exchanges precision and recall.
pred = lr.predict(x_test_res)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

In [None]:
# Compare four classifiers with default hyperparameters on the held-out split.
svm = SVC()
knn = KNeighborsClassifier()
# Seed the forest so the comparison (and the later search that reuses `rf`)
# is reproducible across runs.
rf = RandomForestClassifier(random_state=42)
boost = xgboost.XGBClassifier()

# Names and estimators are paired explicitly so the labels cannot drift out
# of sync with the model list.
model_names = ['svm', 'knn', 'rf', 'xgboost']
models = [svm, knn, rf, boost]

scores = {}
for model_name, model in zip(model_names, models):
    model.fit(x_train_res, y_train)
    scores[model_name] = accuracy_score(y_test, model.predict(x_test_res))

# Held-out accuracy per model, indexed by model name.
acc = pd.Series(scores)

acc

With default hyperparameters, XGBoost gives the highest accuracy. We now perform hyperparameter tuning on the RandomForestClassifier model using RandomizedSearchCV.

In [None]:
# Search space for the random forest.
params = {
    'n_estimators': [100, 150, 200, 250, 300],
    'max_depth': [5, 6, 7],
    'criterion': ['gini', 'entropy'],
    'ccp_alpha': [0, 0.1, 0.2],
}
# Randomized search with 5-fold CV, scored by ROC AUC.
# random_state fixes which parameter combinations are sampled so the search
# is reproducible across runs.
random_search = RandomizedSearchCV(
    rf,
    param_distributions=params,
    cv=5,
    n_iter=80,
    n_jobs=-1,
    verbose=3,
    scoring='roc_auc',
    random_state=42,
)
random_search.fit(x_train_res, y_train)

In [None]:
# Show the best random-forest configuration found by the search.
random_search.best_estimator_

In [None]:
# Build the tuned forest from the search result instead of hand-copied
# values, so it always matches the search that was just run.
# random_state makes the refit reproducible.
rfmodel = RandomForestClassifier(**random_search.best_params_, random_state=42)
rfmodel.fit(x_train_res, y_train)

In [None]:
# Held-out accuracy of the tuned random forest.
accuracy_score(y_test,rfmodel.predict(x_test_res))

In [None]:
# Search space for XGBoost. The keys are passed through to the booster
# exactly as written.
params2 = {
    'eta': [0.1, 0.15, 0.2, 0.25, 0.3],
    'max_depth': [4, 5, 6, 7, 8],
    'gamma': [0.1, 0.2, 0.3],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8],
    'lambda': [1, 2, 3, 4],
    'alpha': [0, 1, 2, 4],
    'n_estimators': [100, 150, 200, 250],
}
# Randomized search with 5-fold CV, scored by ROC AUC.
# random_state fixes which parameter combinations are sampled so the search
# is reproducible across runs.
xgtuned = RandomizedSearchCV(
    boost,
    param_distributions=params2,
    scoring='roc_auc',
    cv=5,
    n_iter=100,
    verbose=3,
    n_jobs=-1,
    random_state=42,
)
xgtuned.fit(x_train_res, y_train)

In [None]:
# Show the best XGBoost configuration found by the search.
xgtuned.best_estimator_

In [None]:
# Build the tuned booster from the search result instead of hand-copied
# values: the copied set omitted searched parameters (subsample, lambda) and
# goes stale whenever the search is re-run.
boosttuned = xgboost.XGBClassifier(**xgtuned.best_params_)
boosttuned.fit(x_train_res, y_train)

In [None]:
# Held-out accuracy of the tuned XGBoost model.
accuracy_score(y_test,boosttuned.predict(x_test_res))

###### 

# Our model is ready.

In [None]:
# Load the Kaggle test set (the rows to predict for the submission) and preview it.
test=pd.read_csv('/kaggle/input/titanic/test.csv')

test.head()

In [None]:
# Reminder of the columns dropped from the training frame.
dropcol

In [None]:
# Drop the same columns from the test frame as were dropped from train.
# NOTE(review): this also removes PassengerId from `test`; the submission
# later takes its ids from gender_submission.csv instead.
test=test.drop(columns=dropcol,axis=1)

In [None]:
# Count missing values per column in the test frame.
test.isna().sum()

In [None]:
# Kernel density estimate of Age in the test frame.
sns.kdeplot(test['Age'])

In [None]:
# Impute missing test ages the same way as in training: with the median Age
# of the training set. Refitting a separate imputer on the test data made the
# two frames inconsistent and used test-set statistics.
# (train['Age'] was itself median-filled, which leaves its median unchanged.)
test['Age'] = test['Age'].fillna(train['Age'].median())

In [None]:
# Kernel density estimate of Fare in the test frame.
sns.kdeplot(test['Fare'])

In [None]:
# Fill missing test fares with the median Fare of the training data, so that
# preprocessing statistics come from training rows only rather than from the
# rows being predicted.
test['Fare'] = test['Fare'].fillna(train['Fare'].median())

In [None]:
# Reminder of the category-to-integer mappings used on the training frame.
mapp,mape

In [None]:
# Encode the test frame with the same mappings that were applied to train.
test = test.assign(
    Sex=test['Sex'].map(mapp),
    Embarked=test['Embarked'].map(mape),
)

In [None]:
# Inspect the encoded test frame.
test

In [None]:
# Keep only the feature columns the models were trained on, in the same order.
test=test[col]
test

In [None]:
# Standardize the test features with the scaler fitted on the training split (no refit).
test_res=pd.DataFrame(sc.transform(test),columns=test.columns)

In [None]:
# Predict survival for the test rows with the tuned XGBoost model.
# Note: this rebinds `pred`, which earlier held the logistic-regression predictions.
pred=boosttuned.predict(test_res)

In [None]:
# Load the sample submission file; its PassengerId column is used to index the submission.
subm=pd.read_csv('/kaggle/input/titanic/gender_submission.csv')
subm

In [None]:
# Assemble the submission: one `Survived` prediction per row, indexed by the
# PassengerId column of the sample submission.
# NOTE(review): relies on test.csv and gender_submission.csv listing
# passengers in the same order -- confirm.
sub = pd.DataFrame(data={'Survived': pred}, index=subm['PassengerId'])

sub.head()

In [None]:
# Write the submission to the working directory; the index is written as the first column.
sub.to_csv('submission.csv')