# Fitting a Naive Bayes Model

In [1]:
"""Import modules"""
# Install the warning filter BEFORE the third-party imports. Warnings emitted
# while pandas / scikit-learn are being imported are otherwise still printed,
# because the filter would not exist yet when they fire.
# NOTE(review): a blanket 'ignore' also hides useful diagnostics (e.g. pandas
# chained-assignment warnings); consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
from sklearn.naive_bayes import GaussianNB

  _nan_object_mask = _nan_object_array != _nan_object_array


First Step: Read .csv files

In [2]:
"""Read Csv files"""
# Load the training and test sets from the working directory in one pass;
# both files are read with pandas' default parsing options.
dfTrain, dfTest = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

Second Step: Handle Age data (a significant number of values are missing)

In [3]:
# Bucket Age into four labelled groups stored in a new "CAge" column.
# Intervals are right-closed: (0, 10], (10, 18], (18, 40], (40, inf).
# Rows with a missing Age get a missing CAge.
#
# The same open-ended bins are used for both frames. The previous upper edge,
# max(df["Age"]), used the Python builtin on a column containing NaN: it
# returns NaN when the first value is missing, and it produces non-increasing
# bins if no passenger is older than 40. It also gave train and test
# different edges.
age_bins = [0, 10, 18, 40, float("inf")]
age_labels = ["Child", "MYoung", "Young", "Older"]
dfTrain["CAge"] = pd.cut(dfTrain["Age"], bins=age_bins, labels=age_labels)
dfTest["CAge"] = pd.cut(dfTest["Age"], bins=age_bins, labels=age_labels)

Third Step: Handle Categorical Data

In [4]:
"""Make dummy variables for categorical data"""
# Columns to one-hot encode and the prefix each one gets; the binned "CAge"
# column is emitted under the "Age" prefix.
categorical_columns = ["Pclass", "Sex", "Embarked", "CAge"]
dummy_prefixes = ["Pclass", "Sex", "Embarked", "Age"]

# dummy_na=True adds an explicit "<prefix>_nan" indicator column for every
# encoded column, so missing values become a feature of their own.
dfTrain = pd.get_dummies(
    data=dfTrain,
    columns=categorical_columns,
    prefix=dummy_prefixes,
    dummy_na=True,
)
dfTest = pd.get_dummies(
    data=dfTest,
    columns=categorical_columns,
    prefix=dummy_prefixes,
    dummy_na=True,
)

"""Store the train outcomes for survived"""
Y_train = dfTrain["Survived"]

Fourth Step: Drop the data that is useless for the proposed model and handle a missing value

In [5]:
# Preview the first five rows of the encoded test frame.
dfTest.head(n=5)

Unnamed: 0,PassengerId,Name,Age,SibSp,Parch,Ticket,Fare,Cabin,Pclass_1.0,Pclass_2.0,...,Sex_nan,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan,Age_Child,Age_MYoung,Age_Young,Age_Older,Age_nan
0,892,"Kelly, Mr. James",34.5,0,0,330911,7.8292,,0,0,...,0,0,1,0,0,0,0,1,0,0
1,893,"Wilkes, Mrs. James (Ellen Needs)",47.0,1,0,363272,7.0,,0,0,...,0,0,0,1,0,0,0,0,1,0
2,894,"Myles, Mr. Thomas Francis",62.0,0,0,240276,9.6875,,0,1,...,0,0,1,0,0,0,0,0,1,0
3,895,"Wirz, Mr. Albert",27.0,0,0,315154,8.6625,,0,0,...,0,0,0,1,0,0,0,1,0,0
4,896,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",22.0,1,1,3101298,12.2875,,0,0,...,0,0,0,1,0,0,0,1,0,0


In [6]:
"""Store PassengerId"""
# Keep the passenger ids aside before the column is dropped from the feature
# matrix; they are needed again when the submission file is written.
submission = pd.DataFrame()
submission["PassengerId"] = dfTest["PassengerId"]

"""Ignore useless data"""
# Keep only the model inputs. The raw "Age" column is dropped as well, since
# the frames already carry the binned Age_* dummy columns.
# NOTE(review): Index.difference returns the remaining labels sorted by
# default, which is what keeps train and test columns in the same order.
dfTrain = dfTrain[dfTrain.columns.difference(["Age", "Survived", "PassengerId", "Name", "Ticket", "Cabin"])]
dfTest = dfTest[dfTest.columns.difference(["Age", "PassengerId", "Name", "Ticket", "Cabin"])]

"""handling a Nan value"""
# Fill missing test fares with the median fare of third-class passengers.
# This replaces the chained assignment `dfTest["Fare"].iloc[...] = ...`, which
# may write to a temporary copy (it never takes effect under pandas
# copy-on-write) and which passed index labels to the positional .iloc.
third_class_fare_median = dfTest.loc[dfTest["Pclass_3.0"] == 1, "Fare"].median()
dfTest = dfTest.assign(Fare=dfTest["Fare"].fillna(third_class_fare_median))
dfTrain.head()

Unnamed: 0,Age_Child,Age_MYoung,Age_Older,Age_Young,Age_nan,Embarked_C,Embarked_Q,Embarked_S,Embarked_nan,Fare,Parch,Pclass_1.0,Pclass_2.0,Pclass_3.0,Pclass_nan,Sex_female,Sex_male,Sex_nan,SibSp
0,0,0,0,1,0,0,0,1,0,7.25,0,0,0,1,0,0,1,0,1
1,0,0,0,1,0,1,0,0,0,71.2833,0,1,0,0,0,1,0,0,1
2,0,0,0,1,0,0,0,1,0,7.925,0,0,0,1,0,1,0,0,0
3,0,0,0,1,0,0,0,1,0,53.1,0,1,0,0,0,1,0,0,1
4,0,0,0,1,0,0,0,1,0,8.05,0,0,0,1,0,0,1,0,0


Fifth Step: Fit the Naive Bayes Model

In [7]:
"""Fit Model"""
# Gaussian Naive Bayes with default hyper-parameters, trained on the prepared
# feature frame and the stored "Survived" labels. fit() is left as the last
# expression so the fitted estimator is echoed as the cell output.
clf = GaussianNB()
clf.fit(X=dfTrain, y=Y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

Last Step: Save the results to a CSV file

In [8]:
"""Make a Csv with Results"""
# Build the submission table directly from the stored ids and the predictions.
# The earlier `submission.join(pred)` made this cell non-idempotent: on a second
# run `submission` already has a "Survived" column and join() raises on the
# overlapping name. It also relied on both frames sharing the same index.
submission = pd.DataFrame({
    "PassengerId": submission["PassengerId"],
    "Survived": clf.predict(dfTest),
})

# index=False keeps the row index out of the file, so it holds exactly the
# PassengerId and Survived columns.
submission.to_csv("submit.csv", index=False)
submission.head(10)


Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,0
5,897,0
6,898,0
7,899,0
8,900,1
9,901,0
