In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

# Step 1: Investigate the dataset

In [2]:
df = pd.read_csv("D:/projects/CodSoft_Projects/Task1/tested.csv")
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [8]:
df.shape

(418, 12)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB


In [7]:
df.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,0.363636,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.481622,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,0.0,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,0.0,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,1.0,3.0,39.0,1.0,0.0,31.5
max,1309.0,1.0,3.0,76.0,8.0,9.0,512.3292


#### So, from the above statistics we can conclude that: 
1- Roughly 36 percent of the passenger survived and the rest did not. 

2- The majority of the passenger (at least 50 percent) classes were in the 3rd class.

3- The majority of the passenger ages are between 27 and 39. 

4- Most of the passenger came without parents and children and without sibilings and spouses but there are some children and 76 year olds in the passengers.

5- We don't need the Columns (PassengerId, Name, Ticket) because they are specific to the passengers and won't give us useful informations.

6- We don't need the Column (Cabin) because it has a lot of missing values.

# Step 2: Clean the dataset

## Dropping Unnecessary columns

In [12]:
# Dropping unnecessary columns
df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis = 1, inplace = True)
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,34.5,0,0,7.8292,Q
1,1,3,female,47.0,1,0,7.0,S
2,0,2,male,62.0,0,0,9.6875,Q
3,0,3,male,27.0,0,0,8.6625,S
4,1,3,female,22.0,1,1,12.2875,S


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  418 non-null    int64  
 1   Pclass    418 non-null    int64  
 2   Sex       418 non-null    object 
 3   Age       332 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Fare      417 non-null    float64
 7   Embarked  418 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 26.2+ KB


## Imputing missing values

In [14]:
# Missing Values

missingAge = df['Age'].isna().sum()
missingFare = df['Fare'].isna().sum()

print("Null values in Age column: ", missingAge)
print("Null values in Fare column: ", missingFare)

Null values in Age column:  86
Null values in Fare column:  1


#### We can Impute the missing values in the Age column and the row where the Fare value is missing with the mean 

In [18]:
df.fillna(df.mean(), inplace=True)

  df.fillna(df.mean(), inplace=True)


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  418 non-null    int64  
 1   Pclass    418 non-null    int64  
 2   Sex       418 non-null    object 
 3   Age       418 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Fare      418 non-null    float64
 7   Embarked  418 non-null    object 
dtypes: float64(2), int64(4), object(2)
memory usage: 26.2+ KB


### Encoding the Sex column

In [35]:
df.Sex.unique()

array(['male', 'female'], dtype=object)

In [36]:
sex_mapping = {'male': 0, 'female': 1}

df['Sex'] = df['Sex'].map(sex_mapping)

df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,0,34.5,0,0,7.8292,Q
1,1,3,1,47.0,1,0,7.0,S
2,0,2,0,62.0,0,0,9.6875,Q
3,0,3,0,27.0,0,0,8.6625,S
4,1,3,1,22.0,1,1,12.2875,S


### Encoding the Embarked column

In [42]:
df.Embarked.unique()

array(['Q', 'S', 'C'], dtype=object)

In [48]:
from sklearn.preprocessing import OneHotEncoder

df['Embarked'] = df['Embarked'].astype('category')
df['Emb_new'] = df['Embarked'].cat.codes

enc = OneHotEncoder()

enc_data = pd.DataFrame(enc.fit_transform(df[['Emb_new']]).toarray())
  
# Merge with main
df1 = df.join(enc_data)
df1.drop(['Embarked', 0, 1, 2], axis=1, inplace=True)
df1.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Emb_new
0,0,3,0,34.5,0,0,7.8292,1
1,1,3,1,47.0,1,0,7.0,2
2,0,2,0,62.0,0,0,9.6875,1
3,0,3,0,27.0,0,0,8.6625,2
4,1,3,1,22.0,1,1,12.2875,2
5,0,3,0,14.0,0,0,9.225,2
6,1,3,1,30.0,0,0,7.6292,1
7,0,2,0,26.0,1,1,29.0,2
8,1,3,1,18.0,0,0,7.2292,0
9,0,3,0,21.0,2,0,24.15,2


In [49]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  418 non-null    int64  
 1   Pclass    418 non-null    int64  
 2   Sex       418 non-null    int64  
 3   Age       418 non-null    float64
 4   SibSp     418 non-null    int64  
 5   Parch     418 non-null    int64  
 6   Fare      418 non-null    float64
 7   Emb_new   418 non-null    int8   
dtypes: float64(2), int64(5), int8(1)
memory usage: 23.4 KB


# Step 3: Splitting the dataset and Building/Evaluating the models

In [37]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline

In [50]:
X = df1.drop(columns=['Survived'], axis=1)
y = df1['Survived']
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Emb_new
0,3,0,34.5,0,0,7.8292,1
1,3,1,47.0,1,0,7.0,2
2,2,0,62.0,0,0,9.6875,1
3,3,0,27.0,0,0,8.6625,2
4,3,1,22.0,1,1,12.2875,2


In [51]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)
len(X_train), len(X_test)

(334, 84)

# Model training

In [52]:
# Making a pipeline to train the data on different models
pipelines = {
    'lr':make_pipeline(LinearRegression()),
    'rf':make_pipeline(RandomForestClassifier()),
    'dt':make_pipeline(DecisionTreeClassifier()),
    'knn':make_pipeline(KNeighborsClassifier())
}

In [53]:
fit_models = {}
for algo , pipeline in pipelines.items():
    model = pipeline.fit(X_train , y_train)
    fit_models[algo] = model

# Evaluating the models

In [68]:
for algo , model in fit_models.items():
    pred = model.predict(X_test)
    print("The mean absolute error of {} model is: {}".format(algo , mean_absolute_error(y_test , pred)))
    print("The mean squared error of {} model is: {}".format(algo , mean_squared_error(y_test , pred)))
    print("The r2 score of {} model is: {}".format(algo , r2_score(y_test , pred)))
    print()

The mean absolute error of lr model is: 4.465555156212308e-15
The mean squared error of lr model is: 2.6031102502103977e-29
The r2 score of lr model is: 1.0

The mean absolute error of rf model is: 0.0
The mean squared error of rf model is: 0.0
The r2 score of rf model is: 1.0

The mean absolute error of dt model is: 0.0
The mean squared error of dt model is: 0.0
The r2 score of dt model is: 1.0

The mean absolute error of knn model is: 0.36904761904761907
The mean squared error of knn model is: 0.36904761904761907
The r2 score of knn model is: -0.5317647058823527



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


# The Random Forests and Decision Tree Models are the best models

### Saving our best model

In [69]:
import pickle

with open('D:/projects/CodSoft_Projects/Task1/Titanic.pkl' , 'wb') as f:
    pickle.dump(fit_models['rf'], f)