## Importing the Libraries


In [1]:
import pandas as pd
import numpy as np

## Importing the dataset


In [2]:
train_dataset = pd.read_csv('../Data/train.csv')
test_dataset = pd.read_csv('../Data/test.csv')

In [3]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Clean the dataset


In [4]:
train_dataset.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [5]:
train_dataset = train_dataset.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)
test_dataset = test_dataset.drop(["PassengerId", "Name", "Ticket", "Cabin"], axis=1)

In [6]:
train_dataset.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [7]:
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(missing_values=np.nan, strategy='median')

imputer.fit(train_dataset[["SibSp", "Parch", "Fare", "Age"]])

train_dataset[["SibSp", "Parch", "Fare", "Age"]] = imputer.transform(train_dataset[["SibSp", "Parch", "Fare", "Age"]]) 

In [8]:
test_dataset[["SibSp", "Parch", "Fare", "Age"]] = imputer.fit_transform(test_dataset[["SibSp", "Parch", "Fare", "Age"]]) 

In [9]:
train_dataset.Embarked.fillna("U", inplace=True)
test_dataset.Embarked.fillna("U", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_dataset.Embarked.fillna("U", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_dataset.Embarked.fillna("U", inplace=True)


In [10]:
train_dataset.head(10)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1.0,0.0,7.25,S
1,1,1,female,38.0,1.0,0.0,71.2833,C
2,1,3,female,26.0,0.0,0.0,7.925,S
3,1,1,female,35.0,1.0,0.0,53.1,S
4,0,3,male,35.0,0.0,0.0,8.05,S
5,0,3,male,28.0,0.0,0.0,8.4583,Q
6,0,1,male,54.0,0.0,0.0,51.8625,S
7,0,3,male,2.0,3.0,1.0,21.075,S
8,1,3,female,27.0,0.0,2.0,11.1333,S
9,1,2,female,14.0,1.0,0.0,30.0708,C


In [11]:
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       891 non-null    float64
 4   SibSp     891 non-null    float64
 5   Parch     891 non-null    float64
 6   Fare      891 non-null    float64
 7   Embarked  891 non-null    object 
dtypes: float64(4), int64(2), object(2)
memory usage: 55.8+ KB


## Encode Categorical Data


In [12]:
from sklearn.preprocessing import LabelEncoder

lable_encoder = LabelEncoder()

data_columns = ["Sex", "Embarked"]


for column in data_columns:
    train_dataset[column] = lable_encoder.fit_transform(train_dataset[column])
    test_dataset[column] = lable_encoder.fit_transform(test_dataset[column])

In [13]:
train_dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1.0,0.0,7.25,2
1,1,1,0,38.0,1.0,0.0,71.2833,0
2,1,3,0,26.0,0.0,0.0,7.925,2
3,1,1,0,35.0,1.0,0.0,53.1,2
4,0,3,1,35.0,0.0,0.0,8.05,2


In [14]:
X = train_dataset.drop("Survived", axis=1)
y = train_dataset["Survived"]

In [15]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [16]:
from sklearn.linear_model import LinearRegression

classifier = LinearRegression()
classifier.fit(X_train, y_train)


In [20]:
from sklearn.ensemble import RandomForestClassifier

classifier_rf = RandomForestClassifier(n_estimators=100, criterion='entropy')

classifier_rf.fit(X_train, y_train)

In [22]:
y_pred_rf = classifier_rf.predict(X_test)

In [23]:
y_pred_rf

array([0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0])

In [17]:
y_pred = classifier.predict(X_test)

In [18]:
y_pred

array([ 0.27073944,  0.61995132,  1.11943984,  0.28602415,  0.50447103,
        0.1060979 ,  1.08243693, -0.03569224,  0.39360367,  0.70105538,
        0.59121864,  0.62710617,  0.18430308,  0.14472753,  0.29920634,
        0.26821407,  0.40249159,  0.29225467,  0.54297827,  0.34618538,
        0.09971238,  0.65535822,  0.75143363,  0.11248967,  0.75063835,
        0.01615258, -0.04787416,  0.6310303 ,  0.09968354,  0.14657065,
        0.09968354,  0.28912646,  0.73874498,  0.18553074,  0.08311775,
        0.6398013 ,  0.19748218,  0.00349689,  0.65286185,  0.30395988,
        0.1383763 ,  1.054599  ,  0.78679967,  0.11024809,  0.16537517,
        0.36786041,  0.60488666,  0.63592959,  0.10624133,  0.74490751,
        0.16380849,  0.78947207,  0.3478461 ,  0.07771857,  0.75493481,
       -0.04140367,  0.09968354,  0.50780238,  0.11254111, -0.02861157,
        0.5390473 ,  0.10607919,  0.68932954,  0.48177368,  0.08599351,
        0.17668478,  0.36976437,  0.17668478,  0.16648804,  0.06

In [24]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, y_pred_rf)

0.8268156424581006