In [1]:
import numpy as np
import pandas as pd

**Reading the data.**

In [2]:
dftr = pd.read_csv("../data/train.csv")
dftst = pd.read_csv("../data/test.csv")

**Changing index to PassengerId.**

In [3]:
dftr.set_index('PassengerId', inplace=True)
dftst.set_index('PassengerId', inplace=True)

**Checking the data**

In [4]:
dftr.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
dftr.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [6]:
dftr.describe()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [7]:
dftr.skew()

Survived    0.478523
Pclass     -0.630548
Age         0.389108
SibSp       3.695352
Parch       2.749117
Fare        4.787317
dtype: float64

**Dropping the useless Columns**

In [8]:
drp = ['Name', 'Parch', 'Embarked', 'Ticket', 'Fare']
dftr.drop(drp, inplace=True, axis=1)

In [9]:
dftr.head()
dftr.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  891 non-null    int64  
 1   Pclass    891 non-null    int64  
 2   Sex       891 non-null    object 
 3   Age       714 non-null    float64
 4   SibSp     891 non-null    int64  
 5   Cabin     204 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 48.7+ KB


**Changing object types into simpler forms**

In [10]:
dftr['Sex'].replace(['female', 'male'], [0, 1], inplace=True)
dftst['Sex'].replace(['female', 'male'], [0, 1], inplace=True)
dftr['Cabin'] = dftr['Cabin'].astype(str)
dftst['Cabin'] = dftst['Cabin'].astype(str)

In [11]:
dftr.dtypes

Survived      int64
Pclass        int64
Sex           int64
Age         float64
SibSp         int64
Cabin        object
dtype: object

**Extracting Cabin letter from Cabin column**

In [12]:
def substrings_in_string(big_string, substrings):
    for substring in substrings:
        if str.find(big_string, substring) != -1:
            return substring
    return np.nan

In [13]:
cabin_list = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'Unknown']
dftr['Deck'] = dftr['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))
dftst['Deck'] = dftst['Cabin'].map(lambda x: substrings_in_string(x, cabin_list))

In [14]:
dftr.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Cabin,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,1,22.0,1,,
2,1,1,0,38.0,1,C85,C
3,1,3,0,26.0,0,,
4,1,1,0,35.0,1,C123,C
5,0,3,1,35.0,0,,


**Dropping Cabin as it serve no further purpose and checking the null values**

In [15]:
dftr.drop('Cabin', inplace=True, axis=1)
dftst.drop('Cabin', inplace=True, axis=1)


In [16]:
dftr.head()
dftr.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Deck        687
dtype: int64

**Dealing with the null values**

In [17]:
dftr['Deck'] = dftr['Deck'].fillna('Unknown')
dftr.isnull().sum()

Survived      0
Pclass        0
Sex           0
Age         177
SibSp         0
Deck          0
dtype: int64

In [18]:
dftst['Deck'] = dftst['Deck'].fillna('Unknown')
dftst.isnull().sum()

Pclass       0
Name         0
Sex          0
Age         86
SibSp        0
Parch        0
Ticket       0
Fare         1
Embarked     0
Deck         0
dtype: int64

In [19]:
dftr.head()


Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,3,1,22.0,1,Unknown
2,1,1,0,38.0,1,C
3,1,3,0,26.0,0,Unknown
4,1,1,0,35.0,1,C
5,0,3,1,35.0,0,Unknown


In [20]:
dftr['Age'].interpolate(method='linear', inplace=True, limit_direction='both')
dftst['Age'].interpolate(method='linear', inplace=True, limit_direction='both')

In [21]:
dftr.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Deck        0
dtype: int64

**Checking cor bettween columns**

In [22]:
dftr['Age'].corr(dftr['Survived'])
dftr['Sex'].corr(dftr['Survived'])
dftr['Pclass'].corr(dftr['Survived'])

-0.3384810359610147

**Model Evaluation.**

**Spliting the data**

In [23]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(dftr, test_size=0.2, random_state=0)

In [24]:
train.describe

<bound method NDFrame.describe of              Survived  Pclass  Sex   Age  SibSp     Deck
PassengerId                                             
141                 0       3    0  23.0      0  Unknown
440                 0       2    1  31.0      0  Unknown
818                 0       2    1  31.0      1  Unknown
379                 0       3    1  20.0      0  Unknown
492                 0       3    1  21.0      0  Unknown
...               ...     ...  ...   ...    ...      ...
836                 1       1    0  39.0      1        E
193                 1       3    0  19.0      1  Unknown
630                 0       3    1  53.0      0  Unknown
560                 1       3    0  36.0      1  Unknown
685                 0       2    1  60.0      1  Unknown

[712 rows x 6 columns]>

In [25]:
test.describe

<bound method NDFrame.describe of              Survived  Pclass  Sex        Age  SibSp     Deck
PassengerId                                                  
496                 0       3    1  37.500000      0  Unknown
649                 0       3    1  39.500000      0  Unknown
279                 0       3    1   7.000000      4  Unknown
32                  1       1    0  48.666667      1        B
256                 1       3    0  29.000000      0  Unknown
...               ...     ...  ...        ...    ...      ...
781                 1       3    0  13.000000      0  Unknown
838                 0       3    1  26.500000      0  Unknown
216                 1       1    0  31.000000      1        D
834                 0       3    1  23.000000      0  Unknown
373                 0       3    1  19.000000      0  Unknown

[179 rows x 6 columns]>

**Validation Training**

In [26]:
from sklearn import linear_model

regr = linear_model.Lasso(alpha=0.01)
yev = train['Survived']
Features = ["Pclass", "Sex", "Age", 'Deck']
Xev = pd.get_dummies(train[Features])
Xev.drop('Deck_A', inplace=True, axis=1)
Xev.drop('Deck_T', inplace=True, axis=1)
X_testev = pd.get_dummies(test[Features])
regr.fit(Xev, yev)

Lasso(alpha=0.01)

In [27]:
Xev

Unnamed: 0_level_0,Pclass,Sex,Age,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_Unknown
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
141,3,0,23.0,0,0,0,0,0,0,1
440,2,1,31.0,0,0,0,0,0,0,1
818,2,1,31.0,0,0,0,0,0,0,1
379,3,1,20.0,0,0,0,0,0,0,1
492,3,1,21.0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
836,1,0,39.0,0,0,0,1,0,0,0
193,3,0,19.0,0,0,0,0,0,0,1
630,3,1,53.0,0,0,0,0,0,0,1
560,3,0,36.0,0,0,0,0,0,0,1


In [28]:
X_testev

Unnamed: 0_level_0,Pclass,Sex,Age,Deck_B,Deck_C,Deck_D,Deck_E,Deck_F,Deck_G,Deck_Unknown
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
496,3,1,37.500000,0,0,0,0,0,0,1
649,3,1,39.500000,0,0,0,0,0,0,1
279,3,1,7.000000,0,0,0,0,0,0,1
32,1,0,48.666667,1,0,0,0,0,0,0
256,3,0,29.000000,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...
781,3,0,13.000000,0,0,0,0,0,0,1
838,3,1,26.500000,0,0,0,0,0,0,1
216,1,0,31.000000,0,0,1,0,0,0,0
834,3,1,23.000000,0,0,0,0,0,0,1


**Evaluating**

In [29]:
predictions = regr.predict(X_testev)
predictions = np.round(predictions).astype(int)
predictions

array([0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0])

In [30]:
predictions

array([0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0])

In [31]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(test['Survived'], predictions)

0.22346368715083798

**Actual Training**

In [32]:
dftr.head()

Unnamed: 0_level_0,Survived,Pclass,Sex,Age,SibSp,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,0,3,1,22.0,1,Unknown
2,1,1,0,38.0,1,C
3,1,3,0,26.0,0,Unknown
4,1,1,0,35.0,1,C
5,0,3,1,35.0,0,Unknown


In [33]:
dftst.head()

Unnamed: 0_level_0,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked,Deck
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,Q,Unknown
893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,S,Unknown
894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,Q,Unknown
895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,S,Unknown
896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,S,Unknown


In [34]:
from sklearn import linear_model

regr = linear_model.Lasso(alpha=0.01)
y = dftr['Survived']
Features = ["Pclass", "Sex", "Age", 'Deck']
X = pd.get_dummies(dftr[Features])
X.drop('Deck_T', inplace=True, axis=1)
X_test = pd.get_dummies(dftst[Features])
regr.fit(X, y)

Lasso(alpha=0.01)

**Predicting**

In [35]:
predictions = regr.predict(X_test)
predictions = np.round(predictions).astype(int)
predictions

array([0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

***Saving output***

In [36]:
output = pd.DataFrame({'PassengerId': dftst.index, 'Survived': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")


Your submission was successfully saved!
