## Import the Libraries

In [37]:
import pandas as pd
import lightgbm as lgb
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## Download Dataset 

In [2]:
!wget https://raw.githubusercontent.com/mananparasher/PML-Machine-Learning-Datasets/master/titanic_dataset.csv

--2020-07-22 19:23:31--  https://raw.githubusercontent.com/mananparasher/PML-Machine-Learning-Datasets/master/titanic_dataset.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.124.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.124.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 61194 (60K) [text/plain]
Saving to: ‘titanic_dataset.csv’


2020-07-22 19:23:32 (543 KB/s) - ‘titanic_dataset.csv’ saved [61194/61194]



In [2]:
df=pd.read_csv("titanic_dataset.csv")
df.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


## Explore the dataset

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


## Data Processing 

In [4]:
df['Age']=df['Age'].fillna(df['Age'].mean())
df['Embarked']=df['Embarked'].fillna('Others')
df=df.drop(columns=['Cabin','Name','Ticket','PassengerId'])
df.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


## Data Normalization

In [7]:
ordinalencoder=OrdinalEncoder()
df[['Embarked','Sex']]=ordinalencoder.fit_transform(df[['Embarked','Sex']])

standardccaler=StandardScaler()
df[['Pclass','Age','SibSp','Parch','Fare']]=standardccaler.fit_transform(df[['Pclass','Age','SibSp','Parch','Fare']])

df.head(5)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,0.827377,1.0,-0.592481,0.432793,-0.473674,-0.502445,3.0
1,1,-1.566107,0.0,0.638789,0.432793,-0.473674,0.786845,0.0
2,1,0.827377,0.0,-0.284663,-0.474545,-0.473674,-0.488854,3.0
3,1,-1.566107,0.0,0.407926,0.432793,-0.473674,0.42073,3.0
4,0,0.827377,1.0,0.407926,-0.474545,-0.473674,-0.486337,3.0


## Splitting Data

In [29]:
y=df.pop('Survived')

X_2, X_val, y_2, y_val = train_test_split(df, y, test_size=0.05, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X_2, y_2, test_size=0.05, random_state=42)

## Dataset for LightGBM

In [30]:
categorical_columns=['Embarked','Sex']

training_data = lgb.Dataset(X_train, label=y_train)
validation_data = lgb.Dataset(X_val, label=y_val)

training_data

<lightgbm.basic.Dataset at 0x11904ec90>

## Setting Parameters

In [43]:
param = {'num_leaves': 100, 'objective': 'binary'}
param['metric'] = 'auc'

## Model Training 

In [44]:
num_round = 10
boostermodel = lgb.train(param, training_data, num_round, \
                         valid_sets=validation_data,categorical_feature=categorical_columns)

[1]	valid_0's auc: 0.829218
[2]	valid_0's auc: 0.837449
[3]	valid_0's auc: 0.83642
[4]	valid_0's auc: 0.84465
[5]	valid_0's auc: 0.852881
[6]	valid_0's auc: 0.867284
[7]	valid_0's auc: 0.86214
[8]	valid_0's auc: 0.858025
[9]	valid_0's auc: 0.851852
[10]	valid_0's auc: 0.855967


## Model Predictions

In [50]:
predictions=boostermodel.predict(X_test)
predictions=predictions.round(0)
predictions

array([0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 1., 0., 0., 1., 1., 0., 0.,
       1., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0.,
       0., 0., 1., 0., 0., 1., 1., 1., 1.])

In [51]:
print(classification_report(predictions,y_test))

              precision    recall  f1-score   support

         0.0       0.91      0.71      0.80        28
         1.0       0.62      0.87      0.72        15

    accuracy                           0.77        43
   macro avg       0.76      0.79      0.76        43
weighted avg       0.81      0.77      0.77        43



## Saving Model

In [None]:
boostermodel.save_model('model.txt')