<a id="4"></a>
## **<center><span style="color:#00BFC4;"> Import Library </span></center>**

In [2]:
import numpy as np 
import pandas as pd 
import copy
#visualize
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.templates.default = "plotly_dark"

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer,IterativeImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split


import tensorflow as tf
from tensorflow.keras import models,layers,Sequential

<a id="4"></a>
## **<center><span style="color:#00BFC4;"> EDA </span></center>**

In [3]:
# Load the competition's train and test CSV files into DataFrames.
# Paths are relative to the notebook's working directory.
train_df = pd.read_csv('../input/spaceship-titanic/train.csv')
test_df = pd.read_csv('../input/spaceship-titanic/test.csv')

In [4]:
# A notebook cell only renders its last expression, so the random sample and
# the column index are passed to display() explicitly; the shape is still the
# cell's final (auto-rendered) value.
display(train_df.sample(3))
display(train_df.columns)
train_df.shape

(8693, 14)

**Basic Info**
* 13 features and 1 label `('Transported')`
* 8693 samples

<a id="4.1"></a>
### <span style="color:#e76f51;"> Missing Values </span>

In [5]:
# Per-column count of missing entries for each split. The counts end up in the
# single column labelled 0, which is then used as the sort key (largest first).
train_missing = pd.DataFrame(train_df.isna().sum())
train_missing = train_missing.sort_values(by=0, ascending=False)

test_missing = pd.DataFrame(test_df.isna().sum())
test_missing = test_missing.sort_values(by=0, ascending=False)

In [6]:
# Side-by-side horizontal bar charts of the missing-value counts per column,
# one panel for the train split and one for the test split.
fig = make_subplots(1,2,column_titles=['train','test'],x_title='Missing Values')

# The marker colour is a numeric index per bar. Its length is taken from the
# frame being plotted so every bar gets a colour value, whatever the number of
# columns in that split (a fixed range(12) is shorter than either frame).
fig.add_trace(go.Bar(x=train_missing[0],y=train_missing.index,orientation="h",
                     marker=dict(color=list(range(len(train_missing))))
                    ),1,1
             )
fig.add_trace(go.Bar(x=test_missing[0],y=test_missing.index,orientation="h",
                     marker=dict(color=list(range(len(test_missing))))
                    ),1,2
             )
fig.update_layout(showlegend=False, title_text="Missing Values In Train&Test Set", title_x=0.5)

In [7]:
def missing_data(data):
    """Summarise missing values per column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        Indexed by the columns of ``data`` with three columns:
        ``Total`` (number of null entries), ``Percent`` (null entries as a
        percentage of the row count) and ``Types`` (the column dtype as a
        string).
    """
    null_mask = data.isnull()
    n_missing = null_mask.sum()
    # count() on the boolean mask is the number of rows, since the mask itself
    # never contains nulls.
    pct_missing = n_missing / null_mask.count() * 100
    summary = pd.concat([n_missing, pct_missing], axis=1, keys=['Total', 'Percent'])
    summary['Types'] = [str(data[name].dtype) for name in data.columns]
    return summary

In [8]:
# Missing-value summary of the training split, shown separately for
# object-typed and float-typed columns. Only the last expression of a cell is
# rendered automatically, so the first table is passed to display().
missing_info = missing_data(train_df)
display(missing_info.query('Types=="object"'))
missing_info.query('Types=="float64"')

Unnamed: 0,Total,Percent,Types
Age,179,2.059128,float64
RoomService,181,2.082135,float64
FoodCourt,183,2.105142,float64
ShoppingMall,208,2.39273,float64
Spa,183,2.105142,float64
VRDeck,188,2.16266,float64


* Categorical features: PassengerId, HomePlanet, CryoSleep, Cabin, Destination, VIP, Name
* Numerical features: Age, RoomService, FoodCourt, ShoppingMall, Spa, VRDeck

<a id="4.1"></a>
### <span style="color:#e76f51;"> Visualize and Analyze Data Relationships </span>


*  Let's Explore the relationship between `'RoomService'` and `'Transported'`

As can be seen from the figure below, `'RoomService'` spending is generally higher among passengers with `'Transported'=False` than among those with `'Transported'=True`.

In [9]:
# Shared layout template (font and figure width) reused by the box plots.
# The font family is spelled 'Franklin Gothic'; a misspelled family name does
# not match any installed font and the renderer falls back to its default.
_template = dict(layout=go.Layout(font=dict(family='Franklin Gothic', size=12), width=1000))

# Notched box plot of RoomService, one box per Transported value, with every
# individual point drawn next to its box.
fig = px.box(train_df, y='RoomService', color='Transported', points='all', notched=True, )
fig.update_traces(quartilemethod='exclusive')
fig.update_layout(template=_template,title='RoomService Distribution')
fig.show()

There are five features (`'RoomService'`, `'FoodCourt'`, `'ShoppingMall'`, `'Spa'`, `'VRDeck'`) associated with onboard costs.


After imputing all missing values in these five features, we sum them into a new feature named `'Total_cost'`, which replaces them.

In [10]:
# missing values
num_impute_col = ['Age','RoomService','FoodCourt','ShoppingMall','Spa','VRDeck']
num_imputer = SimpleImputer(strategy='mean')
# The column means are learned from the training split only and then reused
# for the test split, so both splits are filled with the same statistics and
# nothing is estimated from test data.
train_df[num_impute_col] = num_imputer.fit_transform(train_df[num_impute_col])
test_df[num_impute_col] = num_imputer.transform(test_df[num_impute_col])

# Gaps in the columns not handled by the imputer are forward-filled from the
# previous row. DataFrame.ffill replaces the deprecated fillna(method='ffill').
train_df = train_df.ffill(axis=0)
test_df = test_df.ffill(axis=0)

In [11]:
# ‘Total_cost’ feature 
def calulate_total(df):
    """Collapse the five onboard-spending columns into one ``Total_cost`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'RoomService', 'FoodCourt', 'Spa', 'VRDeck' and
        'ShoppingMall'.

    Returns
    -------
    pandas.DataFrame
        A new frame without the five spending columns and with their
        row-wise sum appended as 'Total_cost'. The input frame is not modified.
    """
    # Plain `+` is used (rather than a skip-NaN sum) so a missing value in any
    # of the five columns yields a missing total.
    running_total = df['RoomService']
    for spend_col in ('FoodCourt', 'Spa', 'VRDeck', 'ShoppingMall'):
        running_total = running_total + df[spend_col]

    trimmed = df.drop(columns=['RoomService','FoodCourt','ShoppingMall','Spa','VRDeck'])
    return trimmed.assign(Total_cost=running_total)
# Replace the five spending columns with 'Total_cost' in both splits.
# Note: this rebinds train_df/test_df, so the cell cannot be re-run on its own
# (the spending columns are gone after the first run).
train_df = calulate_total(train_df)
test_df = calulate_total(test_df)

Now let's explore the relationship between `'Total_cost'` and `'Transported'`

In [12]:
# Notched box plot of Total_cost, one box per Transported value, with all
# points drawn; styled with the shared _template.
fig = (
    px.box(train_df, y='Total_cost', color='Transported', points='all', notched=True)
    .update_traces(quartilemethod='exclusive')
    .update_layout(template=_template, title='Total cost Distribution')
)
fig.show()

*  Label Distribution

In [13]:
# Number of training rows for each value of the 'Transported' label.
label_series = train_df['Transported'].value_counts()

In [14]:
# Bar chart of the label counts. The label in this dataset is 'Transported',
# so the x axis is titled accordingly.
fig = go.Figure(go.Bar(x=label_series.index,y=label_series.values))
fig.update_xaxes(title="Transported Or Not")
fig.update_yaxes(title="Count")

*  Age Distribution

In [15]:
# Stacked histograms of 'Age' for the train and test splits.
fig = go.Figure()
fig.add_trace(go.Histogram(x=train_df['Age'],name='train'))
fig.add_trace(go.Histogram(x=test_df['Age'],name='test'))
# Chart title spelling corrected ("Distribution").
fig.update_layout(title_text='Age Distribution',xaxis_title_text='Age',yaxis_title_text='Count',barmode='stack')
fig.show()

Explore the relationship between `'Age'` and `'Transported'`

In [16]:
# Ages of the training rows split by the value of 'Transported'.
series1 = train_df.query('Transported == 0')['Age']
series2 = train_df.query('Transported == 1')['Age']

# Stacked histograms of the two age series. The x axis shows Age, so it is
# labelled as such, and the title spelling is corrected ("Distribution").
fig = go.Figure()
fig.add_trace(go.Histogram(x=series1,name='Transported:False'))
fig.add_trace(go.Histogram(x=series2,name='Transported:True'))
fig.update_layout(title_text='Transported Or Not Age Distribution',xaxis_title_text='Age',yaxis_title_text='Count',barmode='stack')
fig.show()

<a id="4"></a>
## **<center><span style="color:#00BFC4;"> Data Preprocessing </span></center>**

<a id="4.1"></a>
### <span style="color:#e76f51;"> Encoding</span>

In [17]:
# Columns to be converted to integer codes by label_encoder.
label_cols = ["HomePlanet", "CryoSleep","Cabin", "Destination" ,"VIP"]
def label_encoder(train_df,test_df,columns):
    """Integer-encode the given columns consistently across both frames.

    Each column is cast to ``str`` and encoded with a single ``LabelEncoder``
    fitted on the union of the train and test values, so a given category maps
    to the same integer in both frames and every test value is known to the
    encoder. (Fitting a separate encoder per frame gives mismatched codes as
    soon as the two frames do not contain exactly the same set of values.)

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames containing every column in ``columns``. Both are modified in
        place: the listed columns are overwritten with integer codes.
    columns : iterable of str
        Names of the columns to encode.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_df, test_df)``, the same objects that were passed in.
    """
    for col in columns:
        train_df[col] = train_df[col].astype(str)
        test_df[col] = test_df[col].astype(str)
        # One encoder per column, fitted on all values seen in either frame.
        encoder = LabelEncoder().fit(pd.concat([train_df[col], test_df[col]]))
        train_df[col] = encoder.transform(train_df[col])
        test_df[col] = encoder.transform(test_df[col])
    return train_df, test_df

# Encode the columns listed in label_cols in both splits.
train_df ,test_df = label_encoder(train_df,test_df ,label_cols)

<a id="4.1"></a>
### <span style="color:#e76f51;"> Drop some cols</span>

In [18]:
# Remove the columns that are not used as model inputs from both splits.
# The frames are modified in place.
dropped_cols = ['Cabin', 'Name', 'PassengerId']
for frame in (train_df, test_df):
    frame.drop(columns=dropped_cols, inplace=True)

In [19]:
# Show one random row to check the columns that remain after preprocessing.
train_df.sample()

Unnamed: 0,HomePlanet,CryoSleep,Destination,Age,VIP,Transported,Total_cost
6648,0,0,2,16.0,0,False,891.0


<a id="4.1"></a>
### <span style="color:#e76f51;"> Dataset spliting</span>

In [20]:
# Separate the features from the 'Transported' label and hold out 20% of the
# rows. random_state is fixed so the same split is produced on every run.
X = train_df.drop('Transported',axis=1)
y=train_df['Transported']
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [21]:
# Convert the splits to float32 arrays; labels become column vectors of
# shape (n, 1). np.asarray accepts both DataFrames/Series and arrays, so the
# cell can be re-run without failing on an already-converted array.
X_train = np.asarray(X_train, dtype=np.float32)
y_train = np.asarray(y_train, dtype=np.float32).reshape(-1,1)
X_test = np.asarray(X_test, dtype=np.float32)
y_test = np.asarray(y_test, dtype=np.float32).reshape(-1,1)

<a id="4"></a>
## **<center><span style="color:#00BFC4;"> Modeling </span></center>**

<a id="4.1"></a>
### <span style="color:#e76f51;"> Training</span>

In [22]:
# Fix TensorFlow's random seed so weight initialisation is repeatable.
tf.random.set_seed(42)

# Small fully connected binary classifier: 16 -> 32 -> 8 ReLU units followed
# by a single sigmoid output. The input width is taken from the training
# matrix rather than hard-coded, so it follows any change in the feature set.
model = models.Sequential([
        layers.Dense(units=16, activation='relu', input_shape=(X_train.shape[1],)),
        layers.Dense(units=32, activation='relu'),
        # L2 weight penalty on the last hidden layer.
        layers.Dense(units=8, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.01)),
        layers.Dense(units=1, activation='sigmoid')
])

2022-11-17 06:58:13.500206: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


In [23]:
# Configure the model for binary classification and track accuracy.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])

# Train for 50 epochs in batches of 64; the held-out split is evaluated after
# every epoch and its metrics are recorded in the returned history.
fit_options = dict(validation_data=(X_test, y_test), batch_size=64, epochs=50)
history = model.fit(X_train, y_train, **fit_options)

2022-11-17 06:58:13.705739: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [24]:
# Per-epoch metrics as a DataFrame (one column per metric), then a line chart
# of training vs. validation accuracy.
his_df = pd.DataFrame(data=history.history)
accuracy_columns = ['acc', 'val_acc']
hisfig = px.line(his_df, y=accuracy_columns, markers=True)
hisfig.show()

<a id="4.1"></a>
### <span style="color:#e76f51;"> Predicting</span>

In [25]:
# Predict on the preprocessed competition test split.
# NOTE(review): assumes test_df has the same columns, in the same order, as
# the feature matrix the model was trained on. TODO confirm.
predictions = model.predict(test_df)

In [26]:
# Fill the sample submission with boolean predictions and write it out.
sub = pd.read_csv('../input/spaceship-titanic/sample_submission.csv')
# Flatten the model output to one value per row and threshold at 0.5.
sub['Transported'] = predictions.reshape(-1) >= 0.5
sub.to_csv('submission.csv',index=False)