In [227]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [228]:
# LOAD DATA
# Read both competition splits in one statement; `train` and `test` are the
# frames every later cell works on.
train, test = (
    pd.read_csv('../input/spaceship-titanic/train.csv'),
    pd.read_csv('../input/spaceship-titanic/test.csv'),
)
print("Done")

In [229]:
# INSPECT DATA
# Report (rows, columns) for each split. Each message names the frame it
# actually describes and separates the text from the shape tuple.
print(f"Shape of training data is {train.shape}")
print(f"Shape of test data is {test.shape}")

In [230]:
# Print the per-column summary (dtype and non-null count) of both frames.
# The two frames have a different number of columns because test lacks the
# 'Transported' column (presumably the prediction target — confirm against the
# dataset description).
train.info()
test.info()

Inspect data

In [231]:
# Summarise only the object-typed columns of the training frame.
train.describe(include=['object'])


In [232]:
#DATA VISUALISATION 
import seaborn as sns
%matplotlib inline
from matplotlib import pyplot as plt
from matplotlib import style

# Age histograms faceted by VIP (rows) and Transported (columns).
# `height` is the current name of the facet-size keyword; seaborn renamed the
# old `size` keyword and newer releases no longer accept it.
grid = sns.FacetGrid(train, col='Transported', row='VIP', height=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', alpha=.5, bins=20)
grid.add_legend();

In [233]:
# Share of passengers per home planet.
# value_counts() orders its result by frequency, so the wedge labels are taken
# from its index; a hard-coded list would silently mislabel wedges whenever
# the order (or the set of planets) differs from what was assumed.
planet_count = train["HomePlanet"].value_counts()
label = planet_count.index.tolist()
plt.pie(planet_count, labels=label)
plt.show()

In [234]:
# Count VIP values inside each Transported group.
# Author's observation: the VIP / non-VIP split looks similar for both outcomes.
vip_by_outcome = train.groupby(['Transported'])['VIP']
vip_by_outcome.value_counts()

In [235]:
# DATA TREATMENT
# Number of missing (NaN) entries in every column of the training frame.
train.isna().sum()

In [236]:
#PassengerId has unique labels- drop
#Name has unique labels- drop
#Initial plan for Cabin is to split into different deck/num/sides but finds problem with data -To be fixed

def data_treat(df, columns=('PassengerId', 'Name', 'Cabin')):
    """Drop unused columns from ``df`` in place and return the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated in place, so callers that ignore the
        return value still see the columns removed.
    columns : iterable of str, optional
        Column names to remove. Defaults to the three columns this notebook
        discards.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # errors='ignore' skips names that are already absent, so re-running the
    # cell on an already treated frame is a no-op instead of a KeyError.
    df.drop(columns=list(columns), errors='ignore', inplace=True)

    return df
    

In [237]:
# Remove the unused columns from both splits (data_treat edits each frame in
# place), then preview the first rows of the training frame.
for frame in (train, test):
    data_treat(frame)
train.head()

In [238]:
# Columns with at most 10% missing values are candidates for imputation;
# anything above that threshold would be removed instead.

train_cols = train.isnull().sum().to_dict()
missing_limit = 0.1 * train.shape[0]
empty_cols = [
    column
    for column, n_missing in train_cols.items()
    if n_missing > missing_limit
]

print(f"Columns containing more than 10% missing values in dataset is {len(empty_cols)}")


In [240]:
from sklearn import preprocessing

def feature_eng(df):
    """Label-encode the object columns of ``df`` and median-fill its NaNs, in place.

    Object columns are cast to ``str`` and replaced by integer codes from a
    ``LabelEncoder``. Missing values in those columns therefore become the
    label ``'nan'`` and receive their own code. The encoder is fitted on the
    frame it is given, so two frames only get matching codes when they contain
    the same set of labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to transform. It is mutated in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # select_dtypes yields an empty selection when no object columns are left
    # (e.g. on a re-run over an already encoded frame), where
    # describe(include=['object']) fails.
    obj_cols = df.select_dtypes(include=['object']).columns

    if len(obj_cols) > 0:
        le = preprocessing.LabelEncoder()
        # Cast to str so mixed values (labels and NaN floats) can be encoded together.
        df[obj_cols] = df[obj_cols].astype(str)
        # fit_transform is called once per column, refitting the encoder each time.
        df[obj_cols] = df[obj_cols].apply(le.fit_transform)

    # Fill NaN values with the column medians directly on the caller's frame:
    # the notebook calls feature_eng(frame) without using the return value, so
    # a fill applied to a new copy would be lost.
    df.fillna(df.median(numeric_only=True), inplace=True)

    return df
    

In [241]:
# Encode both splits. The return values are not assigned, so only the changes
# feature_eng makes to the passed frame itself carry over to later cells.
# The second call is the cell's last expression, so its result is what the
# notebook displays.
feature_eng(train)
feature_eng(test)

In [242]:
def int_mean(df, columns=('Age', 'RoomService', 'ShoppingMall', 'FoodCourt', 'Spa', 'VRDeck')):
    """Fill NaNs in the given columns with each column's rounded mean, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify. It is mutated in place.
    columns : iterable of str, optional
        Columns to impute. Defaults to the six numeric columns this notebook
        imputes. A name that is not in ``df`` raises ``KeyError``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    for col in columns:
        col_mean = df[col].mean()
        # An all-NaN column has no mean to impute with (and round(NaN) raises),
        # so leave it untouched.
        if pd.isna(col_mean):
            continue
        # Assign the filled column back to the frame. Calling
        # fillna(inplace=True) on df[col] is chained assignment and may update
        # a temporary copy instead of df under pandas Copy-on-Write.
        df[col] = df[col].fillna(round(col_mean))

    return df

In [243]:
# Impute the numeric columns of both splits with their rounded means.
# The return values are not assigned; the second call is the cell's last
# expression, so its result is what the notebook displays.
int_mean(train)
int_mean(test)

In [244]:
# Sanity check: per-column count of missing values left in the training frame.
train.isna().sum()


Making a DataFrame of the variables of interest

In [245]:
# Per-column count of missing values left in the test frame.
test.isna().sum()

In [263]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# DATASET PREPARATION
# Features are every column except the last; the last column is kept as a
# one-column frame and used as the target.
X_train, X_test, y_train, y_test = train_test_split(train.iloc[:,:-1], train.iloc[:,-1:], random_state = 0)

# Fit the scaler on the training split only and reuse those statistics for the
# hold-out split. Re-fitting on X_test would scale the two splits with
# different means/variances and leak hold-out information into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [264]:
# Print the per-column summary of the training features and of the hold-out target.
X_train.info()
y_test.info()

In [265]:
# IDENTIFYING IMPORTANT VARIABLES
# Fit a random forest on the scaled training split and rank features by importance.

# A fixed seed keeps the ranking reproducible between runs. The target is
# flattened to 1-D because scikit-learn warns when given a column-vector y.
model = RandomForestRegressor(random_state=0)
model.fit(X_train_scaled, y_train.values.ravel())

importance = model.feature_importances_

# Label each score with the column the model was actually trained on.
importances = pd.DataFrame(data={'Attribute': X_train.columns,
                                 'Importance': importance
                                 })

# Summarize feature importance
for attribute, score in zip(importances['Attribute'], importances['Importance']):
    print('Feature: %s, Score: %.5f' % (attribute, score))

importances = importances.sort_values(by='Importance', ascending=False)

# Plot the features in descending order of importance
plt.bar(x=importances['Attribute'], height=importances['Importance'], color='#087E8B')
plt.title('Feature importances obtained from Random Forest', size=20)
plt.xticks(rotation='vertical')
plt.show()

In [277]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor

from sklearn.ensemble import RandomForestClassifier

# Features retained after inspecting the importance ranking.
fs = ['CryoSleep','Age','FoodCourt','Spa','VRDeck','ShoppingMall']

# Train and evaluate both models on the selected features only, with a 1-D target
# (scikit-learn warns when given a column-vector y).
X_train_fs = X_train[fs]
X_test_fs = X_test[fs]
y_train_1d = y_train.values.ravel()
y_test_1d = y_test.values.ravel()

# define the model
# The same target is fed to DecisionTreeClassifier below, i.e. it is treated as
# a class label. A classifier's score() is mean accuracy, which matches the
# printed label and is comparable with model2; a regressor's score() is R^2.
model1 = RandomForestClassifier(random_state=1)

# Train with selected features based on feature importance
model1.fit(X_train_fs, y_train_1d)
print("Model 1 accuracy:{:.3f}".format(model1.score(X_train_fs, y_train_1d)))
print("Model 1 score on prediction:{:.3f}".format(model1.score(X_test_fs, y_test_1d)))

model2 = DecisionTreeClassifier(random_state=1)
model2.fit(X_train_fs, y_train_1d)
print("Model 2 accuracy:{:.3f}".format(model2.score(X_train_fs, y_train_1d)))
print("Model 2 score on prediction:{:.3f}".format(model2.score(X_test_fs, y_test_1d)))

# NOTE(review): compare each model's training accuracy with its hold-out accuracy.
# An unpruned decision tree typically scores close to 1.0 on its own training data;
# a large gap to the hold-out score indicates overfitting (high variance), not
# underfitting / high bias.