# Download dataset and set up Kaggle
[How to download a dataset from Kaggle](https://www.kaggle.com/general/74235)

In [1]:
! pip install -q kaggle

You should consider upgrading via the '/Users/lucaswerner/Desktop/hands-on-machine-learning/.env/bin/python3.9 -m pip install --upgrade pip' command.[0m


In [2]:
!mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

mkdir: /Users/lucaswerner/.kaggle: File exists


In [3]:
# List datasets through the Kaggle CLI (presumably a smoke test that the CLI setup works).
! kaggle datasets list

ref                                                             title                                              size  lastUpdated          downloadCount  voteCount  usabilityRating  
--------------------------------------------------------------  ------------------------------------------------  -----  -------------------  -------------  ---------  ---------------  
ahsan81/hotel-reservations-classification-dataset               Hotel Reservations Dataset                        480KB  2023-01-04 12:50:31           4288        161  1.0              
senapatirajesh/netflix-tv-shows-and-movies                      Latest Netflix TV shows and movies                  1MB  2023-01-14 17:03:12           1045         30  0.88235295       
thedevastator/global-fossil-co2-emissions-by-country-2002-2022  Emissions by Country, 2002-2022                   621KB  2023-01-02 20:10:36           2058         68  1.0              
johnny1994/divorce-rates-data-should-you-get-married            Divorc

In [4]:
# Download the 'spaceship-titanic' competition files as spaceship-titanic.zip;
# the extraction cell below looks for that archive in the working directory.
! kaggle competitions download -c 'spaceship-titanic'

Downloading spaceship-titanic.zip to /workspaces/hands-on-machine-learning/experiments/chapter_02/spaceship_titanic_kaggle
  0%|                                                | 0.00/299k [00:00<?, ?B/s]
100%|████████████████████████████████████████| 299k/299k [00:00<00:00, 4.55MB/s]


# Decompress dataset zip

In [4]:
from pathlib import Path
import pandas as pd
import shutil
import os
# Folder (relative to the working directory) that receives the extracted files.
DATASET_DIR = "dataset"
Path(DATASET_DIR).mkdir(parents=True, exist_ok=True)

# Extract the downloaded archive once, then delete it; on later runs the
# archive is gone and this step is skipped.
archive_name = 'spaceship-titanic.zip'
if os.path.exists(archive_name):
	shutil.unpack_archive(archive_name, DATASET_DIR)
	os.remove(archive_name)
	


In [5]:
# Read the train and test CSV files from the extracted dataset folder.
dataset_root = Path(DATASET_DIR)
df_train = pd.read_csv(dataset_root / "train.csv")
df_test = pd.read_csv(dataset_root / "test.csv")

# Analyze dataset

`Cabin` - The cabin number where the passenger is staying. Takes the form `deck/num/side`, where `side` can be either `P` for Port or `S` for Starboard.

In [6]:
# Preview the first 10 rows of the training set.
df_train.head(10)

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
5,0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True
6,0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True
7,0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True
8,0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True
9,0008_01,Europa,True,B/1/P,55 Cancri e,14.0,False,0.0,0.0,0.0,0.0,0.0,Erraiam Flatic,True


In [8]:
# Summarize the training set: column dtypes and non-null counts (shows which columns have missing values).
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [9]:
# Same summary for the test set: column dtypes and non-null counts.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4277 entries, 0 to 4276
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   4277 non-null   object 
 1   HomePlanet    4190 non-null   object 
 2   CryoSleep     4184 non-null   object 
 3   Cabin         4177 non-null   object 
 4   Destination   4185 non-null   object 
 5   Age           4186 non-null   float64
 6   VIP           4184 non-null   object 
 7   RoomService   4195 non-null   float64
 8   FoodCourt     4171 non-null   float64
 9   ShoppingMall  4179 non-null   float64
 10  Spa           4176 non-null   float64
 11  VRDeck        4197 non-null   float64
 12  Name          4183 non-null   object 
dtypes: float64(6), object(7)
memory usage: 434.5+ KB


* PassengerId is not relevant for the training part since it contains an identifier for each passenger
* HomePlanet is a categorical value that may be useful
* CryoSleep contains boolean values
* Cabin is a composition of deck/num/side and shows the details of the passenger position
* Destination is a categorical value
* Age is a numeric value (stored as float64, with some missing entries)
* VIP is a boolean value
* RoomService, FoodCourt, ShoppingMall, Spa and VRDeck are the expenses each passenger made during the trip
* Name is not relevant
* Transported is the label we want to predict

In [14]:
# Inspect the raw HomePlanet values, one of the categorical columns.
df_train['HomePlanet']

0       Europa
1        Earth
2       Europa
3       Europa
4        Earth
         ...  
8688    Europa
8689     Earth
8690     Earth
8691    Europa
8692    Europa
Name: HomePlanet, Length: 8693, dtype: object

In [None]:
from sklearn.preprocessing import OneHotEncoder


# Prepare the data for the algorithms