<a href="https://www.kaggle.com/code/nicolasdefebrer/spaceshiptitanic?scriptVersionId=187725837" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# Anàlisi de les dades: Spaceship Titanic

# 1. Preparació de l'entorn

In [94]:
# Importació de mòduls

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
import re as re

%matplotlib inline

In [None]:
# Load the training and test sets from the Kaggle input directory.
# `dades` holds both frames so later cells can apply the same
# transformation to each of them in a single loop.

train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/spaceship-titanic/train.csv',
        '/kaggle/input/spaceship-titanic/test.csv',
    )
)
dades = [train, test]

In [None]:
# Attribute information: column names, non-null counts and dtypes of the training set

train.info()

In [None]:
# Summary statistics of the numeric attributes of the training set

train.describe()

In [None]:
# Missing values per column in the training set.
# (Uses `train` explicitly: no variable named `df` exists yet at this point
# on a fresh kernel, so the previous `df.isnull()` only worked with leftover state.)

train.isnull().sum()

In [None]:
# Missing values per column in the test set
test.isna().sum()

In [None]:
# Clean the data: impute missing values in both the training and test sets.
# Each frame is imputed with its own statistics:
#   - categorical / boolean attributes -> most frequent value (mode)
#   - numeric attributes               -> median
MODE_COLUMNS = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
MEDIAN_COLUMNS = ['Age', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

for df in dades:
    # Assign the filled column back instead of calling
    # `df[col].fillna(..., inplace=True)`: the chained in-place form acts on a
    # temporary object, is deprecated in pandas 2.x and silently does nothing
    # once Copy-on-Write is enabled.
    for col in MODE_COLUMNS:
        df[col] = df[col].fillna(df[col].mode()[0])
    for col in MEDIAN_COLUMNS:
        df[col] = df[col].fillna(df[col].median())

In [None]:
# Re-check missing values in the training set after imputation.
# (`train_df` is never defined in this notebook; the frame is called `train`.)
train.isnull().sum()

# 2. Exploració i transformació de les dades

# PassengerId

In [None]:
# Remove the PassengerId attribute from the training set only
# (the test frame is left untouched by this cell).

train.drop(columns=['PassengerId'], inplace=True)

# Transported

In [None]:
# Share of transported vs. non-transported passengers.
# `transported` is ordered [transported, not transported]; the labels below
# follow that same order (they were previously swapped, so each slice was
# captioned with the opposite class).

transported = [
    int((train['Transported'] == 1).sum()),
    int((train['Transported'] == 0).sum()),
]
fig1, ax1 = plt.subplots()
# The first slice (transported passengers) is pulled out of the pie.
ax1.pie(transported, shadow=True, labels=["Transportats", "No transportats"], explode=(0.1, 0))
ax1.axis('equal')
plt.show()

# HomePlanet

In [None]:
# Mean of Transported (i.e. transported rate) for each home planet, shown as a single row

train.groupby('HomePlanet')[['Transported']].mean().T

In [None]:
# Bar chart of the transported rate per home planet
sns.barplot(data=train, x="HomePlanet", y="Transported")

In [None]:
# Encode the home planets as integers.
# Writing the codes through `.loc` left the column with object dtype, so it was
# never treated as numeric downstream. Mapping and converting with
# `pd.to_numeric` yields a real numeric column.

PLANET_CODES = {'Europa': 1, 'Earth': 2, 'Mars': 3}

for df in dades:
    # Values that are not planet names (missing values, or codes produced by a
    # previous run of this cell) pass through unchanged, so re-running the cell
    # is harmless. An unexpected planet name makes `to_numeric` raise instead of
    # being silently kept as text.
    df['HomePlanet'] = pd.to_numeric(
        df['HomePlanet'].map(lambda planet: PLANET_CODES.get(planet, planet))
    )

sns.barplot(x="HomePlanet", y="Transported", data=train)

# CryoSleep

In [None]:
# Mean of Transported depending on whether the passenger was in "CryoSleep"

train.groupby('CryoSleep')[['Transported']].mean().T

In [None]:
# Bar chart of the transported rate by CryoSleep status
sns.barplot(data=train, x="CryoSleep", y="Transported")

# Cabin

# Destination

In [None]:
# Mean of Transported for each destination

train.groupby('Destination')[['Transported']].mean().T

In [None]:
# Bar chart of the transported rate per destination
sns.barplot(data=train, x="Destination", y="Transported")

# VIP

In [None]:
# Mean of Transported depending on VIP status

train.groupby('VIP')[['Transported']].mean().T

In [None]:
# Bar chart of the transported rate by VIP status
sns.barplot(data=train, x="VIP", y="Transported")

# Age

In [None]:
# Age distribution (non-null ages only) with a fitted normal curve overlaid.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases.

sns.distplot(train['Age'].dropna(), fit=norm)

In [None]:
# Bucket ages into 7 ordinal groups (labels 0..6) and replace Age with AgeGroup.
# Intervals are right-closed: (-1, 0], (0, 5], (5, 10], (10, 15], (15, 30],
# (30, 60], (60, inf). Missing ages are sent to -0.5, i.e. into group 0.
# NOTE(review): an age of exactly 0 also lands in group 0 together with the
# missing-age sentinel -- confirm this is intended.
# The cell drops Age, so it can only be run once per loaded frame.

bins = [-1, 0, 5, 10, 15, 30, 60, np.inf]

for df in dades:
    ages = df["Age"].fillna(-0.5)
    df['AgeGroup'] = pd.cut(ages, bins, labels=range(7))
    df.drop(columns=['Age'], inplace=True)

In [None]:
# Bar chart of the transported rate per age group
sns.barplot(data=train, x="AgeGroup", y="Transported")

# RoomService

In [None]:
# Distribution of the RoomService attribute with a fitted normal curve overlaid

sns.distplot(train.RoomService, fit=norm)

In [None]:
# Log-transform RoomService (log1p, so zero spending stays at 0).
# Any remaining missing value is replaced by the column mean first. The filled
# series is used directly rather than `fillna(..., inplace=True)` on a column
# selection, which is deprecated and has no effect under pandas Copy-on-Write.
# The transform is applied in place on the frames: run this cell only once.

for df in dades:
    room_service = df['RoomService']
    df['RoomService'] = np.log1p(room_service.fillna(room_service.mean()))

sns.distplot(train['RoomService'], fit=norm)

# FoodCourt

In [None]:
# Distribution of the FoodCourt attribute (no fitted curve in this plot)

sns.distplot(train.FoodCourt)

In [None]:
# Log-transform FoodCourt (log1p, so zero spending stays at 0).
# Any remaining missing value is replaced by the column mean first. The filled
# series is used directly rather than `fillna(..., inplace=True)` on a column
# selection, which is deprecated and has no effect under pandas Copy-on-Write.
# The transform is applied in place on the frames: run this cell only once.

for df in dades:
    food_court = df['FoodCourt']
    df['FoodCourt'] = np.log1p(food_court.fillna(food_court.mean()))

sns.distplot(train['FoodCourt'], fit=norm)

# ShoppingMall

# Spa

# VRDeck

# Tarifa total

In [None]:
# Create the Fare attribute: row-wise sum of the five spending columns,
# for both the training and the test set.
# The previous version reached the test frame through `df`, the variable left
# over from an earlier `for df in dades` loop; iterating explicitly removes
# that dependency on leaked loop state.
# NOTE(review): in the cells visible above, RoomService and FoodCourt are
# already log-transformed at this point while ShoppingMall, Spa and VRDeck are
# not, so the sum mixes scales -- confirm this is intended.

SPENDING_COLUMNS = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

for df in dades:
    df['Fare'] = df[SPENDING_COLUMNS].sum(axis=1)

In [None]:
# Distribution of the Fare attribute with a fitted normal curve overlaid

sns.distplot(train.Fare, fit=norm)

In [None]:
# Log-transform Fare (log1p, so a zero fare stays at 0).
# Any missing value is replaced by the column mean first. The filled series is
# used directly rather than `fillna(..., inplace=True)` on a column selection,
# which is deprecated and has no effect under pandas Copy-on-Write.
# The transform is applied in place on the frames: run this cell only once.

for df in dades:
    fare = df['Fare']
    df['Fare'] = np.log1p(fare.fillna(fare.mean()))

sns.distplot(train['Fare'], fit=norm)

In [None]:
# Create ordinal fare categories.
# Fare is first cut into (up to) six quantile bins, coded 0..5, and adjacent
# bins are then merged in pairs: codes 0-1 -> group 1, 2-3 -> group 2,
# 4-5 -> group 3.
#
# Changes with respect to the previous version:
#   * `duplicates='drop'` together with `labels=False`: when many passengers
#     share the same fare (e.g. 0), several quantile edges coincide and
#     `qcut` would otherwise raise "Bin edges must be unique". Fewer than six
#     bins may then be produced.
#   * The former chain of threshold comparisons never remapped code 4 and sent
#     code 5 to group 3, so the highest fares ended up *below* the second
#     highest. The pairwise merge keeps the groups monotone in Fare.

for df in dades:
    quantile_code = pd.qcut(df['Fare'], 6, labels=False, duplicates='drop')
    df['FareGroup'] = quantile_code.astype('int') // 2 + 1

sns.barplot(x="FareGroup", y="Transported", data=train)

# Correlació

In [None]:
# Correlation analysis (Pearson) between the numeric / boolean attributes.
# Only numeric and boolean columns are kept: since pandas 2.0 `DataFrame.corr()`
# raises on text columns instead of silently skipping them.

numeric_train = train.select_dtypes(include=['number', 'bool'])
correlation_matrix = numeric_train.corr()

plt.figure(figsize=(14,12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(correlation_matrix);