In [1]:
# importing packages to be used
import src.extraction as extract
import src.cleaning as clean
import src.visualization as viz

"""
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import tensorflow_decision_forests as tfdf
"""
import plotly.express as px
import plotly.graph_objects as go


In [2]:
#importing data
df = extract.get_data("data/train.csv")

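For the EDA portion, we start by inspecting the raw data: its first rows, summary statistics, column types, and missing values.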

In [3]:
# visualize original data
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [4]:
# Get some numerical estimates
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [5]:
# Check how data is organized.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [6]:
# Missing data 
df.isnull().sum().sort_values(ascending=False)

CryoSleep       217
ShoppingMall    208
VIP             203
HomePlanet      201
Name            200
Cabin           199
VRDeck          188
FoodCourt       183
Spa             183
Destination     182
RoomService     181
Age             179
PassengerId       0
Transported       0
dtype: int64

In [7]:
df.Destination.unique()

array(['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e', nan], dtype=object)

In [8]:
df.HomePlanet.unique()

array(['Europa', 'Earth', 'Mars', nan], dtype=object)

In [9]:
df.CryoSleep.unique()

array([False, True, nan], dtype=object)

In [10]:
df.VIP.unique()

array([False, True, nan], dtype=object)

In [11]:
# Break the two composite identifier columns into their parts.
# Cabin is split on '/' into Deck / Num / Side; rows with a missing Cabin
# end up with NaN in all three derived columns.
cabin_parts = df['Cabin'].str.split('/', expand=True)
df[['Deck', 'Num', 'Side']] = cabin_parts

# PassengerId is split on '_' into a group part and a per-group number part.
id_parts = df['PassengerId'].str.split('_', expand=True)
df[['group', 'number']] = id_parts

In [12]:
df.Deck.unique()

array(['B', 'F', 'A', 'G', nan, 'E', 'D', 'C', 'T'], dtype=object)

In [13]:
df.Side.unique()

array(['P', 'S', nan], dtype=object)

In [14]:
# Total expenditure per passenger across the five spending columns.
# DataFrame.sum skips NaN by default, so a missing amount contributes 0 and
# every row receives a total (TotalExpenses has no nulls, unlike its inputs).
expense_columns = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
df['TotalExpenses'] = df[expense_columns].sum(axis=1)

In [15]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side,group,number,TotalExpenses
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P,1,1,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S,2,1,736.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S,3,1,10383.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S,3,2,5176.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S,4,1,1091.0


In [16]:
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalExpenses
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0,8693.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791,1440.866329
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189,2803.045694
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0,716.0
75%,38.0,47.0,76.0,27.0,59.0,46.0,1441.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0,35987.0


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PassengerId    8693 non-null   object 
 1   HomePlanet     8492 non-null   object 
 2   CryoSleep      8476 non-null   object 
 3   Cabin          8494 non-null   object 
 4   Destination    8511 non-null   object 
 5   Age            8514 non-null   float64
 6   VIP            8490 non-null   object 
 7   RoomService    8512 non-null   float64
 8   FoodCourt      8510 non-null   float64
 9   ShoppingMall   8485 non-null   float64
 10  Spa            8510 non-null   float64
 11  VRDeck         8505 non-null   float64
 12  Name           8493 non-null   object 
 13  Transported    8693 non-null   bool   
 14  Deck           8494 non-null   object 
 15  Num            8494 non-null   object 
 16  Side           8494 non-null   object 
 17  group          8693 non-null   object 
 18  number  

In [18]:
df.TotalExpenses.sum()

12525451.0

In [19]:
df.CryoSleep.value_counts()

CryoSleep
False    5439
True     3037
Name: count, dtype: int64

After some initial exploration, I encapsulated the code into two stages: visualization and preparation for the model.
So we start over from the raw data!

In [20]:
# Rebind `df` to a fresh load of the training CSV (replacing the frame that was
# modified during the exploration above), then derive the frame used by all the
# plots below via the project's cleaning helper.
df = extract.get_data("data/train.csv")
df_viz = clean.data_viz(df)

In [21]:
df_viz.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Num,Side,Group,TotalExpenses
5767,6111_02,Europa,False,TRAPPIST-1e,32.0,False,0.0,0.0,0.0,4588.0,0.0,False,A,58,P,6111,4588.0
2182,2336_02,Europa,False,TRAPPIST-1e,44.0,True,1409.0,910.0,17.0,1452.0,3782.0,False,A,30,S,2336,7570.0
6048,6400_01,Europa,False,55 Cancri e,40.0,False,0.0,1677.0,0.0,109.0,320.0,True,A,76,S,6400,2106.0
6049,6400_02,Europa,False,55 Cancri e,34.0,True,0.0,618.0,0.0,0.0,10888.0,False,A,76,S,6400,11506.0
6067,6413_01,Europa,False,TRAPPIST-1e,65.0,,0.0,1420.0,,85.0,827.0,False,A,77,S,6413,2332.0


In [22]:
extract.save_csv("data/clean_viz.csv",df_viz)


In [23]:
viz.viz2(df_viz)

In [24]:
viz.viz1(df_viz)

In [25]:
viz.viz3(df_viz)

In [26]:
viz.viz4(df_viz)

In [27]:
viz.viz5(df_viz)

In [28]:
viz.viz6(df_viz)

In [30]:
viz.xy_boxplot(df_viz,'TotalExpenses','Deck' ,'Deck')

In [31]:
viz.simple_boxplot(df_viz,'Age' ,'Deck')

In [32]:
viz.simple_boxplot(df_viz,'Age' ,'CryoSleep')

# Exploring Expenses in general

In [33]:
viz.viz8(df_viz)

In [43]:
viz.xy_boxplot(df_viz,'TotalExpenses','HomePlanet' ,'Destination')

In [35]:
viz.simple_boxplot(df_viz,'TotalExpenses' ,'CryoSleep')

In [36]:
viz.simple_boxplot(df_viz,'TotalExpenses' ,'VIP')

In [37]:
# Box plot of each passenger's total expenses, grouped and coloured by home planet.
box_expensesHP = px.box(
    df_viz,
    x='HomePlanet',
    y='TotalExpenses',
    color='HomePlanet',
    title="Passenger's Total Expenses Distribution by Home Planet",
)

# Render the figure in the notebook output
box_expensesHP.show()

In [38]:
viz.simple_boxplot(df_viz,'TotalExpenses' ,'Destination')

In [39]:
viz.simple_boxplot(df_viz,'TotalExpenses' ,'Transported')

In [40]:
# Age distribution per deck, one facet row per destination.
# Plot from df_viz, like every other figure in this notebook: `df` was reloaded
# from train.csv above and its raw columns do not include 'Deck', so passing `df`
# here fails on a fresh Restart & Run All.
box_homeAge = px.box(df_viz, x='Deck', y='Age', color='Deck', facet_col='Destination',
              title="Passenger's age Distribution by Deck and Destination", facet_col_wrap=1)
box_homeAge.update_layout(
    width=1100,   # explicit figure size for the stacked single-column facets
    height=1100,
)
# Show the plot
box_homeAge.show()

In [41]:
# Age distribution per deck, one facet row per home planet.
box_homeAge = px.box(
    df_viz,
    x='Deck',
    y='Age',
    color='Deck',
    facet_col='HomePlanet',
    facet_col_wrap=1,
    title="Passenger's age Distribution by Deck and Home Planet",
)

# Render the figure in the notebook output
box_homeAge.show()

In [42]:
"""# Create a treemap chart
Cryo = px.treemap(df_viz, path=['CryoSleep', 'HomePlanet'])

# Show the plot
Cryo.show()
"""

ValueError: ('None entries cannot have not-None children', CryoSleep       nan
HomePlanet    Earth
Name: 2260, dtype: object)

In [None]:
#df_model = clean.train_transform(df_viz)

In [None]:
#df_model.head()

In [None]:
#df_model.info()

In [None]:
"""
# Split the data for train-test
X = df_model.drop("Transported", axis=1)
y = df_model["Transported"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
"""

'\n# Split the data for train-test\nX = df_model.drop("Transported", axis=1)\ny = df_model["Transported"]\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)\n'

In [None]:
#sbn.heatmap(data = X.corr())

In [None]:

# model = tfdf.keras.RandomForestModel()


In [None]:
"""
# Train the classifier on the training data
clf.fit(X, y)

test_sub = extract.get_data("data/test.csv")
test_viz = clean.data_viz(test_sub)
test_model = clean.train_transform(test_viz)

# Make predictions on the test data
y_pred = clf.predict(test_model)
"""

'\n# Train the classifier on the training data\nclf.fit(X, y)\n\ntest_sub = extract.get_data("data/test.csv")\ntest_viz = clean.data_viz(test_sub)\ntest_model = clean.train_transform(test_viz)\n\n# Make predictions on the test data\ny_pred = clf.predict(test_model)\n'

In [None]:
"""
y_pred
test_sub = extract.get_data("data/test.csv")
test_sub['Transported'] = y_pred
my_submission = test_sub[["PassengerId", "Transported"]]
"""

'\ny_pred\ntest_sub = extract.get_data("data/test.csv")\ntest_sub[\'Transported\'] = y_pred\nmy_submission = test_sub[["PassengerId", "Transported"]]\n'