In [39]:
# importing packages to be used
import src.extraction as extract
import src.cleaning as clean
import src.visualization as viz

"""
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import tensorflow as tf
"""
import numpy as np
import seaborn as sbn
import plotly.express as px
import plotly.graph_objects as go
import pandas as pd

In [40]:
#importing data
df = extract.get_data("data/train.csv")

In [41]:
# visualize data
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [42]:
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0
75%,38.0,47.0,76.0,27.0,59.0,46.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [44]:
# Split Cabin code into 3 subgroups and Passenger Id into 2.
df[['Deck', 'Num', 'Side']] = df['Cabin'].str.split('/', expand=True)
df[['group', 'number']] = df['PassengerId'].str.split('_', expand=True)

In [45]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side,group,number
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S,2,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S,3,1
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S,3,2
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S,4,1


In [46]:
df.Destination.unique()

array(['TRAPPIST-1e', 'PSO J318.5-22', '55 Cancri e', nan], dtype=object)

In [47]:
df.HomePlanet.unique()

array(['Europa', 'Earth', 'Mars', nan], dtype=object)

In [48]:
df.Deck.unique()

array(['B', 'F', 'A', 'G', nan, 'E', 'D', 'C', 'T'], dtype=object)

In [49]:
df.Side.unique()

array(['P', 'S', nan], dtype=object)

In [50]:
# Calculate total expenditure per passenger
df['TotalExpenses'] = df[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum(axis=1)

In [51]:
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Deck,Num,Side,group,number,TotalExpenses
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False,B,0,P,1,1,0.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True,F,0,S,2,1,736.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False,A,0,S,3,1,10383.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False,A,0,S,3,2,5176.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True,F,1,S,4,1,1091.0


In [52]:
df.describe()

Unnamed: 0,Age,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,TotalExpenses
count,8514.0,8512.0,8510.0,8485.0,8510.0,8505.0,8693.0
mean,28.82793,224.687617,458.077203,173.729169,311.138778,304.854791,1440.866329
std,14.489021,666.717663,1611.48924,604.696458,1136.705535,1145.717189,2803.045694
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,19.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,0.0,0.0,0.0,0.0,0.0,716.0
75%,38.0,47.0,76.0,27.0,59.0,46.0,1441.0
max,79.0,14327.0,29813.0,23492.0,22408.0,24133.0,35987.0


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   PassengerId    8693 non-null   object 
 1   HomePlanet     8492 non-null   object 
 2   CryoSleep      8476 non-null   object 
 3   Cabin          8494 non-null   object 
 4   Destination    8511 non-null   object 
 5   Age            8514 non-null   float64
 6   VIP            8490 non-null   object 
 7   RoomService    8512 non-null   float64
 8   FoodCourt      8510 non-null   float64
 9   ShoppingMall   8485 non-null   float64
 10  Spa            8510 non-null   float64
 11  VRDeck         8505 non-null   float64
 12  Name           8493 non-null   object 
 13  Transported    8693 non-null   bool   
 14  Deck           8494 non-null   object 
 15  Num            8494 non-null   object 
 16  Side           8494 non-null   object 
 17  group          8693 non-null   object 
 18  number  

In [54]:
df.TotalExpenses.sum()

12525451.0

In [55]:
df.CryoSleep.value_counts()

CryoSleep
False    5439
True     3037
Name: count, dtype: int64

After I explored a little, I encapsulated the code dividing in 2 moments: visualizations and preparation for the model. 
So we restart!

In [56]:
df = extract.get_data("data/train.csv")
df_viz = clean.data_viz(df)

In [57]:
df_viz.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Deck,Num,Side,Group,TotalExpenses
750,0789_01,Europa,False,TRAPPIST-1e,31.0,True,0.0,166.0,4.0,2177.0,342.0,False,A,8,S,789,2689.0
741,0777_01,Europa,False,TRAPPIST-1e,30.0,False,0.0,3004.0,0.0,1018.0,0.0,False,A,5,P,777,4022.0
1177,1243_02,Europa,False,TRAPPIST-1e,32.0,False,8168.0,69.0,11.0,371.0,0.0,False,A,9,P,1243,8619.0
751,0789_02,Europa,False,TRAPPIST-1e,30.0,False,0.0,142.0,870.0,89.0,3497.0,False,A,8,S,789,4598.0
752,0789_03,Europa,True,TRAPPIST-1e,26.0,False,0.0,0.0,0.0,0.0,0.0,True,A,8,S,789,0.0


In [58]:
extract.save_csv("data/clean_viz.csv",df_viz)
df_viz.sort_values(by='HomePlanet', inplace=True)
df_viz.sort_values(by='Deck', inplace=True)

In [59]:
# Create a count plot of ages using Plotly Express
count_age = px.histogram(df_viz, x='Age', title='Count Plot of Ages')
count_age.update_xaxes(title='Age')
count_age.update_yaxes(title='Count')

# Show the plot
count_age.show()

In [93]:
pie_Transp = px.pie(df_viz, names='Transported', title="Passenger's Transported")
pie_Transp.update_layout(
    width=1100,  # Set the width to your desired value
    height=400,  # Set the height to your desired value
)

# Show the plot
pie_Transp.show()

In [60]:
import plotly.graph_objects as go
import pandas as pd

transpcolors = ['#66B2FF','#FF9999']
colors = ['#FF9999', '#66B2FF']
# Create subplots with 1 row and 3 columns
fig = go.Figure()

# Create a pie chart for 'Transported'
fig.add_trace(go.Pie(
    labels=['Transported', 'NOT Transported'],
    values=df_viz['Transported'].value_counts(),
    name='Transported',
    title='Transported',
    domain={'x': [0, 0.3]},
    marker=dict(colors=transpcolors)
))

# Create a pie chart for 'VIP'
fig.add_trace(go.Pie(
    labels=['NOT VIP', 'VIP'],
    values=df_viz['VIP'].value_counts(),
    name='VIP',
    title='VIP',
    domain={'x': [0.35, 0.65]},
     marker=dict(colors=colors)
))

# Create a pie chart for 'CryoSleep'
fig.add_trace(go.Pie(
    labels=['NOT CryoSleep', 'CryoSleep'],
    values=df_viz['CryoSleep'].value_counts(),
    name='CryoSleep',
    title='CryoSleep',
    domain={'x': [0.7, 1]},
    marker=dict(colors=colors)
))

# Update layout
fig.update_layout(
    title='Pie Charts for Transported, VIP, and CryoSleep',
    grid={'rows': 1, 'columns': 3},
)

# Show the plot
fig.show()



In [86]:
amenities = df_[['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']].sum()
total_spent = amenities.sum()

# Create a pie chart with the sums
pie_expense = px.pie(
    names=amenities.index,  # Use column names as labels
    values=amenities.values,  # Use column sums as values
    title="Expenses per Amenities")
pie_expense.update_layout(
    width=1100,  # Set the width to your desired value
    height=400,  # Set the height to your desired value
)

# Show the pie chart
pie_expense.show()

TypeError: histogram() got an unexpected keyword argument 'names'

In [62]:
box_Age = px.box(df_viz, y='Age', color='Deck',
              title="Passenger's age Distribution by Deck")

# Show the plot
box_Age.show()

In [63]:
box_AgeC = px.box(df_viz, y='Age', color='CryoSleep',
              title="Passenger's age Distribution in CryoSleep")

# Show the plot
box_AgeC.show()

In [64]:
box_AgeS = px.box(df_viz, x='Age', orientation='h',
              title="Passenger's age Distribution")
box_AgeS.update_layout(
    width=1100,  # Set the width to your desired value
    height=400,  # Set the height to your desired value
)
# Show the plot
box_AgeS.show()

In [82]:
# Create a count plot of ages using Plotly Express
custom_colors = ['#8C4843','#9CAFB7', '#ED9B40', '#64B6AC']


count_decksHome = px.histogram(df_viz, x='Deck', title='Count Plot per Deck and Home Planet', color='HomePlanet', color_discrete_sequence=custom_colors)
count_decksHome.update_xaxes(title='Deck')
count_decksHome.update_yaxes(title='Count')

# Show the plot
count_decksHome.show()

In [81]:
# Create a count plot of ages using Plotly Express
custom_colors = ['#8C4843', '#ED9B40','#9CAFB7', '#64B6AC']
count_decksDestination = px.histogram(df_viz, x='Deck', title='Count Plot per Deck and Destination', color='Destination',color_discrete_sequence=custom_colors)
count_decksDestination.update_xaxes(title='Deck')
count_decksDestination.update_yaxes(title='Count')

# Show the plot
count_decksDestination.show()

In [67]:
# Create box charts for passenger counts by deck and destination
box_homeAge = px.box(df_viz, x='Deck', y='Age', color='Deck', facet_col='HomePlanet',
              title="Passenger's age Distribution by Deck and Home Planet", facet_col_wrap=1)

# Show the plot
box_homeAge.show()

In [68]:
# Create box charts for passenger counts by deck and destination
box_homeAge = px.box(df, x='Deck', y='Age', color='Deck', facet_col='Destination',
              title="Passenger's age Distribution by Deck and Destination", facet_col_wrap=1)

# Show the plot
box_homeAge.show()

In [69]:
# Create box charts for passenger counts by deck and destination
box_expensesD = px.box(df_viz, x='Deck', y='TotalExpenses', color='Deck',
              title="Passenger's Total Expenses Distribution by Deck")

# Show the plot
box_expensesD.show()

In [70]:
# Create box charts for passenger counts by deck and destination
box_expensesHP = px.box(df_viz, x='HomePlanet', y='TotalExpenses', color='HomePlanet',
              title="Passenger's Total Expenses Distribution by Home Planet")

# Show the plot
box_expensesHP.show()

In [71]:
# Create box charts for passenger counts by deck and destination
box_expensesDest = px.box(df_viz, x='Destination', y='TotalExpenses', color='Destination',
              title="Passenger's Total Expenses Distribution by Destination")

# Show the plot
box_expensesDest.show()

In [72]:
# Create a treemap chart
Cryo = px.treemap(df_viz, path=['CryoSleep', 'HomePlanet'])

# Show the plot
Cryo.show()

In [73]:
#df_model = clean.train_transform(df_viz)

In [74]:
#df_model.head()

In [75]:
#df_model.info()

In [76]:
"""
# Split the data for train-test
X = df_model.drop("Transported", axis=1)
y = df_model["Transported"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
"""

'\n# Split the data for train-test\nX = df_model.drop("Transported", axis=1)\ny = df_model["Transported"]\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)\n'

In [77]:
sbn.heatmap(data = X.corr())

NameError: name 'X' is not defined

In [None]:
"""
# clf = DecisionTreeClassifier()
clf = DecisionTreeClassifier(max_depth=3, min_samples_split=2, criterion='gini')

# Train the classifier on the training data
clf.fit(X_train, y_train)

# Make predictions on the test data
y_pred = clf.predict(X_test)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
"""

'\n# clf = DecisionTreeClassifier()\nclf = DecisionTreeClassifier(max_depth=3, min_samples_split=2, criterion=\'gini\')\n\n# Train the classifier on the training data\nclf.fit(X_train, y_train)\n\n# Make predictions on the test data\ny_pred = clf.predict(X_test)\n\n# Evaluate the model\'s accuracy\naccuracy = accuracy_score(y_test, y_pred)\nprint(f"Accuracy: {accuracy}")\n'

In [None]:
"""
# Train the classifier on the training data
clf.fit(X, y)

test_sub = extract.get_data("data/test.csv")
test_viz = clean.data_viz(test_sub)
test_model = clean.train_transform(test_viz)

# Make predictions on the test data
y_pred = clf.predict(test_model)
"""

'\n# Train the classifier on the training data\nclf.fit(X, y)\n\ntest_sub = extract.get_data("data/test.csv")\ntest_viz = clean.data_viz(test_sub)\ntest_model = clean.train_transform(test_viz)\n\n# Make predictions on the test data\ny_pred = clf.predict(test_model)\n'

In [None]:
"""
y_pred
test_sub = extract.get_data("data/test.csv")
test_sub['Transported'] = y_pred
my_submission = test_sub[["PassengerId", "Transported"]]
"""

'\ny_pred\ntest_sub = extract.get_data("data/test.csv")\ntest_sub[\'Transported\'] = y_pred\nmy_submission = test_sub[["PassengerId", "Transported"]]\n'