In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

# Display every column when a DataFrame is rendered (None removes pandas' column limit).
pd.set_option('display.max_columns', None)

In [3]:
# `catalog` is not defined anywhere in this notebook; presumably it is injected
# by the Kedro Jupyter/IPython extension — TODO confirm.
train = catalog.load('train')
parameters = catalog.load('parameters')

# Column names are read from the project parameters instead of being hard-coded.
ID_COL = parameters['col_maps']['ID_COL']
TARGET_COL = parameters['col_maps']['TARGET_COL']

# Functions

## Feature functions

In [4]:
def gen_counts_per_cat_col(df, cat_col, feature_name):
    """Add a column holding, for each row, how many rows share its ``cat_col`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It is not modified; a new frame is returned.
    cat_col : str
        Name of the categorical column whose values are counted.
    feature_name : str
        Name of the count column to create. If a column with this name
        already exists it is overwritten, so calling the function twice
        with the same arguments gives the same result.

    Returns
    -------
    pandas.DataFrame
        Same rows, same order and same index as ``df``, plus ``feature_name``.
        Rows whose ``cat_col`` value is missing get NaN in ``feature_name``
        (which makes the column float when any value is missing).
    """
    # value_counts() excludes missing values by default, so NaN keys have no
    # entry in the lookup and map() leaves NaN for them.
    counts_by_value = df[cat_col].value_counts()

    # Mapping through the lookup keeps the caller's index and row order;
    # a merge would replace the index with a fresh RangeIndex and would emit
    # `<name>_x` / `<name>_y` columns when `feature_name` already exists.
    return df.assign(**{feature_name: df[cat_col].map(counts_by_value)})

def create_cat_features(input_df):
    """Return a copy of ``input_df`` with one count feature per configured column.

    For each of the columns ``Cabin_Num``, ``Cabin_Deck`` and ``Nickname``
    a new column is added through ``gen_counts_per_cat_col``; the name of
    each new column is given by the ``"Feature_Name"`` entry below.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame that must already contain the three source columns.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``input_df`` itself is left untouched.
    """
    # Source categorical column -> settings for the count feature built from it.
    count_feature_specs = {
        "Cabin_Num": {"Feature_Name": "People_in_Cabin_Num"},
        "Cabin_Deck": {"Feature_Name": "People_in_Cabin_Deck"},
        "Nickname": {"Feature_Name": "Family_Size"},
    }

    enriched = input_df.copy()
    for source_col, spec in count_feature_specs.items():
        enriched = gen_counts_per_cat_col(enriched, source_col, spec["Feature_Name"])
    return enriched

## Node Functions

In [5]:
parameters

{'model_options': {'test_size': 0.2, 'random_state': 3},
 'col_maps': {'TARGET_COL': 'Transported', 'ID_COL': 'PassengerId'},
 'col_selection': {'rm_categorical_cols': ['Cabin',
   'Cabin_Num',
   'Name',
   'Nickname'],
  'rm_numerical_cols': []}}

In [6]:
def preprocess_data(input_data, parameters):
    """Build the preprocessed frame used by the rest of the notebook.

    Steps, in order:
      1. copy ``input_data`` so the caller's frame is not modified;
      2. split ``Cabin`` on ``'/'`` into ``Cabin_Deck``, ``Cabin_Num`` and
         ``Cabin_Side``;
      3. store the second space-separated token of ``Name`` as ``Nickname``;
      4. add the count features from ``create_cat_features``;
      5. recode the target column as 1 where it is truthy and 0 otherwise.

    Parameters
    ----------
    input_data : pandas.DataFrame
        Frame containing ``Cabin``, ``Name`` and the target column.
    parameters : dict
        Must provide ``parameters["col_maps"]["TARGET_COL"]``, the name of
        the target column.

    Returns
    -------
    pandas.DataFrame
        The preprocessed copy.
    """
    frame = input_data.copy()

    cabin_parts = frame["Cabin"].str.split('/', expand=True)
    frame[["Cabin_Deck", "Cabin_Num", "Cabin_Side"]] = cabin_parts

    name_tokens = frame["Name"].str.split(" ")
    frame["Nickname"] = name_tokens.str[1]

    frame = create_cat_features(frame)

    # Local lookup: the target column name comes from the passed-in parameters,
    # not from any notebook-level constant.
    target_col = parameters["col_maps"]["TARGET_COL"]
    frame[target_col] = np.where(frame[target_col], 1, 0)
    return frame

## Main Pipeline Function

In [7]:
def create_training_data(train, parameters):
    """Placeholder for the main pipeline entry point.

    Not implemented yet: both arguments are ignored and ``None`` is returned.
    """
    # TODO: implement the training-data pipeline.
    return None

# Exploring

In [8]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


In [45]:
#processed_data = preprocess_data(train, parameters)

In [9]:
# NOTE: this import rebinds `create_cat_features`, replacing the function of the
# same name defined earlier in this notebook for every cell run after this one.
from spaceship_titanic.pipelines.preprocessing.utils import create_cat_features, select_cols

In [12]:
# Call the notebook's preprocess_data function instead of repeating its body
# inline, so the cell and the function cannot drift apart.
processed_data = preprocess_data(train, parameters)

In [13]:
processed_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported,Cabin_Deck,Cabin_Num,Cabin_Side,Nickname,People_in_Cabin_Num,People_in_Cabin_Deck,Family_Size
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,0,B,0,P,Ofracculy,18.0,779.0,1.0
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,1,F,0,S,Vines,18.0,2794.0,4.0
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,0,A,0,S,Susent,18.0,256.0,6.0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,0,A,0,S,Susent,18.0,256.0,6.0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,1,F,1,S,Santantines,15.0,2794.0,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,0,A,98,P,Noxnuther,11.0,256.0,3.0
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,0,G,1499,S,Mondalley,2.0,2559.0,2.0
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,1,G,1500,S,Connon,3.0,2559.0,6.0
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,0,E,608,S,Hontichre,3.0,876.0,6.0


In [10]:
# Rebinds `processed_data` to the output of select_cols, so running this cell
# again feeds the already-selected frame back into select_cols.
processed_data = select_cols(processed_data, parameters)

In [11]:
processed_data