In [1]:
import pandas as pd

In [2]:
# Drop PassengerId and Name, as these columns may not be necessary for the model

def drop(df):
    """Remove the identifier columns from ``df`` in place.

    Drops the ``PassengerId`` and ``Name`` columns directly on the frame
    that is passed in; nothing is returned.

    Columns that are already absent are skipped instead of raising
    ``KeyError``, so re-running the calling cell on a frame that has
    already been processed is a harmless no-op.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    """
    df.drop(columns=['PassengerId', 'Name'], inplace=True, errors='ignore')

In [3]:
# Locations of the raw train / test CSV files
train_path = '../spaceship-titanic_rawData/spaceship_train.csv'
test_path = '../spaceship-titanic_rawData/spaceship_test.csv'

# Read each file in one go (no chunked reading is used here); train first, then test
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)

# Show the training frame as the cell output
train_data

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8688,9276_01,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,Gravior Noxnuther,False
8689,9278_01,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,Kurta Mondalley,False
8690,9279_01,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,Fayey Connon,True
8691,9280_01,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,Celeon Hontichre,False


In [4]:
# Strip the identifier columns from both splits; drop() modifies each frame in place
for frame in (train_data, test_data):
    drop(frame)

# Show the training frame after the columns were removed
train_data

Unnamed: 0,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported
0,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False
1,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True
2,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False
3,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False
4,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...
8688,Europa,False,A/98/P,55 Cancri e,41.0,True,0.0,6819.0,0.0,1643.0,74.0,False
8689,Earth,True,G/1499/S,PSO J318.5-22,18.0,False,0.0,0.0,0.0,0.0,0.0,False
8690,Earth,False,G/1500/S,TRAPPIST-1e,26.0,False,0.0,0.0,1872.0,1.0,0.0,True
8691,Europa,False,E/608/S,55 Cancri e,32.0,False,0.0,1049.0,0.0,353.0,3235.0,False


In [5]:
import pandas as pd
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder

# Seed shared by the stochastic components below so that a fresh
# "Restart & Run All" reproduces the same imputations.
RANDOM_STATE = 42

# Combine training and test data. ignore_index=True gives the stacked frame
# a fresh 0..n-1 index instead of repeating the row labels of both inputs.
combined_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

# Separate the features (X) and the target variable (y) from combined data.
# presumably the test split has no 'Transported' column, so y is NaN for the
# rows that come from test_data -- TODO confirm
X = combined_data.drop('Transported', axis=1)
y = combined_data['Transported']

# Columns passed through the ordinal encoder.
# NOTE(review): 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa' and 'VRDeck'
# hold numeric amounts in the displayed data; encoding them replaces each
# amount with its category code. Left unchanged here -- confirm this is intended.
categorical_cols = ['HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']

# Encode the listed columns as ordinal codes using OrdinalEncoder
encoder = OrdinalEncoder()
X[categorical_cols] = encoder.fit_transform(X[categorical_cols])

# Create an instance of IterativeImputer with a random forest classifier as
# the per-feature estimator; both objects are seeded for reproducibility.
estimator = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=RANDOM_STATE)
imputer = IterativeImputer(estimator=estimator, max_iter=100, tol=1e-4, random_state=RANDOM_STATE)

In [None]:
# Number of rows per chunk handed to chunk_generator() for the imputation loop
chunk_size = 500

# Define a generator function to iterate over data in chunks
def chunk_generator(data, chunk_size):
    """Yield consecutive positional slices of ``data``.

    Each yielded slice holds at most ``chunk_size`` rows, taken with
    ``data.iloc``; the last slice may be shorter. Empty input yields nothing.

    Parameters
    ----------
    data : object supporting ``len()`` and ``.iloc`` slicing
        Typically a pandas DataFrame.
    chunk_size : int
        Maximum number of rows per slice; must be positive.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive. Because this is a generator,
        the error surfaces when iteration starts, not at call time.
    """
    # A zero or negative step would never advance past the end of the data.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

    for start in range(0, len(data), chunk_size):
        yield data.iloc[start:start + chunk_size]

# Remember where the training rows end BEFORE train_data is rebound below.
n_train = len(train_data)

# Perform imputation in chunks using the generator function.
# NOTE(review): fit_transform re-fits the imputer on every chunk, so each
# chunk is imputed only from its own rows.
imputed_data = []
for chunk in chunk_generator(X, chunk_size):
    imputed_chunk = imputer.fit_transform(chunk)
    imputed_data.extend(imputed_chunk)

# Convert imputed data back to DataFrame
imputed_df = pd.DataFrame(imputed_data, columns=X.columns)

# Split the imputed data back into training and test sets.
# Slice the DataFrame (not the raw list of rows) so the results keep their
# column names and still support DataFrame methods such as .to_csv().
train_data = imputed_df.iloc[:n_train].copy()
test_data = imputed_df.iloc[n_train:].reset_index(drop=True)

# Re-attach the target that was separated from the features as `y`, so the
# training split still carries its labels; rows are matched by position.
train_data['Transported'] = y.iloc[:n_train].to_numpy()

train_data



In [None]:
# Destination files for the preprocessed splits
train_export = '../preprocess_train_dataset/train_data_felipe.csv'
test_export = '../preprocess_test_dataset/test_data_felipe.csv'

# Same writer settings for both files: comma-separated, UTF-8, no index column
csv_options = dict(sep=',', encoding='utf-8', index=False)

train_data.to_csv(train_export, **csv_options)
test_data.to_csv(test_export, **csv_options)