In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from datetime import datetime, timedelta

from category_encoders import MEstimateEncoder

In [2]:
# Load the Titanic training table from the feature repo's data directory.
train_parquet_path = "feast_demo/feature_repo/data/titanic_train.parquet"
df = pd.read_parquet(train_parquet_path)

Remove rows that contain missing (NaN) values

In [3]:
# Keep only fully populated rows: a row is removed if ANY of its columns is missing.
# NOTE(review): this also discards rows whose only gap is in a column that is not
# used as a feature below (e.g. Name, Ticket, Cabin, Embarked) — confirm that is
# intended, or restrict the check with `subset=`.
df = df.dropna(axis=0, how="any")

In [4]:
# Renumber the surviving rows 0..n-1 and discard the old index labels, so that
# later label-aligned assignments against frames with a default index line up.
df = df.reset_index(drop=True)

In [5]:
# Display the cleaned frame (a cell renders the value of its last expression).
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,event_timestamp
0,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2023-03-05 14:51:48.926438
1,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S,2023-03-05 14:51:48.926438
2,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S,2023-03-05 14:51:48.926438
3,11,1,3,"Sandstrom, Miss. Marguerite Rut",female,4.0,1,1,PP 9549,16.7000,G6,S,2023-03-05 14:51:48.926438
4,12,1,1,"Bonnell, Miss. Elizabeth",female,58.0,0,0,113783,26.5500,C103,S,2023-03-05 14:51:48.926438
...,...,...,...,...,...,...,...,...,...,...,...,...,...
178,872,1,1,"Beckwith, Mrs. Richard Leonard (Sallie Monypeny)",female,47.0,1,1,11751,52.5542,D35,S,2023-03-05 14:51:48.926438
179,873,0,1,"Carlsson, Mr. Frans Olof",male,33.0,0,0,695,5.0000,B51 B53 B55,S,2023-03-05 14:51:48.926438
180,880,1,1,"Potter, Mrs. Thomas Jr (Lily Alexenia Wilson)",female,56.0,0,1,11767,83.1583,C50,C,2023-03-05 14:51:48.926438
181,888,1,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S,2023-03-05 14:51:48.926438


Pick the numeric features to use for PCA

In [6]:
# Numeric columns that will be fed into PCA.
features = ["Pclass", "Age", "SibSp", "Parch", "Fare"]

# Correlation of each candidate feature with the target column.
survival_corr = df[features].corrwith(df["Survived"])

print("Correlation with Survived:\n")
print(survival_corr)

Correlation with Survived:

Pclass   -0.034542
Age      -0.254085
SibSp     0.106346
Parch     0.023582
Fare      0.134241
dtype: float64


In [7]:
def score_dataset(X, y, model=None):
    """Return the 5-fold cross-validated RMSLE of ``model`` on ``(X, y)``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Columns of dtype ``category`` or ``object`` are
        label-encoded with ``factorize`` on an internal copy, so the caller's
        frame is left unmodified.
    y : array-like
        Target values passed through to ``cross_val_score``.
    model : estimator, optional
        Estimator passed to ``cross_val_score``. When omitted, a new
        ``XGBRegressor()`` is created for this call (instead of one instance
        built at function-definition time and shared by every call).

    Returns
    -------
    float
        Square root of the mean of the per-fold mean squared log errors.
    """
    if model is None:
        model = XGBRegressor()

    # Work on a copy: the encoding below must not leak into the caller's data.
    X = X.copy()

    # Label encoding for categoricals
    for colname in X.select_dtypes(["category", "object"]):
        X[colname], _ = X[colname].factorize()

    # Scoring metric is RMSLE (Root Mean Squared Log Error). scikit-learn
    # reports the negated MSLE, so flip the sign before taking the root.
    fold_scores = cross_val_score(
        model, X, y, cv=5, scoring="neg_mean_squared_log_error",
    )
    return np.sqrt(-1 * fold_scores.mean())

def apply_pca(X, standardize=True):
    """Fit a PCA on ``X`` and return the model, the scores and the loadings.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature matrix, one row per observation.
    standardize : bool, default True
        When true, each column is centred on its mean and divided by its
        standard deviation before fitting. A constant column has a standard
        deviation of zero and would therefore turn into NaN values.

    Returns
    -------
    pca : PCA
        The fitted estimator.
    X_pca : pandas.DataFrame
        Component scores with columns ``PC1``, ``PC2``, ... The frame carries
        the same index as ``X`` so it can be aligned with the source rows.
    loadings : pandas.DataFrame
        ``pca.components_`` transposed: rows are the original features,
        columns are the principal components.
    """
    # Standardize
    if standardize:
        X = (X - X.mean(axis=0)) / X.std(axis=0)

    # Create principal components
    pca = PCA()
    scores = pca.fit_transform(X)

    # Convert to dataframe, keeping the input's row labels so that later
    # label-aligned joins against the source frame stay correct.
    component_names = [f"PC{i + 1}" for i in range(scores.shape[1])]
    X_pca = pd.DataFrame(scores, columns=component_names, index=X.index)

    # Create loadings
    loadings = pd.DataFrame(
        pca.components_.T,  # transpose the matrix of loadings
        columns=component_names,  # so the columns are the principal components
        index=X.columns,  # and the rows are the original features
    )
    return pca, X_pca, loadings

In [8]:
# Columns that go into the PCA.
features = ["Pclass", "Age", "SibSp", "Parch", "Fare"]

# Target and feature matrix, both taken as independent copies of `df`.
y = df["Survived"].copy()
X = df.loc[:, features].copy()

# Fit the PCA via `apply_pca` (defined in an earlier cell) and show how each
# original feature contributes to each principal component.
pca, X_pca, loadings = apply_pca(X)
print(loadings)

             PC1       PC2       PC3       PC4       PC5
Pclass -0.142195  0.723847 -0.025308  0.373630  0.561770
Age    -0.313982 -0.597750  0.007917  0.702886  0.223604
SibSp   0.489185 -0.045062 -0.851333  0.182807  0.021947
Parch   0.561872  0.182161  0.417734  0.538595 -0.431893
Fare    0.571134 -0.289010  0.316272 -0.207002  0.668881


In [9]:
# Give every PCA row the same event timestamp: the wall-clock time of this run.
# NOTE(review): datetime.now() is timezone-naive local time; if the consumer of
# this file interprets naive timestamps as UTC the rows may appear shifted —
# confirm the expected timezone.
pca_event_time = datetime.now()
X_pca["event_timestamp"] = pca_event_time


In [10]:
# Attach the entity key to the PCA scores. The rows of `X_pca` are in the same
# order as the rows of `df` (the PCA input was sliced straight from `df`), so
# assign by position: a Series on the right-hand side would be aligned on index
# labels and silently produce NaN or shifted ids if the two indexes differ.
X_pca['PassengerId'] = df['PassengerId'].to_numpy()

In [11]:
# Display the PCA scores together with the timestamp and PassengerId columns
# added in the preceding cells.
X_pca

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,event_timestamp,PassengerId
0,0.003464,-0.481794,-0.991004,-0.201495,0.050206,2023-03-07 23:36:42.196729,2
1,-0.072348,-0.298333,-1.067847,-0.286986,-0.151978,2023-03-07 23:36:42.196729,4
2,-1.222364,-0.949682,0.258262,0.286256,0.074684,2023-03-07 23:36:42.196729,7
3,0.470111,4.075396,-0.779002,0.583059,0.694532,2023-03-07 23:36:42.196729,11
4,-1.492001,-1.006702,0.155429,0.534607,-0.089904,2023-03-07 23:36:42.196729,12
...,...,...,...,...,...,...,...
178,0.427300,-0.513390,-0.510465,0.967391,-0.557573,2023-03-07 23:36:42.196729,872
179,-1.151445,0.030121,0.053506,-0.530225,-0.636039,2023-03-07 23:36:42.196729,873
180,-0.283813,-0.903174,0.942488,1.004997,-0.194882,2023-03-07 23:36:42.196729,880
181,-0.683439,0.470424,0.149984,-1.227033,-0.617123,2023-03-07 23:36:42.196729,888


In [12]:
# Persist the PCA features (scores, event_timestamp, PassengerId) as parquet
# in the feature repo's data directory.
pca_feature_path = "feast_demo/feature_repo/data/titanic_pca_feature.parquet"
X_pca.to_parquet(pca_feature_path)

In [13]:
# Entity key, model inputs and the target, in the order they should appear.
columns = ["PassengerId", "Pclass", "Age", "Sex", "SibSp", "Parch", "Fare", "Survived"]

# Independent copy of `df` restricted to those columns.
preprocessed_df = df.loc[:, columns].copy()

preprocessed_df

Unnamed: 0,PassengerId,Pclass,Age,Sex,SibSp,Parch,Fare,Survived
0,2,1,38.0,female,1,0,71.2833,1
1,4,1,35.0,female,1,0,53.1000,1
2,7,1,54.0,male,0,0,51.8625,0
3,11,3,4.0,female,1,1,16.7000,1
4,12,1,58.0,female,0,0,26.5500,1
...,...,...,...,...,...,...,...,...
178,872,1,47.0,female,1,1,52.5542,1
179,873,1,33.0,male,0,0,5.0000,0
180,880,1,56.0,female,0,1,83.1583,1
181,888,1,19.0,female,0,0,30.0000,1


In [14]:
# Overwrite the event timestamp of every row with the wall-clock time of this
# run, then persist the frame as parquet.
# NOTE(review): this writes the full `df` (all original columns), not the
# column-reduced `preprocessed_df` built in the previous cell, which is never
# saved — confirm which frame the "preprocessed" file is meant to contain.
train_event_time = datetime.now()
df["event_timestamp"] = train_event_time

train_preprocessed_path = "feast_demo/feature_repo/data/titanic_train_preprocessed.parquet"
df.to_parquet(train_preprocessed_path)