In [10]:
# Display the value of every top-level expression in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = 'all'

In [1]:
# Load the Titanic passenger records from the local parquet file and preview them.
import pandas as pd

df = pd.read_parquet(path='titanic.parquet')
df.head(n=5)  # first five rows

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,event_timestamp
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,2023-01-01
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,2023-01-01
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,2023-01-01
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,2023-01-01
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,2023-01-01


In [2]:
# Summarise the frame: column dtypes, non-null counts and memory usage.
df.info()

# For a deeper exploration, consider:
#   ydata-profiling
#   df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   PassengerId      891 non-null    int64         
 1   Survived         891 non-null    int64         
 2   Pclass           891 non-null    int64         
 3   Name             891 non-null    object        
 4   Sex              891 non-null    object        
 5   Age              714 non-null    float64       
 6   SibSp            891 non-null    int64         
 7   Parch            891 non-null    int64         
 8   Ticket           891 non-null    object        
 9   Fare             891 non-null    float64       
 10  Cabin            204 non-null    object        
 11  Embarked         889 non-null    object        
 12  event_timestamp  891 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(2), int64(5), object(5)
memory usage: 90.6+ KB


In [3]:
# Prediction target: the `Survived` column (1 = yes).
target = 'Survived'
# Class balance of the target.
df.value_counts(subset=target)

Survived
0    549
1    342
dtype: int64

In [4]:
# Which data elements / features help us predict survival?
#   Rank the numeric columns by their correlation with the target.

pd.set_option('display.float_format', '{:,.2f}'.format)
df.corr(numeric_only=True).loc[:, [target]].sort_values(by=target)

# Strongest correlations by absolute value:
#   Pclass (negative)
#   Fare   (positive)

Unnamed: 0,Survived
Pclass,-0.34
Age,-0.08
SibSp,-0.04
PassengerId,-0.01
Parch,0.08
Fare,0.26
Survived,1.0


In [5]:
from sklearn.linear_model import LogisticRegression

def score(X, y, cv=None):
    """Fit a logistic regression on ``X``/``y`` and return its mean accuracy.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Class label for each row of ``X``.
    cv : int or scikit-learn cross-validation splitter, optional
        ``None`` (the default) keeps the original behaviour: the model is
        fitted on all of ``X``/``y`` and scored on those same rows, i.e. the
        result is *training* accuracy and therefore optimistic.
        Pass e.g. ``cv=5`` to get the mean held-out accuracy from
        cross-validation instead.

    Returns
    -------
    float
        Mean accuracy (training accuracy, or the mean over CV folds).
    """
    model = LogisticRegression(random_state=0)
    if cv is None:
        # Original behaviour: evaluate on the data the model was trained on.
        return model.fit(X, y).score(X, y)

    # Local import so the function stays self-contained within its cell.
    from sklearn.model_selection import cross_val_score

    return cross_val_score(model, X, y, cv=cv).mean()

# Baseline: score the two raw columns most correlated with the target.
features = ['Fare', 'Pclass']
print(f'using {features} we score {score(df[features].to_numpy(), df[target].to_numpy()):.2f}')


using ['Fare', 'Pclass'] we score 0.68


In [6]:
# Alternative: pull prepared inputs from a feature store.
import feast_repo

# This feature view contains additional features:
#   - gender, one-hot encoded
#   - title (like Major, Countess, Col) parsed from Name
feature_view = 'passenger_stats'

# Names of every feature the view exposes.
feast_features = [
    feature.name
    for feature in feast_repo.store.get_feature_view(feature_view).features
]
print(f'feast_features = {feast_features}')

running  c:\Users\hvu7470\Documents\features\feast_repo\__init__.py
feast_features = ['Fare', 'Pclass', 'encoded_title', 'female', 'male']


In [7]:
# Fetch the feature-view columns from the feature store, passing `df` as the
# lookup frame, then keep only the model inputs plus the target column.
# NOTE(review): `feast_repo.store` is presumably a Feast FeatureStore, where the
# first argument is the entity dataframe -- confirm against feast_repo.
df_feast = (
    feast_repo.store
    .get_historical_features(df, [f'{feature_view}:{name}' for name in feast_features])
    .to_df()
    [feast_features + [target]]
)
# Seed the preview so a fresh "Restart & Run All" shows the same rows each time.
df_feast.sample(5, random_state=0)

Unnamed: 0,Fare,Pclass,encoded_title,female,male,Survived
383,57.0,1,2,0,1,1
249,7.78,3,1,1,0,0
663,29.0,2,0,0,1,1
405,24.15,3,0,0,1,0
492,7.05,3,0,0,1,0


In [13]:
# Compare the baseline columns against the feature-store columns.
print(
    'previously we scored '
    f'{score(df[features].to_numpy(), df[target].to_numpy()):.2f}'
)

print(
    'with feature store, we score '
    f'{score(df_feast[feast_features].to_numpy(), df_feast[target].to_numpy()):.2f}'
)


previously we scored 0.68
with feature store, we score 0.79
