In [None]:
import numpy as np
import pandas as pd


In [None]:
root = '/Users/mdiaspinto/Documents/School/Python Data Science/Final Project/kaggle-churn'
df_raw = pd.read_parquet(root + '/data/train.parquet')
unused = ['status', 'firstName', 'lastName', 'ts', 'auth', 'userAgent']
df_raw.drop(columns=unused, inplace=True)
df_raw.head()

In [None]:
root = '/Users/mdiaspinto/Documents/School/Python Data Science/Final Project/kaggle-churn'
df_test = pd.read_parquet(root + '/data/test.parquet')
df_test.drop(columns=unused, inplace=True)
df_test.head()

In [None]:
def feature_builder(df: pd.DataFrame, cutoff_date: pd.Timestamp) -> pd.DataFrame:
    
    # Create a slice of the dataframe up to the cutoff date and makes userId the index
    df_slice = df[df['time'] < cutoff_date].copy()
    idx = pd.Index(np.sort(df['userId'].unique()), name='userId')
    final_df = pd.DataFrame(index=idx)

    # Get key features from users at cutoff date
    user_group = df.groupby('userId')
    final_df['level'] = user_group["level"].last().reindex(idx)
    final_df['days_registered'] = cutoff_date - user_group["registration"].min().reindex(idx).dt.normalize()

    # Group sessions and defines start and end for each one
    session_group = df_slice.groupby(['userId', 'sessionId']).agg(
        session_start=('time', 'min'),
        session_end=('time', 'max'),
    )

    # Calculate session length in seconds
    session_group["session_length"] = (
    session_group["session_end"] - session_group["session_start"]
    ).dt.total_seconds()
    
    # Aggregate session statistics per user
    session_stats = session_group.groupby('userId').agg(
        num_sessions=('session_start', 'count'),
        avg_session_length=('session_length', 'mean'),
        days_since_last_session=('session_end', lambda x: (cutoff_date - x.max()).days),
    )

    

    return final_df

In [35]:
test = feature_builder(df_raw, pd.Timestamp("2018-10-15"))

In [36]:
test.head(10)

Unnamed: 0_level_0,level,days_registered
userId,Unnamed: 1_level_1,Unnamed: 2_level_1
1000025,paid,97 days
1000035,paid,33 days
1000083,paid,38 days
1000103,paid,23 days
1000164,paid,64 days
1000168,paid,68 days
1000182,paid,104 days
1000194,free,35 days
1000214,free,27 days
1000233,paid,17 days
