In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd

In [None]:
df = pd.read_csv("customer_purchases.csv")

df["purchase_date"] = pd.DatetimeIndex(df["purchase_date"])

df = df.sort_values(by='purchase_date', ascending=True)
df.head()

In [None]:
df_train = df[int(df.shape[0]*0.2):]
df_test = df[:int(df.shape[0]*0.2)]
df_new_new = df_test[:int(df_test.shape[0]*0.5)]
df_old_new = df_test[int(df_test.shape[0]*0.5):]

In [None]:
df_train.reset_index().to_csv("train.csv",index=False)
df_test.reset_index().to_csv("test.csv",index=False)
df_new_new.reset_index().to_csv("new_new.csv",index=False)
df_old_new.reset_index().to_csv("old_new.csv",index=False)

In [None]:
df_train.head()

In [52]:
import pandas as pd
import numpy as np

class FeatureProcessor:
    offline_features=[
            "days_since_prev_purchase",
            "prev_purchase_date",
            "prev_purchase_amount",
            "rolling_mean_prev_amount"
        ]

    online_features = [
            'customer_id',
            "purchase_date",
            'age',
            'gender',
            'annual_income',
            'purchase_day',
            'purchase_month',
            'purchase_dayofweek',
            'purchase_hour',
            'is_weekend'
        ]
     
    def process(self, df: pd.DataFrame) -> pd.DataFrame:
        """
        Offline features - using historical data
        """
        # Make sure data is sorted by date
        df = df.sort_values(['customer_id', 'purchase_date'])
        
        # Previous purchase features using historical data
        df['prev_purchase_date'] = df.groupby('customer_id')['purchase_date'].shift(1)
        df['prev_purchase_amount'] = df.groupby('customer_id')['purchase_amount'].shift(1)
        df['days_since_prev_purchase'] = (pd.to_datetime(df['purchase_date']) - pd.to_datetime(df['prev_purchase_date'])).dt.days
        
        # Rolling mean of previous purchases
        df['rolling_mean_prev_amount'] = df.groupby('customer_id')['prev_purchase_amount'].rolling(
            window=3, min_periods=1
        ).mean().reset_index(0, drop=True)
        
        # Fill NaN values
        df['days_since_prev_purchase'] = df['days_since_prev_purchase'].fillna(-1)
        #df['prev_purchase_date'] = df['prev_purchase_date'].fillna(-1)
        df['prev_purchase_amount'] = df['prev_purchase_amount'].fillna(0)
        df['rolling_mean_prev_amount'] = df['rolling_mean_prev_amount'].fillna(0)
        
           # Time-based features from current transaction
        df['purchase_day'] = pd.to_datetime(df['purchase_date']).dt.day
        df['purchase_month'] = pd.to_datetime(df['purchase_date']).dt.month
        df['purchase_dayofweek'] = pd.to_datetime(df['purchase_date']).dt.dayofweek
        df['purchase_hour'] = pd.to_datetime(df['purchase_date']).dt.hour
        df['is_weekend'] = df['purchase_dayofweek'].isin([5,6]).astype(int)
     

       

        return df[self.offline_features+self.online_features]

    # def online_features(self, df: pd.DataFrame) -> pd.DataFrame:
    #     """
    #     Online features - current transaction info only
    #     """
    #     df = df.copy()
        
    #     # Time-based features from current transaction
    #     df['purchase_day'] = pd.to_datetime(df['purchase_date']).dt.day
    #     df['purchase_month'] = pd.to_datetime(df['purchase_date']).dt.month
    #     df['purchase_dayofweek'] = pd.to_datetime(df['purchase_date']).dt.dayofweek
    #     df['purchase_hour'] = pd.to_datetime(df['purchase_date']).dt.hour
    #     df['is_weekend'] = df['purchase_dayofweek'].isin([5,6]).astype(int)
        
    #     # Current transaction features
    #     online_features = [
    #         'customer_id',
    #         "purchase_date",
    #         'age',
    #         'gender',
    #         'annual_income',
    #         'purchase_day',
    #         'purchase_month',
    #         'purchase_dayofweek',
    #         'purchase_hour',
    #         'is_weekend'
    #     ]
        
    #     return df[online_features]


In [53]:
df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,index,customer_id,age,gender,annual_income,purchase_amount,purchase_date
0,8442,1720,34.0,Female,106900.0,312.81,2023-08-06 19:16:58+03:00
1,8201,1672,52.0,Male,77805.0,509.45,2023-08-06 19:16:58+03:00
2,5704,1182,19.0,Female,59325.0,352.38,2023-08-06 19:16:58+03:00
3,5717,1184,20.0,Female,65020.0,205.59,2023-08-06 19:16:58+03:00
4,9476,1935,22.0,Female,69810.0,182.54,2023-08-06 19:16:58+03:00


In [54]:
processor = FeatureProcessor()
train_features = processor.process(df)

# Get offline features (historical)
# offline_df = processor.offline_features(df)
# #print("\nOffline Features (using historical data):")
# #print(offline_df[['customer_id', 'prev_purchase_amount', 'days_since_prev_purchase', 'rolling_mean_prev_amount']].head())

# # Get online features (current only)
# online_df = processor.online_features(df)
# #print("\nOnline Features (current transaction only):")
# #print(online_df.head())

In [55]:
train_features

Unnamed: 0,days_since_prev_purchase,prev_purchase_date,prev_purchase_amount,rolling_mean_prev_amount,customer_id,purchase_date,age,gender,annual_income,purchase_day,purchase_month,purchase_dayofweek,purchase_hour,is_weekend
1871,-1.0,,0.00,0.000000,1,2023-10-12 19:16:58+03:00,40.0,Female,119228.0,12,10,3,19,0
2183,11.0,2023-10-12 19:16:58+03:00,775.89,775.890000,1,2023-10-23 19:16:58+03:00,40.0,Female,119228.0,23,10,0,19,0
2544,13.0,2023-10-23 19:16:58+03:00,1247.51,1011.700000,1,2023-11-05 19:16:58+03:00,40.0,Female,119228.0,5,11,6,19,1
2962,17.0,2023-11-05 19:16:58+03:00,428.70,817.366667,1,2023-11-22 19:16:58+03:00,40.0,Female,119228.0,22,11,2,19,0
3427,18.0,2023-11-22 19:16:58+03:00,986.86,887.690000,1,2023-12-10 19:16:58+03:00,40.0,Female,119228.0,10,12,6,19,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3037,23.0,2023-11-01 19:16:58+03:00,670.05,525.450000,2000,2023-11-24 19:16:58+03:00,35.0,Female,32476.0,24,11,4,19,0
4595,58.0,2023-11-24 19:16:58+03:00,302.67,483.440000,2000,2024-01-21 19:16:58+03:00,35.0,Female,32476.0,21,1,6,19,1
5248,24.0,2024-01-21 19:16:58+03:00,33.09,335.270000,2000,2024-02-14 19:16:58+03:00,35.0,Female,32476.0,14,2,2,19,0
6520,48.0,2024-02-14 19:16:58+03:00,618.88,318.213333,2000,2024-04-02 19:16:58+03:00,35.0,Female,32476.0,2,4,1,19,0


In [57]:
train_features.to_parquet("train_features.parquet")

In [None]:
from datetime import timedelta
from feast import Entity,Feature,FeatureView,FileSource,ValueType

# Define entity
customer = Entity(
    name="customer",
    value_type=ValueType.INT64,
    description="customer id",
)

# Define data sources
batch_source = FileSource(
    path="data/train_features.parquet",  # Will create this from training data
    timestamp_field="event_timestamp",
)

# Define feature view
customer_features = FeatureView(
    name="customer_features",
    ttl=timedelta(days=1),
    entities=[customer],
    features=[
        Feature(name="prev_purchase_amount", dtype=ValueType.FLOAT),
        Feature(name="avg_purchase_30d", dtype=ValueType.FLOAT),
        Feature(name="purchase_count_30d", dtype=ValueType.INT64),
        Feature(name="days_since_prev_purchase", dtype=ValueType.INT64),
        Feature(name="rolling_mean_amount", dtype=ValueType.FLOAT),
        Feature(name="rolling_std_amount", dtype=ValueType.FLOAT),
    ],
    online=True,
    source=batch_source,
)

In [56]:
train_features['prev_purchase_date'] = pd.DatetimeIndex(train_features['prev_purchase_date'])

In [45]:
processor.online_features+processor.offline_features

['customer_id',
 'purchase_date',
 'age',
 'gender',
 'annual_income',
 'purchase_day',
 'purchase_month',
 'purchase_dayofweek',
 'purchase_hour',
 'is_weekend',
 'days_since_prev_purchase',
 'prev_purchase_date',
 'prev_purchase_amount',
 'rolling_mean_prev_amount']

In [46]:
features

Unnamed: 0,days_since_prev_purchase,prev_purchase_date,prev_purchase_amount,rolling_mean_prev_amount,customer_id,purchase_date,age,gender,annual_income,purchase_day,purchase_month,purchase_dayofweek,purchase_hour,is_weekend
1871,-1.0,-1,0.00,0.000000,1,2023-10-12 19:16:58+03:00,40.0,Female,119228.0,12,10,3,19,0
2183,11.0,2023-10-12 19:16:58+03:00,775.89,775.890000,1,2023-10-23 19:16:58+03:00,40.0,Female,119228.0,23,10,0,19,0
2544,13.0,2023-10-23 19:16:58+03:00,1247.51,1011.700000,1,2023-11-05 19:16:58+03:00,40.0,Female,119228.0,5,11,6,19,1
2962,17.0,2023-11-05 19:16:58+03:00,428.70,817.366667,1,2023-11-22 19:16:58+03:00,40.0,Female,119228.0,22,11,2,19,0
3427,18.0,2023-11-22 19:16:58+03:00,986.86,887.690000,1,2023-12-10 19:16:58+03:00,40.0,Female,119228.0,10,12,6,19,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3037,23.0,2023-11-01 19:16:58+03:00,670.05,525.450000,2000,2023-11-24 19:16:58+03:00,35.0,Female,32476.0,24,11,4,19,0
4595,58.0,2023-11-24 19:16:58+03:00,302.67,483.440000,2000,2024-01-21 19:16:58+03:00,35.0,Female,32476.0,21,1,6,19,1
5248,24.0,2024-01-21 19:16:58+03:00,33.09,335.270000,2000,2024-02-14 19:16:58+03:00,35.0,Female,32476.0,14,2,2,19,0
6520,48.0,2024-02-14 19:16:58+03:00,618.88,318.213333,2000,2024-04-02 19:16:58+03:00,35.0,Female,32476.0,2,4,1,19,0


In [29]:
online_df.head()

Unnamed: 0,customer_id,purchase_date,age,gender,annual_income,purchase_day,purchase_month,purchase_dayofweek,purchase_hour,is_weekend
0,1720,2023-08-06 19:16:58+03:00,34.0,Female,106900.0,6,8,6,19,1
1,1672,2023-08-06 19:16:58+03:00,52.0,Male,77805.0,6,8,6,19,1
2,1182,2023-08-06 19:16:58+03:00,19.0,Female,59325.0,6,8,6,19,1
3,1184,2023-08-06 19:16:58+03:00,20.0,Female,65020.0,6,8,6,19,1
4,1935,2023-08-06 19:16:58+03:00,22.0,Female,69810.0,6,8,6,19,1


In [30]:
offline_df.head()

Unnamed: 0,customer_id,purchase_date,days_since_prev_purchase,prev_purchase_date,prev_purchase_amount,rolling_mean_prev_amount
1871,1,2023-10-12 19:16:58+03:00,-1.0,-1,0.0,0.0
2183,1,2023-10-23 19:16:58+03:00,11.0,2023-10-12 19:16:58+03:00,775.89,775.89
2544,1,2023-11-05 19:16:58+03:00,13.0,2023-10-23 19:16:58+03:00,1247.51,1011.7
2962,1,2023-11-22 19:16:58+03:00,17.0,2023-11-05 19:16:58+03:00,428.7,817.366667
3427,1,2023-12-10 19:16:58+03:00,18.0,2023-11-22 19:16:58+03:00,986.86,887.69


In [34]:
online_df.shape,offline_df.shape

((7844, 10), (7844, 6))

In [33]:
pd.merge(online_df,offline_df,on=["customer_id","purchase_date"])

Unnamed: 0,customer_id,purchase_date,age,gender,annual_income,purchase_day,purchase_month,purchase_dayofweek,purchase_hour,is_weekend,days_since_prev_purchase,prev_purchase_date,prev_purchase_amount,rolling_mean_prev_amount
0,1720,2023-08-06 19:16:58+03:00,34.0,Female,106900.0,6,8,6,19,1,-1.0,-1,0.00,0.000000
1,1672,2023-08-06 19:16:58+03:00,52.0,Male,77805.0,6,8,6,19,1,-1.0,-1,0.00,0.000000
2,1182,2023-08-06 19:16:58+03:00,19.0,Female,59325.0,6,8,6,19,1,-1.0,-1,0.00,0.000000
3,1184,2023-08-06 19:16:58+03:00,20.0,Female,65020.0,6,8,6,19,1,-1.0,-1,0.00,0.000000
4,1935,2023-08-06 19:16:58+03:00,22.0,Female,69810.0,6,8,6,19,1,-1.0,-1,0.00,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7959,1307,2024-05-22 19:16:58+03:00,25.0,Female,120039.0,22,5,2,19,0,93.0,2024-02-19 19:16:58+03:00,892.05,764.956667
7960,922,2024-05-22 19:16:58+03:00,33.0,Male,79884.0,22,5,2,19,0,61.0,2024-03-22 19:16:58+03:00,1120.60,946.790000
7961,922,2024-05-22 19:16:58+03:00,33.0,Male,79884.0,22,5,2,19,0,0.0,2024-05-22 19:16:58+03:00,354.10,756.413333
7962,1357,2024-05-22 19:16:58+03:00,19.0,Male,39695.0,22,5,2,19,0,20.0,2024-05-02 19:16:58+03:00,367.77,512.740000


In [4]:

if __name__ == "__main__":
    # Load data
    df = pd.read_csv('old_data.csv')
    
    # Initialize processor
    processor = FeatureProcessor()a
    
    # Get offline features (historical)
    offline_df = processor.offline_features(df)
#    print("\nOffline Features (using historical data):")
#    print(offline_df[['customer_id', 'prev_purchase_amount', 'days_since_prev_purchase', 'rolling_mean_prev_amount']].head())
    
    # Get online features (current only)
    online_df = processor.online_features(df)
#    print("\nOnline Features (current transaction only):")
#    print(online_df.head())

SyntaxError: invalid syntax (3131602563.py, line 6)

In [None]:
df_train["prev_purchase_date"] = df_train.groupby("customer_id")["purchase_date"].shift(1)
df_train['prev_purchase_amount'] = df_train.groupby('customer_id')['purchase_amount'].shift(1)
df_train["days_since_prev_purchase"] = (pd.to_datetime(df_train['purchase_date']) - pd.to_datetime(df_train['prev_purchase_date'])).dt.days
df_train["purchase_day"] = df_train.purchase_date.dt.day
df_train["purchase_month"] = df_train.purchase_date.dt.month

df_train['days_since_prev_purchase'] = df_train['days_since_prev_purchase'].fillna(-1)
df_train['prev_purchase_date'] = df_train['prev_purchase_date'].fillna(-1)
df_train['prev_purchase_amount'] = df_train['prev_purchase_amount'].fillna(0)


df_train['rolling_mean_prev_amount'] = df_train.groupby('customer_id')['prev_purchase_amount'].rolling(
            window=3, min_periods=1
        ).mean().reset_index(0, drop=True)

In [None]:
df_train[df_train["customer_id"]==1547]

In [None]:
from feast import FeatureStore

In [None]:
Fea

In [None]:
df_train[df_train["customer_id"]==169]

In [None]:
df = df[:int(df.shape[0]*0.2)]

df_new = df[int(df.shape[0]*0.5):]
df_old = df[:int(df.shape[0]*0.5)]

In [None]:
df_old

In [None]:
df_new.reset_index().to_csv("new_data.csv",index_label=False)
df_old.reset_index().to_csv("old_data.csv",index_label=False)