Feature Store (Python + SQLite)

In [14]:

import sqlite3
import pandas as pd
from typing import List, Optional

DB_PATH = "feature_store.db"


In [8]:

def list_features(db_path: str = DB_PATH) -> pd.DataFrame:
    """List the features registered in the store's metadata table.

    Args:
        db_path: Path to the SQLite feature-store database file.

    Returns:
        A DataFrame with the columns ``feature_name``, ``description`` and
        ``version``, one row per entry in the ``metadata`` table.
    """
    # sqlite3's connection context manager only commits or rolls back the
    # current transaction; it never closes the connection. Close it
    # explicitly so repeated calls do not accumulate open handles.
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql(
            "SELECT feature_name, description, version FROM metadata", conn
        )
    finally:
        conn.close()

def get_features(
    db_path: str = DB_PATH,
    customer_ids: Optional[List[int]] = None,
    feature_names: Optional[List[str]] = None,
) -> pd.DataFrame:
    """Fetch feature rows, optionally restricted to customers and columns.

    Args:
        db_path: Path to the SQLite feature-store database file.
        customer_ids: Customer IDs to look up. ``None`` (the default) returns
            every customer; an empty sequence returns no rows.
        feature_names: Feature columns to keep. ``None`` or an empty sequence
            keeps every column of the ``features`` table. When a selection is
            given, ``CustomerId`` is always the first column of the result,
            whether or not it is listed here.

    Returns:
        A DataFrame of rows from the ``features`` table. The query has no
        ORDER BY, so row order is whatever SQLite returns.

    Raises:
        ValueError, TypeError: If an entry of ``customer_ids`` cannot be
            converted with ``int()``.
        KeyError: If a requested feature is not a column of the table.
    """
    query = "SELECT * FROM features"
    # `is not None` (rather than truthiness) keeps an explicit empty list from
    # silently turning into "return the whole table", and also works for
    # array-like inputs whose truth value is ambiguous.
    if customer_ids is not None:
        # Every ID is coerced to int before it is inlined, so nothing but an
        # integer literal can reach the SQL text. Bound parameters are not
        # used because SQLite caps the number of host parameters per
        # statement, which would break large ID batches.
        id_list = ",".join(str(int(cid)) for cid in customer_ids)
        # SQLite accepts an empty IN () list and matches no rows.
        query += f" WHERE CustomerId IN ({id_list})"

    # sqlite3's connection context manager does not close the connection,
    # so close it explicitly once the rows have been read.
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql(query, conn)
    finally:
        conn.close()

    if feature_names is not None and len(feature_names) > 0:
        # dict.fromkeys de-duplicates while preserving order, so listing
        # "CustomerId" (or repeating a feature) does not duplicate a column.
        cols = list(dict.fromkeys(["CustomerId", *feature_names]))
        missing = [col for col in cols if col not in df.columns]
        if missing:
            raise KeyError(f"Unknown feature column(s): {missing}")
        df = df[cols]
    return df

def get_all_for_training(
    db_path: str = DB_PATH,
    feature_names: Optional[List[str]] = None,
) -> pd.DataFrame:
    """Retrieve the entire feature matrix for training.

    Args:
        db_path: Path to the SQLite feature-store database file.
        feature_names: Feature columns to keep. ``None`` or an empty sequence
            keeps every column of the ``features`` table. When a selection is
            given, ``CustomerId`` is always the first column of the result,
            whether or not it is listed here.

    Returns:
        A DataFrame containing every row of the ``features`` table.

    Raises:
        KeyError: If a requested feature is not a column of the table.
    """
    # sqlite3's connection context manager does not close the connection,
    # so close it explicitly once the rows have been read.
    conn = sqlite3.connect(db_path)
    try:
        df = pd.read_sql("SELECT * FROM features", conn)
    finally:
        conn.close()

    if feature_names is not None and len(feature_names) > 0:
        # dict.fromkeys de-duplicates while preserving order, so listing
        # "CustomerId" (or repeating a feature) does not duplicate a column.
        cols = list(dict.fromkeys(["CustomerId", *feature_names]))
        missing = [col for col in cols if col not in df.columns]
        if missing:
            raise KeyError(f"Unknown feature column(s): {missing}")
        df = df[cols]
    return df

def get_feature_metadata(db_path: str = DB_PATH) -> pd.DataFrame:
    """Retrieve every column of the feature metadata table.

    Args:
        db_path: Path to the SQLite feature-store database file.

    Returns:
        A DataFrame holding all rows and columns of the ``metadata`` table.
    """
    # sqlite3's connection context manager only commits or rolls back the
    # current transaction; it never closes the connection. Close it
    # explicitly so repeated calls do not accumulate open handles.
    conn = sqlite3.connect(db_path)
    try:
        return pd.read_sql("SELECT * FROM metadata", conn)
    finally:
        conn.close()


## Initialize the feature store with sample data

In [9]:
def init_feature_store(db_path: str = DB_PATH) -> None:
    """Create the feature-store tables and seed them with sample rows.

    The tables are created with ``CREATE TABLE IF NOT EXISTS`` and the sample
    rows are written with ``INSERT OR IGNORE``, so calling this again neither
    fails nor overwrites rows whose primary key is already present.

    Args:
        db_path: Path to the SQLite database file to create or reuse.
    """
    # Sample metadata: (feature_name, description, version)
    metadata_rows = [
        ('Age_Group', 'Categorical age group of the customer', 1),
        ('Balance_to_Salary', 'Ratio of customer balance to estimated salary', 1),
        ('Tenure_Bucket', 'Categorical tenure group of the customer', 1),
        ('IsSenior', 'Whether the customer is a senior citizen (1=Yes, 0=No)', 1),
    ]

    # Sample feature rows, for demonstration only (replace with real data
    # loading): (CustomerId, Age_Group, Balance_to_Salary, Tenure_Bucket, IsSenior)
    feature_rows = [
        (15634602, '40-50', 0.5, '2-5', 0),
        (15737452, '30-40', 1.2, '5-8', 0),
        (15619304, '50+', 0.8, '8+', 1),
        (15701563, '30-40', 0.9, '2-5', 0),
        (15657013, '40-50', 0.6, '5-8', 0),
    ]

    conn = sqlite3.connect(db_path)
    try:
        # `with conn` commits on success and rolls back if a statement raises;
        # it does not close the connection, hence the surrounding try/finally.
        with conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS metadata (
                    feature_name TEXT PRIMARY KEY,
                    description TEXT,
                    version INTEGER
                )
            ''')

            # Example feature columns
            conn.execute('''
                CREATE TABLE IF NOT EXISTS features (
                    CustomerId INTEGER PRIMARY KEY,
                    Age_Group TEXT,
                    Balance_to_Salary REAL,
                    Tenure_Bucket TEXT,
                    IsSenior INTEGER
                )
            ''')

            # Name the target columns explicitly so the inserts do not depend
            # on the column order of a table that already existed.
            conn.executemany(
                'INSERT OR IGNORE INTO metadata (feature_name, description, version) '
                'VALUES (?, ?, ?)',
                metadata_rows,
            )
            conn.executemany(
                'INSERT OR IGNORE INTO features '
                '(CustomerId, Age_Group, Balance_to_Salary, Tenure_Bucket, IsSenior) '
                'VALUES (?, ?, ?, ?, ?)',
                feature_rows,
            )
    finally:
        conn.close()

# Initialize the feature store.
# This opens the SQLite file named by DB_PATH (sqlite3 creates it if it does
# not exist yet). Tables are created with IF NOT EXISTS and the sample rows
# are written with INSERT OR IGNORE, so re-running this cell does not
# duplicate rows.
init_feature_store()

print("Feature store initialized with sample data.")

Feature store initialized with sample data.


# List available features

In [15]:


# Show feature_name, description and version for every row of the metadata table.
list_features()


Unnamed: 0,feature_name,description,version
0,Age_Group,Categorical age group of the customer,1
1,Balance_to_Salary,Ratio of customer balance to estimated salary,1
2,Tenure_Bucket,Categorical tenure group of the customer,1
3,IsSenior,Whether the customer is a senior citizen (1=Ye...,1


# Inference: fetch features for specific customers

In [11]:
# Look up two of the sample customers and keep all four feature columns.
# The underlying query has no ORDER BY, so do not rely on the row order.
get_features(customer_ids=[15634602, 15737452],
             feature_names=["Age_Group", "Balance_to_Salary", "Tenure_Bucket", "IsSenior"])


Unnamed: 0,CustomerId,Age_Group,Balance_to_Salary,Tenure_Bucket,IsSenior
0,15634602,40-50,0.5,2-5,0
1,15737452,30-40,1.2,5-8,0


# Training: full feature matrix

In [17]:
# Load every row of the features table, then preview only the first rows with .head().
get_all_for_training(feature_names=["Age_Group", "Balance_to_Salary", "Tenure_Bucket", "IsSenior"]).head()


Unnamed: 0,CustomerId,Age_Group,Balance_to_Salary,Tenure_Bucket,IsSenior
0,15619304,50+,0.8,8+,1
1,15634602,40-50,0.5,2-5,0
2,15657013,40-50,0.6,5-8,0
3,15701563,30-40,0.9,2-5,0
4,15737452,30-40,1.2,5-8,0
