In [12]:
import sys, os

path2add = os.path.normpath(os.path.abspath(os.path.join(os.path.dirname('__file__'), os.path.pardir, 'src')))
if not path2add in sys.path:
    sys.path.append(path2add)

import pandas as pd
import plotly.express as px

In [13]:
    diet_titles = [
        'High-fiber meals', 
        'Cholesterol friendly foods', 
        'Mediterranean diet',
        'Healthy eating guide', 
        'Weight management'
    ]

    physical_activity_titles = [
        'Aerobic exercise',
        'Exercise routines', 
        'Strength training basics', 
        'Cardio workouts'
    ]

    sleep_health_titles = [
        'Restorative sleep tips', 
        'Sleep hygiene'
    ]

    resilience_wellbeing_titles = [
        'Stress reduction',
        'Meditation guide'
    ]

    clinical_titles = [
        'Diabetes management',
        'Hypertension basics',
        'Lowering blood pressure',
        'Cardiometabolic health',
        'HbA1c targets'
    ]

In [58]:
web_visits = pd.read_csv('../data/web_visits.csv')
web_visits

Unnamed: 0,member_id,url,title,description,timestamp
0,1,https://health.wellco/chronic/859,Diabetes management,Blood sugar and glycemic control,2025-07-02 22:38:22
1,1,https://portal.site/tech/328,Gadget roundup,Smartphones and laptops news,2025-07-02 11:30:47
2,1,https://health.wellco/heart/792,Hypertension basics,Blood pressure and lifestyle changes,2025-07-14 00:38:39
3,2,https://example.com/gaming/674,Game reviews,Strategy tips and updates,2025-07-07 02:56:06
4,2,https://living.better/stress/325,Stress reduction,Mindfulness and wellness,2025-07-02 15:53:38
...,...,...,...,...,...
259335,10000,https://example.com/travel/387,Top destinations,City guides and itineraries,2025-07-05 00:05:16
259336,10000,https://guide.wellness/nutrition/261,Cholesterol friendly foods,Lowering LDL and improving lipid profile,2025-07-13 01:15:16
259337,10000,https://media.hub/tech/799,Gadget roundup,Smartphones and laptops news,2025-07-11 01:22:25
259338,10000,https://media.hub/movies/326,New releases,Box office and trailers,2025-07-01 13:55:49


In [None]:
def categorize_web_titles(df: pd.DataFrame, title_column: str = 'title') -> pd.DataFrame:
    """
    Categorizes web visit titles into predefined health categories.
    
    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame containing web visit data
    title_column : str
        Name of the column containing titles (default: 'title')
    
    Returns:
    --------
    pd.DataFrame
        Original DataFrame with added 'category' column
    """
    
    # Define category mappings
    categories = {
        'diet': [
            'High-fiber meals', 
            'Cholesterol friendly foods', 
            'Mediterranean diet',
            'Healthy eating guide', 
            'Weight management'
        ],
        'physical_activity': [
            'Aerobic exercise',
            'Exercise routines', 
            'Strength training basics', 
            'Cardio workouts'
        ],
        'sleep': [
            'Restorative sleep tips', 
            'Sleep hygiene'
        ],
        'resilience': [
            'Stress reduction',
            'Meditation guide'
        ],
        'clinical': [
            'Diabetes management',
            'Hypertension basics',
            'Lowering blood pressure',
            'Cardiometabolic health',
            'HbA1c targets'
        ]
    }
    
    # Create a mapping dictionary for faster lookup
    title_to_category = {}
    for category, titles in categories.items():
        for title in titles:
            title_to_category[title] = category
    
    # Map titles to categories
    df = df.copy()
    df['category'] = df[title_column].map(title_to_category).fillna('unrelated')
    
    return df

web_visits_processed = categorize_web_titles(web_visits)
web_visits[web_visits['title'] == 'Cardiometabolic health']

Unnamed: 0,member_id,url,title,description,timestamp
11,2,https://care.portal/hypertension/926,Lowering blood pressure,Lifestyle changes and medication adherence,2025-07-03 14:32:58
17,3,https://guide.wellness/hypertension/479,Lowering blood pressure,Lifestyle changes and medication adherence,2025-07-13 17:32:01
55,3,https://living.better/hypertension/629,Lowering blood pressure,Lifestyle changes and medication adherence,2025-07-09 04:53:50
97,6,https://health.wellco/hypertension/350,Lowering blood pressure,Lifestyle changes and medication adherence,2025-07-05 16:26:47
98,6,https://care.portal/hypertension/261,Lowering blood pressure,Lifestyle changes and medication adherence,2025-07-14 07:10:10
...,...,...,...,...,...
259237,9996,https://health.wellco/hypertension/521,Lowering blood pressure,Lifestyle changes and medication adherence,2025-07-04 07:19:48
259238,9996,https://care.portal/hypertension/505,Lowering blood pressure,Lifestyle changes and medication adherence,2025-07-01 09:12:03
259248,9998,https://living.better/hypertension/516,Lowering blood pressure,Lifestyle changes and medication adherence,2025-07-11 04:05:15
259296,10000,https://health.wellco/hypertension/567,Lowering blood pressure,Lifestyle changes and medication adherence,2025-07-11 03:58:55


In [16]:
web_visits_pattern = web_visits_processed.drop(columns=['url', 'title', 'description'])
web_visits_pattern

Unnamed: 0,member_id,timestamp,category
0,1,2025-07-02 22:38:22,clinical
1,1,2025-07-02 11:30:47,unrelated
2,1,2025-07-14 00:38:39,clinical
3,2,2025-07-07 02:56:06,unrelated
4,2,2025-07-02 15:53:38,resilience
...,...,...,...
259335,10000,2025-07-05 00:05:16,unrelated
259336,10000,2025-07-13 01:15:16,diet
259337,10000,2025-07-11 01:22:25,unrelated
259338,10000,2025-07-01 13:55:49,unrelated


In [20]:
app_session = pd.read_csv('../data/app_usage.csv').drop(columns=['event_type'])
app_session['category'] = 'app_usage'
app_session

Unnamed: 0,member_id,timestamp,category
0,1,2025-07-13 08:43:37,app_usage
1,1,2025-07-14 15:28:58,app_usage
2,1,2025-07-02 19:43:17,app_usage
3,1,2025-07-09 09:40:14,app_usage
4,1,2025-07-07 00:39:54,app_usage
...,...,...,...
97784,10000,2025-07-01 11:05:24,app_usage
97785,10000,2025-07-09 08:43:18,app_usage
97786,10000,2025-07-05 11:25:41,app_usage
97787,10000,2025-07-04 19:08:59,app_usage


In [None]:
claimes = pd.read_csv('../data/claims.csv')
intrests = ['I10', 'E11.9', 'Z71.3']
claimes = claimes[claimes['icd_code'].isin(intrests)]
# rename icd_code  
claimes.rename(columns={'icd_code': 'category', 'diagnosis_date':'timestamp'}, inplace=True)
claimes

Unnamed: 0,member_id,category,timestamp
0,1,Z71.3,2025-07-09
1,1,Z71.3,2025-07-14
4,1,I10,2025-07-12
8,1,E11.9,2025-07-09
9,2,E11.9,2025-07-08
...,...,...,...
64563,9999,Z71.3,2025-07-12
64564,9999,Z71.3,2025-07-03
64566,9999,Z71.3,2025-07-02
64569,10000,Z71.3,2025-07-07


In [38]:
combined = pd.concat([web_visits_pattern, app_session, claimes], ignore_index=True)
combined = combined.sort_values(by=['member_id', 'timestamp']).reset_index(drop=True)
combined

Unnamed: 0,member_id,timestamp,category
0,1,2025-07-02 11:30:47,unrelated
1,1,2025-07-02 19:43:17,app_usage
2,1,2025-07-02 22:38:22,clinical
3,1,2025-07-03 04:40:05,app_usage
4,1,2025-07-07 00:39:54,app_usage
...,...,...,...
384236,10000,2025-07-14 11:06:23,clinical
384237,10000,2025-07-14 12:07:57,resilience
384238,10000,2025-07-14 13:31:24,app_usage
384239,10000,2025-07-14 15:27:50,app_usage


In [67]:
combined[combined['member_id'] == 45].head(14)

Unnamed: 0,member_id,timestamp,category
1532,45,2025-07-01 04:04:53,app_usage
1533,45,2025-07-01 14:19:06,app_usage
1534,45,2025-07-02 07:04:46,unrelated
1535,45,2025-07-02 23:06:44,app_usage
1536,45,2025-07-03,E11.9
1537,45,2025-07-03 01:12:44,unrelated
1538,45,2025-07-03 05:26:20,app_usage
1539,45,2025-07-03 07:07:09,app_usage
1540,45,2025-07-03 10:05:25,app_usage
1541,45,2025-07-04 03:33:37,diet


In [39]:
from collections import defaultdict

def create_sequences(df, member_col='member_id', time_col='timestamp', event_col='category'):
    """
    Convert tabular event data into sequences for pattern mining.
    
    Parameters:
    -----------
    df : pd.DataFrame
        DataFrame with member_id, timestamp, category (sorted)
    
    Returns:
    --------
    list of lists : Each member's sequence of events
    dict : Mapping of member_id to their sequence
    """
    # Group by member and create sequences
    sequences = []
    member_sequences = {}
    
    for member_id, group in df.groupby(member_col):
        sequence = group[event_col].tolist()
        sequences.append(sequence)
        member_sequences[member_id] = sequence
    
    return sequences, member_sequences

# Example usage
sequences, member_dict = create_sequences(combined)

print("First 5 sequences:")
for i, seq in enumerate(sequences[:5]):
    print(f"Member {i+1}: {seq}")

First 5 sequences:
Member 1: ['unrelated', 'app_usage', 'clinical', 'app_usage', 'app_usage', 'Z71.3', 'E11.9', 'app_usage', 'I10', 'app_usage', 'Z71.3', 'clinical', 'app_usage']
Member 2: ['diet', 'clinical', 'E11.9', 'app_usage', 'sleep', 'resilience', 'app_usage', 'app_usage', 'I10', 'app_usage', 'clinical', 'clinical', 'E11.9', 'app_usage', 'Z71.3', 'Z71.3', 'unrelated', 'physical_activity', 'E11.9', 'app_usage', 'app_usage', 'app_usage', 'physical_activity', 'app_usage', 'I10', 'app_usage']
Member 3: ['sleep', 'app_usage', 'unrelated', 'physical_activity', 'app_usage', 'unrelated', 'unrelated', 'clinical', 'resilience', 'sleep', 'unrelated', 'app_usage', 'unrelated', 'clinical', 'unrelated', 'sleep', 'unrelated', 'unrelated', 'physical_activity', 'unrelated', 'app_usage', 'resilience', 'unrelated', 'unrelated', 'app_usage', 'unrelated', 'unrelated', 'unrelated', 'diet', 'unrelated', 'app_usage', 'unrelated', 'unrelated', 'physical_activity', 'physical_activity', 'clinical', 'app_u

In [45]:
from prefixspan import PrefixSpan

# Prepare sequences (already done above)
sequences, member_dict = create_sequences(combined)

# Initialize PrefixSpan
ps = PrefixSpan(sequences)

# Find frequent patterns
# min_support: minimum number of sequences that must contain the pattern
frequent_patterns = ps.frequent(minsup=10)  # appears in at least 10 members

print("Frequent patterns:")
for support, pattern in frequent_patterns:
    print(f"Support: {support}, Pattern: {pattern}")

# Find top-k most frequent patterns
topk_patterns = ps.topk(k=20)  # top 20 patterns

print("\nTop-K patterns:")
for support, pattern in topk_patterns:
    print(f"Support: {support}, Pattern: {pattern}")

KeyboardInterrupt: 