## EDA on Status = Active. Most popular items?

In [18]:
import pandas as pd
tx = pd.read_csv("../data/processed/transactions_clean.csv", low_memory=False)

In [19]:
#filter so we only have status active in tx 
tx = tx[tx['status'] == 'active']

In [20]:
tx

Unnamed: 0,orderId,shopUserId,created,currencyId,orderLineId,sku,sku_family,quantity,price,name,status,groupId,currency_country,sek_rate,price_sek,Age
0,785001,812427,2025-08-05 20:14:28,40,5337669.0,261876-E085,261876,1.0,549.0,Clean Curves Wire bra,active,261873,DK,1.494795,820.642455,
1,784985,831360,2025-08-05 19:55:36,40,5337572.0,261745-0060,261745,4.0,229.0,Trosa Freedom Skin-Relief,active,261745,DK,1.494795,342.308055,60.0
2,784978,209204,2025-08-05 19:47:22,134,5337531.0,265298-5254,265298,1.0,169.0,Bambutrosa 2-pack,active,265298,SE,1.000000,169.000000,
4,784977,831340,2025-08-05 19:46:09,103,5337537.0,267849-D085,267849,1.0,549.0,Bh uten bøyle Stars,active,260596,NO,0.938825,515.414925,
5,784977,831340,2025-08-05 19:46:09,103,5337538.0,262717-4085,262717,1.0,549.0,Bh uten bøyle Stars,active,260596,NO,0.938825,515.414925,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250024,158870,78202,2024-05-22 14:18:16,134,454676.0,220001-0042,220001,1.0,598.0,Jeanskjol,active,221416,SE,1.000000,598.000000,54.0
250026,158841,78181,2024-05-22 13:42:39,103,454625.0,265843-4115,265843,1.0,469.0,Bomulls-bh uten bøyle med Magic Lift-funksjon ...,active,265843,NO,0.938825,440.308925,85.0
250038,158800,78145,2024-05-22 12:54:51,40,454359.0,261518,261518,1.0,49.0,Bh-förlängare 3-pack,active,261518,DK,1.494795,73.244955,
250039,158791,78136,2024-05-22 12:44:01,40,454308.0,542092,542092,1.0,89.0,Tvättlappar enfärgade av frotté 5-pack,active,542087,DK,1.494795,133.036755,34.0


In [21]:
articles = pd.read_csv("../data/processed/articles_clean.csv", dtype={"groupId": str}, low_memory=False)
tx['groupId'] = tx['groupId'].astype(str)
articles['groupId'] = articles['groupId'].astype(str)

most_popular = (
    tx.groupby('groupId', as_index=False)
      .agg(
          count=('groupId', 'size'),
          avg_price_sek=('price_sek', 'mean'),
          avg_age=('Age', 'mean'),
          name=('name', lambda x: x.mode().iat[0] if not x.mode().empty else x.iloc[0])
      )
      .sort_values('count', ascending=False)
      .head(10)
)

most_popular['avg_price_sek'] = most_popular['avg_price_sek'].round().astype(int)
most_popular['avg_age'] = most_popular['avg_age'].round().astype(int)

most_popular = most_popular.merge(
    articles[['groupId', 'name.1']].drop_duplicates('groupId'),
    on='groupId',
    how='left'
)

most_popular

Unnamed: 0,groupId,count,avg_price_sek,avg_age,name,name.1
0,261637,4296,70,77,"Ankelsocka VID""""",Locköstrumpan
1,240187,3359,324,75,Fritidsbukse,Åshild
2,210338,2878,233,77,T-shirt 2-pack,Åshild
3,260646,2762,211,78,Trosa 3-pack,Åshild
4,260596,2525,461,71,Bh utan bygel Stars,Swegmark
5,241562,2372,548,76,Velourbyxa,Åshild
6,260513,2246,511,74,Bh utan bygel med Magic Lift-funktion,Glamorise
7,218982,1921,106,75,T-shirt,Åshild
8,260695,1713,163,75,Seamless bh-topp,Louise
9,210186,1712,218,76,Polojumper,Åshild


## Items bought together? 
### First check on the popular item 260646

In [23]:
orders_with_260646 = tx.loc[tx['groupId'] == '260646', 'orderId'].unique()
tx_260646_orders = tx[tx['orderId'].isin(orders_with_260646)]

purchased_together_df = (
    tx_260646_orders.groupby('orderId')
    .agg(prc_tog_groupIds=('groupId', lambda x: [gid for gid in x.unique() if gid != '260646']),
         item_count=('groupId', 'size'))
    .query('item_count > 5')
    .drop(columns='item_count')
    .reset_index()
)
purchased_together_df['src_groupId'] = '260646'
purchased_together_df = purchased_together_df[['orderId', 'src_groupId', 'prc_tog_groupIds']]
purchased_together_df

Unnamed: 0,orderId,src_groupId,prc_tog_groupIds
0,160420,260646,"[261637, 218982, 261012, 264234, 264242, 26022..."
1,169214,260646,"[261663, 241687, 264804, 260380]"
2,170687,260646,"[261610, 266494, 261192, 265249, 242198, 26163..."
3,174870,260646,"[260572, 546112, 539164, 525035, 290183, 50831..."
4,175262,260646,"[218982, 261475, 260922]"
...,...,...,...
102,741266,260646,"[261637, 261475, 242024, 241687, 210756, 210338]"
103,744121,260646,"[261618, 264275, 260463, 265298]"
104,751207,260646,"[266643, 260173, 210338, 264242]"
105,755239,260646,"[260463, 241091, 261595, 260232, 265041, 260951]"


### Count how many times was that other item purchased together and see the top 10

In [24]:
# Use groupId instead of sku_family
top10_co_purchased = (
    tx[(tx['orderId'].isin(tx[tx['groupId'] == '260646']['orderId'])) & (tx['groupId'] != '260646')]
    .groupby(['groupId', 'name'])
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
    .head(10)
)
top10_co_purchased


Unnamed: 0,groupId,name,count
324,261637,"Ankelsocka VID""""",178
194,260695,Seamless bh-topp,113
73,240187,Fritidsbyxa,104
326,261637,"Ankelsokk VID""""",72
281,261427,Trosa 3-pack vita,53
24,210338,T-shirt 2-pack,49
102,242024,Leggings,47
458,264275,Trosa 3-pack,45
287,261475,Benkläder,40
459,264275,Truse 3-pk,39


In [25]:
import json

# For each groupId, find as max top 10 most frequently co-purchased groupIds (with counts)
rows = []
for src_group in tx['groupId'].unique():
    orders = tx.loc[tx['groupId'] == src_group, 'orderId'].unique()
    co = (
        tx[tx['orderId'].isin(orders) & (tx['groupId'] != src_group)]
        .groupby('groupId').size().reset_index(name='count')
        .sort_values('count', ascending=False).head(10)
    )
    freq_list = [{"groupId": str(row['groupId']), "count": int(row['count'])} for _, row in co.iterrows()]
    rows.append({
        "src_groupId": str(src_group),
        "freq_cooccur_groupIds": json.dumps(freq_list, ensure_ascii=False)
    })

freq_cooccur_df = pd.DataFrame(rows)
freq_cooccur_df

Unnamed: 0,src_groupId,freq_cooccur_groupIds
0,261873,"[{""groupId"": ""260596"", ""count"": 5}, {""groupId""..."
1,261745,"[{""groupId"": ""261253"", ""count"": 5}, {""groupId""..."
2,265298,"[{""groupId"": ""261637"", ""count"": 121}, {""groupI..."
3,260596,"[{""groupId"": ""261637"", ""count"": 65}, {""groupId..."
4,260951,"[{""groupId"": ""260620"", ""count"": 38}, {""groupId..."
...,...,...
1118,341155,"[{""groupId"": ""430221"", ""count"": 1}, {""groupId""..."
1119,270518,"[{""groupId"": ""260232"", ""count"": 1}, {""groupId""..."
1120,260290,[]
1121,260993,[]


In [26]:
# Count how many src_groupId have less than 10 items in freq_cooccur_groupIds
count_less_than_10 = (freq_cooccur_df['freq_cooccur_groupIds']
                      .apply(lambda x: len(json.loads(x))) < 10).sum()
print(count_less_than_10)


429


## Limit so that we include the products that were copurchased at least 10 times with the src_sku_family


In [29]:
# For each src_groupId, get a list of co-purchased groupIds (not including itself) that were co-purchased at least 10 times
recs = []
for src_group in tx['groupId'].unique():
    orders = tx.loc[tx['groupId'] == src_group, 'orderId'].unique()
    co = (
        tx[tx['orderId'].isin(orders) & (tx['groupId'] != src_group)]
        .groupby('groupId').size().reset_index(name='count')
    )
    co_10plus = co[co['count'] >= 10]['groupId'].astype(str).tolist()
    recs.append({
        "src_groupId": str(src_group),
        "recs": co_10plus
    })

copurchased_10plus_df = pd.DataFrame(recs)
display(copurchased_10plus_df)


Unnamed: 0,src_groupId,recs
0,261873,[]
1,261745,[]
2,265298,"[200304, 200400, 210186, 210338, 210695, 21075..."
3,260596,"[200304, 210186, 210338, 210695, 210756, 21898..."
4,260951,"[200304, 200400, 210338, 218982, 240184, 24018..."
...,...,...
1118,341155,[]
1119,270518,[]
1120,260290,[]
1121,260993,[]


In [30]:

# Count how many rows have completely empty lists of recs
num_empty_recs = copurchased_10plus_df['recs'].apply(lambda x: isinstance(x, list) and len(x) == 0).sum()
print(f"Number of completely empty recs: {num_empty_recs}")

# Count how many rows have incomplete recs (less than 10 items, but not empty)
num_incomplete_recs = copurchased_10plus_df['recs'].apply(lambda x: isinstance(x, list) and 0 < len(x) < 10).sum()
print(f"Number of incomplete recs (<10 items): {num_incomplete_recs}")

Number of completely empty recs: 857
Number of incomplete recs (<10 items): 149


## Now we have a lot of empty and incomplete recs lists, we can fill those up with data from vector simillarity

In [41]:
# There are many src_groupIds in copurchased_10plus_df with completely empty recs,
# and some with incomplete lists (<10). We fill all to exactly 10 using vector similarity recs as needed.
# If there are more than 10, we trim to 10.

import json
import ast

# Load vector similarity recommendations
vecsim_df = pd.read_csv("../data/predictions/vector_similarity_recommendations.csv")

# Parse the vector similarity recs (stringified lists) into lists
def parse_recs(x):
    try:
        if isinstance(x, str):
            return ast.literal_eval(x)
        return []
    except Exception:
        return []

# Build a mapping from src_groupId to its vector recs (as list of str)
vecsim_map = dict(
    zip(
        vecsim_df['src_groupId'].astype(str),
        vecsim_df['recs'].apply(parse_recs)
    )
)

recs_rows = []
for idx, row in copurchased_10plus_df.iterrows():
    src_group = str(row['src_groupId'])
    # CF recs: list of str, or empty list
    cf_recs = row['recs'] if isinstance(row['recs'], list) else []
    cf_recs = [str(x) for x in cf_recs if str(x) != ""]
    recs_annotated = []

    # Vector recs: list of str, or empty list
    vecsim_recs = vecsim_map.get(src_group, [])
    vecsim_recs = [str(x) for x in vecsim_recs if str(x) != ""]

    # Add all CF recs first, annotated
    for group in cf_recs:
        recs_annotated.append({"groupId": group, "source": "cf"})

    # Fill up to 10 with vector recs, skipping any already in CF
    for group in vecsim_recs:
        if group not in cf_recs and len(recs_annotated) < 10:
            recs_annotated.append({"groupId": group, "source": "vector"})
        if len(recs_annotated) >= 10:
            break

    # If still less than 10, pad with additional vector recs (even if duplicates, but try to avoid)
    if len(recs_annotated) < 10:
        # Try to fill with more vector recs, even if already in cf_recs, but not already in recs_annotated
        all_added = set([r["groupId"] for r in recs_annotated])
        for group in vecsim_recs:
            if group not in all_added:
                recs_annotated.append({"groupId": group, "source": "vector"})
                all_added.add(group)
            if len(recs_annotated) >= 10:
                break
        # If still not enough, just repeat the last one (or fill with None)
        while len(recs_annotated) < 10:
            recs_annotated.append({"groupId": None, "source": "vector"})

    # If more than 10, trim to 10
    recs_annotated = recs_annotated[:10]

    recs_rows.append({
        "src_groupId": src_group,
        "recs": json.dumps(recs_annotated, ensure_ascii=False)
    })

recs_df = pd.DataFrame(recs_rows)

# Show the result, wide columns
with pd.option_context('display.max_colwidth', None, 'display.width', 2000):
    display(recs_df)

Unnamed: 0,src_groupId,recs
0,261873,"[{""groupId"": ""261463"", ""source"": ""vector""}, {""groupId"": ""261626"", ""source"": ""vector""}, {""groupId"": ""261591"", ""source"": ""vector""}, {""groupId"": ""175701"", ""source"": ""vector""}, {""groupId"": ""261574"", ""source"": ""vector""}, {""groupId"": ""261585"", ""source"": ""vector""}, {""groupId"": ""263000"", ""source"": ""vector""}, {""groupId"": ""261567"", ""source"": ""vector""}, {""groupId"": ""267097"", ""source"": ""vector""}, {""groupId"": ""146601"", ""source"": ""vector""}]"
1,261745,"[{""groupId"": ""267108"", ""source"": ""vector""}, {""groupId"": ""267131"", ""source"": ""vector""}, {""groupId"": ""260922"", ""source"": ""vector""}, {""groupId"": ""261938"", ""source"": ""vector""}, {""groupId"": ""261940"", ""source"": ""vector""}, {""groupId"": ""260507"", ""source"": ""vector""}, {""groupId"": ""261523"", ""source"": ""vector""}, {""groupId"": ""261379"", ""source"": ""vector""}, {""groupId"": ""262016"", ""source"": ""vector""}, {""groupId"": ""262034"", ""source"": ""vector""}]"
2,265298,"[{""groupId"": ""200304"", ""source"": ""cf""}, {""groupId"": ""200400"", ""source"": ""cf""}, {""groupId"": ""210186"", ""source"": ""cf""}, {""groupId"": ""210338"", ""source"": ""cf""}, {""groupId"": ""210695"", ""source"": ""cf""}, {""groupId"": ""210756"", ""source"": ""cf""}, {""groupId"": ""218982"", ""source"": ""cf""}, {""groupId"": ""240144"", ""source"": ""cf""}, {""groupId"": ""240184"", ""source"": ""cf""}, {""groupId"": ""240187"", ""source"": ""cf""}]"
3,260596,"[{""groupId"": ""200304"", ""source"": ""cf""}, {""groupId"": ""210186"", ""source"": ""cf""}, {""groupId"": ""210338"", ""source"": ""cf""}, {""groupId"": ""210695"", ""source"": ""cf""}, {""groupId"": ""210756"", ""source"": ""cf""}, {""groupId"": ""218982"", ""source"": ""cf""}, {""groupId"": ""221416"", ""source"": ""cf""}, {""groupId"": ""240184"", ""source"": ""cf""}, {""groupId"": ""240187"", ""source"": ""cf""}, {""groupId"": ""240276"", ""source"": ""cf""}]"
4,260951,"[{""groupId"": ""200304"", ""source"": ""cf""}, {""groupId"": ""200400"", ""source"": ""cf""}, {""groupId"": ""210338"", ""source"": ""cf""}, {""groupId"": ""218982"", ""source"": ""cf""}, {""groupId"": ""240184"", ""source"": ""cf""}, {""groupId"": ""240187"", ""source"": ""cf""}, {""groupId"": ""240276"", ""source"": ""cf""}, {""groupId"": ""241091"", ""source"": ""cf""}, {""groupId"": ""241687"", ""source"": ""cf""}, {""groupId"": ""260313"", ""source"": ""cf""}]"
...,...,...
1118,341155,"[{""groupId"": ""432058"", ""source"": ""vector""}, {""groupId"": ""432057"", ""source"": ""vector""}, {""groupId"": ""432061"", ""source"": ""vector""}, {""groupId"": ""341151"", ""source"": ""vector""}, {""groupId"": ""341140"", ""source"": ""vector""}, {""groupId"": ""341173"", ""source"": ""vector""}, {""groupId"": ""410827"", ""source"": ""vector""}, {""groupId"": ""432036"", ""source"": ""vector""}, {""groupId"": ""432065"", ""source"": ""vector""}, {""groupId"": ""350519"", ""source"": ""vector""}]"
1119,270518,"[{""groupId"": ""270122"", ""source"": ""vector""}, {""groupId"": ""270204"", ""source"": ""vector""}, {""groupId"": ""270124"", ""source"": ""vector""}, {""groupId"": ""270519"", ""source"": ""vector""}, {""groupId"": ""270106"", ""source"": ""vector""}, {""groupId"": ""270505"", ""source"": ""vector""}, {""groupId"": ""261851"", ""source"": ""vector""}, {""groupId"": ""270112"", ""source"": ""vector""}, {""groupId"": ""270201"", ""source"": ""vector""}, {""groupId"": ""270103"", ""source"": ""vector""}]"
1120,260290,"[{""groupId"": ""260237"", ""source"": ""vector""}, {""groupId"": ""260271"", ""source"": ""vector""}, {""groupId"": ""261806"", ""source"": ""vector""}, {""groupId"": ""260268"", ""source"": ""vector""}, {""groupId"": ""261817"", ""source"": ""vector""}, {""groupId"": ""261803"", ""source"": ""vector""}, {""groupId"": ""261772"", ""source"": ""vector""}, {""groupId"": ""261838"", ""source"": ""vector""}, {""groupId"": ""270607"", ""source"": ""vector""}, {""groupId"": ""261822"", ""source"": ""vector""}]"
1121,260993,"[{""groupId"": ""261187"", ""source"": ""vector""}, {""groupId"": ""260596"", ""source"": ""vector""}, {""groupId"": ""260926"", ""source"": ""vector""}, {""groupId"": ""261597"", ""source"": ""vector""}, {""groupId"": ""260219"", ""source"": ""vector""}, {""groupId"": ""263277"", ""source"": ""vector""}, {""groupId"": ""260994"", ""source"": ""vector""}, {""groupId"": ""266825"", ""source"": ""vector""}, {""groupId"": ""261583"", ""source"": ""vector""}, {""groupId"": ""261470"", ""source"": ""vector""}]"


In [42]:
# Show rows where any groupId in recs is None
def has_null_groupid(rec_json):
    try:
        recs = json.loads(rec_json)
        if isinstance(recs, list):
            for r in recs:
                if isinstance(r, dict) and r.get("groupId") is None:
                    return True
        return False
    except Exception:
        return False

null_groupid_df = recs_df[recs_df['recs'].apply(has_null_groupid)]

with pd.option_context('display.max_colwidth', None, 'display.width', 2000):
    display(null_groupid_df)


Unnamed: 0,src_groupId,recs
156,12025SE,"[{""groupId"": null, ""source"": ""vector""}, {""groupId"": null, ""source"": ""vector""}, {""groupId"": null, ""source"": ""vector""}, {""groupId"": null, ""source"": ""vector""}, {""groupId"": null, ""source"": ""vector""}, {""groupId"": null, ""source"": ""vector""}, {""groupId"": null, ""source"": ""vector""}, {""groupId"": null, ""source"": ""vector""}, {""groupId"": null, ""source"": ""vector""}, {""groupId"": null, ""source"": ""vector""}]"


In [43]:
# Remove the row where src_groupId is '12025SE'
recs_df = recs_df[recs_df['src_groupId'] != '12025SE']


In [39]:
# Find rows where both CF and Vector recommendations are present
# Find rows where both CF and Vector recommendations are present in the recs list
def has_both_cf_and_vector(rec_json):
    try:
        recs = json.loads(rec_json)
        if isinstance(recs, list):
            sources = set()
            for r in recs:
                if isinstance(r, dict) and "source" in r:
                    sources.add(r["source"])
            return "cf" in sources and "vector" in sources
        return False
    except Exception:
        return False

both_cf_and_vector_df = recs_df[recs_df['recs'].apply(has_both_cf_and_vector)]

with pd.option_context('display.max_colwidth', None, 'display.width', 2000):
    display(both_cf_and_vector_df)


Unnamed: 0,src_groupId,recs
6,260141,"[{""groupId"": ""265298"", ""source"": ""cf""}, {""groupId"": ""260893"", ""source"": ""vector""}, {""groupId"": ""260158"", ""source"": ""vector""}, {""groupId"": ""260174"", ""source"": ""vector""}, {""groupId"": ""260018"", ""source"": ""vector""}, {""groupId"": ""262287"", ""source"": ""vector""}, {""groupId"": ""293068"", ""source"": ""vector""}, {""groupId"": ""260240"", ""source"": ""vector""}, {""groupId"": ""291278"", ""source"": ""vector""}, {""groupId"": ""291294"", ""source"": ""vector""}]"
17,290134,"[{""groupId"": ""200304"", ""source"": ""cf""}, {""groupId"": ""292052"", ""source"": ""vector""}, {""groupId"": ""293167"", ""source"": ""vector""}, {""groupId"": ""294082"", ""source"": ""vector""}, {""groupId"": ""293076"", ""source"": ""vector""}, {""groupId"": ""293910"", ""source"": ""vector""}, {""groupId"": ""293639"", ""source"": ""vector""}, {""groupId"": ""290255"", ""source"": ""vector""}, {""groupId"": ""290209"", ""source"": ""vector""}, {""groupId"": ""262428"", ""source"": ""vector""}]"
22,260484,"[{""groupId"": ""260223"", ""source"": ""cf""}, {""groupId"": ""260646"", ""source"": ""cf""}, {""groupId"": ""264275"", ""source"": ""cf""}, {""groupId"": ""260551"", ""source"": ""vector""}, {""groupId"": ""261490"", ""source"": ""vector""}, {""groupId"": ""260234"", ""source"": ""vector""}, {""groupId"": ""261813"", ""source"": ""vector""}, {""groupId"": ""260236"", ""source"": ""vector""}, {""groupId"": ""260235"", ""source"": ""vector""}, {""groupId"": ""261798"", ""source"": ""vector""}]"
27,290183,"[{""groupId"": ""261637"", ""source"": ""cf""}, {""groupId"": ""342760"", ""source"": ""vector""}, {""groupId"": ""292771"", ""source"": ""vector""}, {""groupId"": ""293118"", ""source"": ""vector""}, {""groupId"": ""292219"", ""source"": ""vector""}, {""groupId"": ""293415"", ""source"": ""vector""}, {""groupId"": ""470021"", ""source"": ""vector""}, {""groupId"": ""445897"", ""source"": ""vector""}, {""groupId"": ""290181"", ""source"": ""vector""}, {""groupId"": ""291831"", ""source"": ""vector""}]"
30,261916,"[{""groupId"": ""200304"", ""source"": ""cf""}, {""groupId"": ""241562"", ""source"": ""cf""}, {""groupId"": ""260646"", ""source"": ""cf""}, {""groupId"": ""260931"", ""source"": ""cf""}, {""groupId"": ""261436"", ""source"": ""cf""}, {""groupId"": ""261637"", ""source"": ""cf""}, {""groupId"": ""261699"", ""source"": ""cf""}, {""groupId"": ""261920"", ""source"": ""cf""}, {""groupId"": ""261924"", ""source"": ""cf""}, {""groupId"": ""260564"", ""source"": ""vector""}]"
...,...,...
814,503392,"[{""groupId"": ""503380"", ""source"": ""cf""}, {""groupId"": ""503397"", ""source"": ""vector""}, {""groupId"": ""503373"", ""source"": ""vector""}, {""groupId"": ""512368"", ""source"": ""vector""}, {""groupId"": ""512365"", ""source"": ""vector""}, {""groupId"": ""507871"", ""source"": ""vector""}, {""groupId"": ""537323"", ""source"": ""vector""}, {""groupId"": ""503386"", ""source"": ""vector""}, {""groupId"": ""507707"", ""source"": ""vector""}, {""groupId"": ""500355"", ""source"": ""vector""}]"
822,507707,"[{""groupId"": ""503380"", ""source"": ""cf""}, {""groupId"": ""507871"", ""source"": ""cf""}, {""groupId"": ""503386"", ""source"": ""vector""}, {""groupId"": ""579009"", ""source"": ""vector""}, {""groupId"": ""190041"", ""source"": ""vector""}, {""groupId"": ""525937"", ""source"": ""vector""}, {""groupId"": ""576223"", ""source"": ""vector""}, {""groupId"": ""521879"", ""source"": ""vector""}, {""groupId"": ""549005"", ""source"": ""vector""}, {""groupId"": ""537323"", ""source"": ""vector""}]"
867,546181,"[{""groupId"": ""218982"", ""source"": ""cf""}, {""groupId"": ""260646"", ""source"": ""cf""}, {""groupId"": ""431017"", ""source"": ""cf""}, {""groupId"": ""546180"", ""source"": ""vector""}, {""groupId"": ""541423"", ""source"": ""vector""}, {""groupId"": ""541437"", ""source"": ""vector""}, {""groupId"": ""541420"", ""source"": ""vector""}, {""groupId"": ""541435"", ""source"": ""vector""}, {""groupId"": ""541428"", ""source"": ""vector""}, {""groupId"": ""541419"", ""source"": ""vector""}]"
942,530341,"[{""groupId"": ""530330"", ""source"": ""cf""}, {""groupId"": ""530335"", ""source"": ""cf""}, {""groupId"": ""544920"", ""source"": ""vector""}, {""groupId"": ""530347"", ""source"": ""vector""}, {""groupId"": ""544482"", ""source"": ""vector""}, {""groupId"": ""544976"", ""source"": ""vector""}, {""groupId"": ""549502"", ""source"": ""vector""}, {""groupId"": ""551193"", ""source"": ""vector""}, {""groupId"": ""530292"", ""source"": ""vector""}, {""groupId"": ""530507"", ""source"": ""vector""}]"


In [44]:
# Find rows where all recommendations are from 'cf' source
def all_recs_are_cf(rec_json):
    try:
        recs = json.loads(rec_json)
        if isinstance(recs, list) and recs:
            return all(isinstance(r, dict) and r.get("source") == "cf" for r in recs)
        return False
    except Exception:
        return False

all_cf_df = recs_df[recs_df['recs'].apply(all_recs_are_cf)]

with pd.option_context('display.max_colwidth', None, 'display.width', 2000):
    display(all_cf_df)


Unnamed: 0,src_groupId,recs
2,265298,"[{""groupId"": ""200304"", ""source"": ""cf""}, {""groupId"": ""200400"", ""source"": ""cf""}, {""groupId"": ""210186"", ""source"": ""cf""}, {""groupId"": ""210338"", ""source"": ""cf""}, {""groupId"": ""210695"", ""source"": ""cf""}, {""groupId"": ""210756"", ""source"": ""cf""}, {""groupId"": ""218982"", ""source"": ""cf""}, {""groupId"": ""240144"", ""source"": ""cf""}, {""groupId"": ""240184"", ""source"": ""cf""}, {""groupId"": ""240187"", ""source"": ""cf""}]"
3,260596,"[{""groupId"": ""200304"", ""source"": ""cf""}, {""groupId"": ""210186"", ""source"": ""cf""}, {""groupId"": ""210338"", ""source"": ""cf""}, {""groupId"": ""210695"", ""source"": ""cf""}, {""groupId"": ""210756"", ""source"": ""cf""}, {""groupId"": ""218982"", ""source"": ""cf""}, {""groupId"": ""221416"", ""source"": ""cf""}, {""groupId"": ""240184"", ""source"": ""cf""}, {""groupId"": ""240187"", ""source"": ""cf""}, {""groupId"": ""240276"", ""source"": ""cf""}]"
4,260951,"[{""groupId"": ""200304"", ""source"": ""cf""}, {""groupId"": ""200400"", ""source"": ""cf""}, {""groupId"": ""210338"", ""source"": ""cf""}, {""groupId"": ""218982"", ""source"": ""cf""}, {""groupId"": ""240184"", ""source"": ""cf""}, {""groupId"": ""240187"", ""source"": ""cf""}, {""groupId"": ""240276"", ""source"": ""cf""}, {""groupId"": ""241091"", ""source"": ""cf""}, {""groupId"": ""241687"", ""source"": ""cf""}, {""groupId"": ""260313"", ""source"": ""cf""}]"
7,260513,"[{""groupId"": ""200304"", ""source"": ""cf""}, {""groupId"": ""200400"", ""source"": ""cf""}, {""groupId"": ""210186"", ""source"": ""cf""}, {""groupId"": ""210338"", ""source"": ""cf""}, {""groupId"": ""210695"", ""source"": ""cf""}, {""groupId"": ""210756"", ""source"": ""cf""}, {""groupId"": ""218982"", ""source"": ""cf""}, {""groupId"": ""240184"", ""source"": ""cf""}, {""groupId"": ""240187"", ""source"": ""cf""}, {""groupId"": ""241091"", ""source"": ""cf""}]"
8,265249,"[{""groupId"": ""200304"", ""source"": ""cf""}, {""groupId"": ""200400"", ""source"": ""cf""}, {""groupId"": ""210186"", ""source"": ""cf""}, {""groupId"": ""210338"", ""source"": ""cf""}, {""groupId"": ""210695"", ""source"": ""cf""}, {""groupId"": ""218982"", ""source"": ""cf""}, {""groupId"": ""240184"", ""source"": ""cf""}, {""groupId"": ""240187"", ""source"": ""cf""}, {""groupId"": ""240276"", ""source"": ""cf""}, {""groupId"": ""241562"", ""source"": ""cf""}]"
...,...,...
321,241653,"[{""groupId"": ""200304"", ""source"": ""cf""}, {""groupId"": ""210338"", ""source"": ""cf""}, {""groupId"": ""210756"", ""source"": ""cf""}, {""groupId"": ""218982"", ""source"": ""cf""}, {""groupId"": ""240012"", ""source"": ""cf""}, {""groupId"": ""240184"", ""source"": ""cf""}, {""groupId"": ""240187"", ""source"": ""cf""}, {""groupId"": ""241091"", ""source"": ""cf""}, {""groupId"": ""241562"", ""source"": ""cf""}, {""groupId"": ""241687"", ""source"": ""cf""}]"
353,261012,"[{""groupId"": ""200258"", ""source"": ""cf""}, {""groupId"": ""200400"", ""source"": ""cf""}, {""groupId"": ""210186"", ""source"": ""cf""}, {""groupId"": ""210695"", ""source"": ""cf""}, {""groupId"": ""210726"", ""source"": ""cf""}, {""groupId"": ""218982"", ""source"": ""cf""}, {""groupId"": ""240187"", ""source"": ""cf""}, {""groupId"": ""241562"", ""source"": ""cf""}, {""groupId"": ""242289"", ""source"": ""cf""}, {""groupId"": ""260557"", ""source"": ""cf""}]"
358,281410,"[{""groupId"": ""200304"", ""source"": ""cf""}, {""groupId"": ""210338"", ""source"": ""cf""}, {""groupId"": ""240184"", ""source"": ""cf""}, {""groupId"": ""240187"", ""source"": ""cf""}, {""groupId"": ""241091"", ""source"": ""cf""}, {""groupId"": ""241562"", ""source"": ""cf""}, {""groupId"": ""241687"", ""source"": ""cf""}, {""groupId"": ""260695"", ""source"": ""cf""}, {""groupId"": ""261595"", ""source"": ""cf""}, {""groupId"": ""261610"", ""source"": ""cf""}]"
362,240012,"[{""groupId"": ""210338"", ""source"": ""cf""}, {""groupId"": ""218982"", ""source"": ""cf""}, {""groupId"": ""240184"", ""source"": ""cf""}, {""groupId"": ""240187"", ""source"": ""cf""}, {""groupId"": ""241091"", ""source"": ""cf""}, {""groupId"": ""241653"", ""source"": ""cf""}, {""groupId"": ""241687"", ""source"": ""cf""}, {""groupId"": ""242024"", ""source"": ""cf""}, {""groupId"": ""260596"", ""source"": ""cf""}, {""groupId"": ""261637"", ""source"": ""cf""}]"


In [46]:
# Save the recs_df DataFrame to a CSV file in the predictions directory
recs_df.to_csv("../data/predictions/cf_and_vector_recs.csv", index=False)
