In [1]:
import pandas as pd
import pprint
import re

In [8]:
# Read website_structure.csv into a DataFrame and show the first rows.
# ('utf-8-sig' is the alternative codec if the file starts with a byte-order mark.)
df = pd.read_csv(
    'website_structure.csv',
    encoding='utf-8',
)
print(df.head())

   record_index                                          website  \
0             1  https://baizhankyzy.github.io/female-directors/   
1             1  https://baizhankyzy.github.io/female-directors/   
2             1  https://baizhankyzy.github.io/female-directors/   
3             1  https://baizhankyzy.github.io/female-directors/   
4             1  https://baizhankyzy.github.io/female-directors/   

            type               role chart_type chart_design_feature  \
0           text      project title        NaN                  NaN   
1           text       introduction        NaN                  NaN   
2           text  research question        NaN                  NaN   
3           text   data preparation        NaN                  NaN   
4  chart section                NaN       line                  NaN   

  chart_interaction_feature  \
0                       NaN   
1                       NaN   
2                       NaN   
3                       NaN   
4        

In [9]:
# Collect every distinct token that appears in the 'chart_interaction_feature'
# column. Each non-missing cell is treated as a ';'-separated list.
features = set()
for feature_list in df['chart_interaction_feature']:
    if pd.isna(feature_list):
        continue  # missing cell: nothing to collect
    for feature in feature_list.split(';'):
        feature = feature.strip()
        # A trailing or doubled ';' produces an empty piece; counting it would
        # inflate the number of "unique features" reported below.
        if feature:
            features.add(feature)
print(len(features), "unique features found:")
pprint.pp(features)

18 unique features found:
{'animation',
 'broken-y',
 'carousel',
 'click',
 'filters',
 'find-more',
 'flip',
 'hover',
 'link',
 'metadata',
 'on-demand',
 'pagination',
 'quiz',
 'scrollytelling',
 'search',
 'time-animation',
 'view-change',
 'zoom'}


In [10]:
# --- Config ------------------------------------------------------------------
# Spelling variants of a feature token -> the canonical token used in FAMILIES.
ALIASES = {
    "find-more": "search",
    "find_more": "search",
    "filter": "filters",  # normalize singular -> plural
    "time_animation": "time-animation",
}

# Family name -> the set of feature tokens that belong to that family.
FAMILIES = {
    "basic": {
        "click",
        "flip",
        "hover",
        "link",
    },
    "exploration": {
        "filters",
        "search",
        "zoom",
    },
    "navigation": {
        "pagination",
        "view-change",
    },
    "analytical": {
        "broken-y",
    },
    "on_demand": {
        "metadata",
        "on-demand",
    },
    "storytelling": {
        "animation",
        "carousel",
        "quiz",
        "scrollytelling",
        "time-animation",
    },
}

# Family name -> integer weight. interaction_level_capped() reports the largest
# weight among the families whose tokens are present.
FAMILY_WEIGHT = {
    "basic": 1,
    "exploration": 2,
    "navigation": 2,
    "analytical": 2,
    "on_demand": 3,
    "storytelling": 3,
}

# --- Helpers -----------------------------------------------------------------
import re

def normalize_token(tok: str) -> str:
    """Return the canonical spelling of one interaction-feature token.

    The token is stripped and lowercased, then looked up in ALIASES twice:
    first exactly as written (so alias keys spelled with '_' can match), then
    with every '_' replaced by '-'. If neither spelling is an alias, the
    dash-normalized token is returned unchanged.

    Args:
        tok: Raw token text. Anything that is not a ``str`` (None, NaN, numbers)
            is treated as "no token".

    Returns:
        The canonical token, or ``""`` for blank / non-string input.
    """
    # NaN floats are truthy, so a plain `tok or ""` guard would let them
    # through and crash on .strip(); check the type instead.
    if not isinstance(tok, str):
        return ""
    cleaned = tok.strip().lower()
    # Look up the written spelling first: rewriting '_' -> '-' beforehand would
    # make every underscore-keyed alias unreachable.
    if cleaned in ALIASES:
        return ALIASES[cleaned]
    dashed = cleaned.replace("_", "-")  # treat '_' and '-' the same
    return ALIASES.get(dashed, dashed)

def split_tokens(value):
    """Turn a raw interaction-feature value into a list of normalized tokens.

    Args:
        value: Either a string of tokens separated by commas, pipes,
            semicolons and/or whitespace, or a list/tuple of token strings.
            Any other type (e.g. the NaN float pandas uses for a missing
            cell) yields no tokens.

    Returns:
        A list of tokens passed through normalize_token(), in input order.
        Blank pieces and non-string items are dropped in both input forms.
    """
    if isinstance(value, (list, tuple)):
        # Keep only real strings so None / NaN items are skipped rather than
        # handed to normalize_token.
        pieces = [item for item in value if isinstance(item, str)]
    elif isinstance(value, str):
        # split on commas, pipes, semicolons, or whitespace
        pieces = re.split(r"[,\|\;]\s*|\s+", value)
    else:
        return []
    normalized = (normalize_token(piece) for piece in pieces if piece.strip())
    # Drop anything that normalizes to the empty string as well.
    return [tok for tok in normalized if tok]

def interaction_level_capped(value) -> int:
    """Score a raw feature value by its highest-weighted interaction family.

    The value is tokenized with split_tokens(); every token that belongs to a
    family in FAMILIES contributes that family's FAMILY_WEIGHT. The result is
    the largest contribution, or 0 when no token is recognized.
    """
    matched_weights = [
        FAMILY_WEIGHT[family]
        for token in split_tokens(value)
        for family, members in FAMILIES.items()
        if token in members
    ]
    # Seeding with 0 gives the "nothing recognized" result and keeps 0 as the floor.
    return max([0, *matched_weights])

# --- Apply correctly (IMPORTANT: apply to the *column*, not the whole row) ---
INTERACTION_COL = "chart_interaction_feature"

# Score each cell of the feature column and store the result as a new column.
df["interaction_level"] = df[INTERACTION_COL].apply(interaction_level_capped)

# Quick visual check of the first ten rows.
print("Added interaction_level. Sample:")
print(
    df.loc[:, ["chart_type", INTERACTION_COL, "interaction_level"]].head(10)
)

# Write the augmented table next to the notebook, without the index column.
out_path = "website_structure_with_interactions.csv"
df.to_csv(
    out_path,
    index=False,
    encoding="utf-8-sig",
)


Added interaction_level. Sample:
  chart_type chart_interaction_feature  interaction_level
0        NaN                       NaN                  0
1        NaN                       NaN                  0
2        NaN                       NaN                  0
3        NaN                       NaN                  0
4       line                       NaN                  0
5        bar                       NaN                  0
6        map                      zoom                  2
7   doughnut                       NaN                  0
8        bar                       NaN                  0
9        NaN                       NaN                  0
