In [None]:
import json
import pandas as pd
import ast
import re
from google.colab import files


file_path = "/content/full_processed_recommendation_dataset.json"

# The file is read as JSON Lines: every non-empty line is parsed as its own
# JSON document and becomes one row of the DataFrame.
with open(file_path, "r", encoding='utf-8') as f:
    # Skip blank / whitespace-only lines; json.loads("") would otherwise raise
    # JSONDecodeError and abort the whole load.
    records = [json.loads(line) for line in f if line.strip()]

df = pd.DataFrame(records)

#split
def split_column(df, column, new_cols):
    """Split a '|'-delimited string column into separate columns, in place.

    Each value of ``df[column]`` is split on ``'|'``; the i-th part, stripped of
    surrounding whitespace, is stored in ``df[new_cols[i]]``. The source column
    is then removed from ``df``.

    Args:
        df: DataFrame to modify in place.
        column: Name of the string column to split.
        new_cols: Names for the resulting columns, in part order.

    Returns:
        None. ``df`` is mutated.

    Notes:
        Parts beyond ``len(new_cols)`` are discarded. If no row contains enough
        parts to fill a requested column (or the frame is empty), that column is
        created with all-null values instead of raising ``KeyError``.
    """
    split_data = df[column].str.split('|', expand=True)
    for i, col_name in enumerate(new_cols):
        if i in split_data.columns:
            df[col_name] = split_data[i].str.strip()
        else:
            # The split produced fewer parts than requested names; keep the
            # schema stable by emitting an empty column.
            df[col_name] = None
    df.drop(column, axis=1, inplace=True)

# Composite '|'-delimited columns and the names of the parts each one expands into.
for source_column, part_names in (
    ("VerbalIQ", ["VerbalIQ_Level", "VerbalIQ_Response", "VerbalIQ_Confidence"]),
    ("DisorderDegree", ["DisorderDegree_Severity", "DisorderDegree_Confirmed"]),
    ("LearningAbility", ["LearningSpeed", "LearningMethod"]),
    ("GraspingPower", ["MemoryRetention", "SkillApplication"]),
):
    split_column(df, source_column, part_names)

# Separation: parse stringified list columns into Python lists
# Columns whose cells hold list literals that need parsing.
list_columns = [
    "DisorderType",
    "DiagnosisTypes",
    "Hobbies",
    "HobbiesList",
    "RecommendedPathways",
]

def clean_list_string(s):
    """Coerce a cell value into a Python list.

    Args:
        s: The raw cell value. Expected to be a string holding a Python list
            literal (e.g. ``"['a', 'b']"``), but any value is accepted.

    Returns:
        list: The parsed items. Values that are already a list/tuple/set are
        returned as a list rather than discarded. Anything that cannot be
        interpreted as a sequence literal (non-strings, malformed text, or a
        literal that is not a list/tuple/set) yields ``[]``.
    """
    # Already a sequence (e.g. the source record held a real array, not a string).
    if isinstance(s, (list, tuple, set)):
        return list(s)
    if not isinstance(s, str):
        return []
    try:
        parsed = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Only the errors literal_eval documents for malformed input are
        # treated as "not a list"; anything else should surface.
        return []
    # Guard against scalars/dicts: callers iterate the result as a list of labels.
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return []

# Replace each stringified list column with its parsed values.
for list_col in list_columns:
    parsed_values = df[list_col].apply(clean_list_string)
    df[list_col] = parsed_values

# Drop duplicate hobby column
df.drop("HobbiesList", axis=1, inplace=True)

#clean agegroup
def parse_age_group(age):
    """Reduce an age-group label to a single integer.

    Args:
        age: The age-group label; only ``str`` values are interpreted.

    Returns:
        int | None: With exactly two digit runs in the label, their floor
        midpoint; with exactly one, that number; otherwise (including
        non-string input) ``None``.
    """
    if not isinstance(age, str):
        return None

    # Normalise dash variants to '-'. The first literal looks like a
    # mis-encoded en dash (assumption - confirm against the raw data).
    normalized = age.replace('\u00c3\u00a2\u00c2\u0080\u00c2\u0093', '-')
    normalized = normalized.replace('\u2013', '-')

    digit_runs = re.findall(r'\d+', normalized)
    count = len(digit_runs)
    if count == 2:
        low, high = int(digit_runs[0]), int(digit_runs[1])
        return (low + high) // 2
    if count == 1:
        return int(digit_runs[0])
    return None

# Swap the textual AgeGroup column for its numeric form: pop() removes the
# source column and hands it back, and the parsed result is appended as a new column.
df['AgeGroupNumeric'] = df.pop('AgeGroup').apply(parse_age_group)

threshold = 0.9  # Drop a column if this fraction (or more) of its values is null

# Compare each column's null fraction with the threshold directly. The previous
# `thresh=int((1 - threshold) * len(df))` floored a float-rounded product
# (9 for 100 rows, 0 for fewer than 10 rows), so columns that were >= 90% null,
# even entirely null, could be kept.
null_fraction = df.isna().mean()
mostly_null_columns = null_fraction[null_fraction >= threshold].index
df.drop(columns=mostly_null_columns, inplace=True)


# Columns removed from the dataset when present; errors="ignore" skips any
# name that is not in the frame.
drop_columns = ["DiagnosisConfirmed", "ActivityPreference", "EngagementDuration"]
df.drop(columns=drop_columns, errors="ignore", inplace=True)


# One-hot encode RecommendedPathways
# Normalise every cell to a list once, so that building the vocabulary and
# testing membership operate on the same data. Previously only the vocabulary
# pass guarded against non-list cells; the membership test then raised
# TypeError on non-iterables or did substring matching on strings.
pathway_lists = [x if isinstance(x, list) else [] for x in df['RecommendedPathways']]

all_pathways = set()
for pathways in pathway_lists:
    all_pathways.update(pathways)

# One 0/1 indicator column per distinct pathway, in sorted order.
for path in sorted(all_pathways):
    colname = f"Pathway_{path}"
    df[colname] = [1 if path in pathways else 0 for pathways in pathway_lists]

df.drop(columns=["RecommendedPathways"], inplace=True)


# Write the cleaned frame to CSV (without the index column) and hand the file
# to the Colab download helper.
output_file = "cleaned_dataset_with_pathways.csv"
df.to_csv(path_or_buf=output_file, index=False)
files.download(output_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from google.colab import files



# Load the CSV whose list-valued columns are parsed and encoded below.
df = pd.read_csv(filepath_or_buffer="/content/cleaned_dataset_with_pathways.csv")


import ast

def parse_list_column(x):
    """Convert a CSV cell holding a list literal into a Python list.

    Args:
        x: The raw cell value: normally a string such as ``"['a', 'b']"`` or a
            missing value (NaN/None).

    Returns:
        list: The parsed items. Values that are already a list/tuple/set are
        returned as a list. Missing values, other non-strings, malformed text
        and literals that are not a list/tuple/set all yield ``[]``.
    """
    # Handle already-parsed sequences first: pd.isna() on a list returns an
    # array, whose truth value is ambiguous and would raise ValueError.
    if isinstance(x, (list, tuple, set)):
        return list(x)
    # Covers NaN/None as well as any other non-string scalar.
    if not isinstance(x, str):
        return []
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    # A scalar or dict literal is not a label list; the encoder downstream
    # expects an iterable of labels per row.
    if isinstance(parsed, (list, tuple, set)):
        return list(parsed)
    return []

# Parse every multi-label column from its stringified form back into lists.
df = df.assign(**{
    name: df[name].apply(parse_list_column)
    for name in ('Hobbies', 'DiagnosisTypes', 'DisorderType')
})

# One-hot encode each multi-label column
def one_hot_multilabel(df, column, prefix):
    """Replace a column of label lists with one indicator column per label.

    Args:
        df: Input DataFrame; it is not modified.
        column: Name of the column whose cells are iterables of labels.
        prefix: Prefix for the generated columns, named ``f"{prefix}_{label}"``.

    Returns:
        A new DataFrame without ``column`` and with the encoded indicator
        columns appended on the right.
    """
    mlb = MultiLabelBinarizer()
    encoded = pd.DataFrame(
        mlb.fit_transform(df[column]),
        columns=[f"{prefix}_{label}" for label in mlb.classes_],
        # Reuse the caller's index: concat(axis=1) aligns on index labels, so a
        # default RangeIndex here would misalign rows (and add NaN rows)
        # whenever df is not indexed 0..n-1.
        index=df.index,
    )
    df = pd.concat([df.drop(columns=[column]), encoded], axis=1)
    return df

# Encode each multi-label column in turn; every call returns a new frame with
# the source column replaced by prefixed indicator columns.
for source_column, label_prefix in (
    ('Hobbies', 'Hobby'),
    ('DiagnosisTypes', 'Diagnosis'),
    ('DisorderType', 'Disorder'),
):
    df = one_hot_multilabel(df, source_column, label_prefix)

# Write the encoded frame to CSV (without the index) and trigger the Colab download.
output_file = "final_dataset_with_multilabels.csv"
df.to_csv(path_or_buf=output_file, index=False)
files.download(output_file)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>