In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

In [3]:
# Read the tabular train/test splits; both paths are relative to the working directory.
train_df, test_df = map(
    pd.read_csv,
    (
        "child-mind-institute-problematic-internet-use/train.csv",
        "child-mind-institute-problematic-internet-use/test.csv",
    ),
)

In [None]:
# A row is "conflicting" when it carries both a PAQ_A total and a PAQ_C total.
paq_a_present = train_df['PAQ_A-PAQ_A_Total'].notna()
paq_c_present = train_df['PAQ_C-PAQ_C_Total'].notna()
conflict_rows = train_df.loc[paq_a_present & paq_c_present]

# Only rebuild the training frame when there is at least one conflicting row to remove.
if len(conflict_rows) > 0:
    train_df = train_df.drop(index=conflict_rows.index)


In [5]:
def _coalesce_paq(frame):
    """Fold the PAQ_C columns into the PAQ_A columns and rename the result to PAQ-*.

    Missing PAQ_A values are filled from the matching PAQ_C column, the PAQ_C
    columns are then dropped, and the surviving columns are renamed to
    'PAQ-PAQ_Total' and 'PAQ-Season'. Returns a new frame.
    """
    merged = frame.assign(**{
        # Store the merged result back into the PAQ_A columns.
        'PAQ_A-PAQ_A_Total': frame['PAQ_A-PAQ_A_Total'].fillna(frame['PAQ_C-PAQ_C_Total']),
        'PAQ_A-Season': frame['PAQ_A-Season'].fillna(frame['PAQ_C-Season']),
    })
    # The PAQ_C columns are redundant once their values have been merged in.
    merged = merged.drop(columns=['PAQ_C-PAQ_C_Total', 'PAQ_C-Season'])
    return merged.rename(columns={
        'PAQ_A-Season': 'PAQ-Season',
        'PAQ_A-PAQ_A_Total': 'PAQ-PAQ_Total',
    })


train_df = _coalesce_paq(train_df)
test_df = _coalesce_paq(test_df)


In [6]:
# Keep only the columns with at least len(train_df) - 3000 non-missing values,
# i.e. columns that have at most 3000 missing entries.
# NOTE(review): `df` is not referenced by any later cell visible here — confirm it is still needed.
min_non_null = len(train_df) - 3000
df = train_df.dropna(axis='columns', thresh=min_non_null)

In [7]:
def process_file(filename, dirname):
    """Summarise one time-series partition as a flat vector of statistics.

    Reads `<dirname>/<filename>/part-0.parquet`, discards its 'step' column and
    flattens the `describe()` table of the remaining columns into a 1-D array.

    Returns:
        A tuple `(stats, key)` where `stats` is the flattened describe() array
        and `key` is the second '='-separated field of `filename`.
    """
    parquet_path = os.path.join(dirname, filename, 'part-0.parquet')
    series = pd.read_parquet(parquet_path).drop(columns='step')
    summary = series.describe().to_numpy().ravel()
    return summary, filename.split('=')[1]

In [8]:
def load_time_series(dirname) -> pd.DataFrame:
    """Build one row of summary statistics per entry found in `dirname`.

    Every entry of `dirname` is passed to `process_file` on a thread pool; the
    flattened statistics become columns `Stat_0 .. Stat_{k-1}` and the key
    returned by `process_file` is stored in the `id` column.

    Args:
        dirname: Directory whose entries are the per-id partition folders.

    Returns:
        A DataFrame with one row per entry. When `dirname` has no entries, an
        empty frame that only contains an `id` column is returned, so a later
        merge on `id` still works.
    """
    # os.listdir gives no ordering guarantee; sort so the row order is reproducible.
    ids = sorted(os.listdir(dirname))

    # Nothing to summarise: bail out before zip(*results) fails on an empty list.
    if not ids:
        return pd.DataFrame({'id': pd.Series(dtype=object)})

    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))

    stats, indexes = zip(*results)

    # The width of the first row decides how many Stat_* columns are created.
    df = pd.DataFrame(stats, columns=[f"Stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes

    return df

In [9]:
# Drop the rows whose 'sii' value is missing.
train_df = train_df[train_df['sii'].notna()]

In [10]:
# Some PCIAT items are blank, which can affect the resulting SII. Rows whose SII
# could change if the blank items were filled in are flagged (NaN) for removal.
PCIAT_cols = [f'PCIAT-PCIAT_{item:02d}' for item in range(1, 21)]


def IncorrectRows(row):
    """Recompute the sii class of a row, or NaN when it is not certain.

    The class is taken from the band that 'PCIAT-PCIAT_Total' falls in
    (<=30 -> 0, 31-49 -> 1, 50-79 -> 2, >=80 -> 3). It is only returned when
    the total plus 5 points for every blank PCIAT item still stays inside that
    band; otherwise NaN is returned. A missing total, or a total that falls
    between two bands, also yields NaN.
    """
    total = row['PCIAT-PCIAT_Total']
    if pd.isna(total):
        return np.nan

    # Upper bound of the total if every blank item had been answered with 5
    # (presumably the highest score an item can take — confirm).
    unanswered = row[PCIAT_cols].isna().sum()
    max_possible = total + unanswered * 5

    # (lowest total, highest total) of the bands for sii = 0, 1, 2, 3.
    bands = ((-np.inf, 30), (31, 49), (50, 79), (80, np.inf))
    for sii, (low, high) in enumerate(bands):
        if low <= total <= high and max_possible <= high:
            return sii
    return np.nan

# Recompute the sii class row by row; rows where it is uncertain get NaN.
recal_sii = train_df.apply(IncorrectRows, axis='columns')
train_df['recal_sii'] = recal_sii

In [11]:
# Rows whose recomputed class disagrees with the recorded 'sii'. A NaN
# recomputed class never compares equal, so uncertain rows are caught as well.
disagrees = train_df['recal_sii'].ne(train_df['sii'])
has_label = train_df['sii'].notna()
mismatch_rows = train_df.loc[disagrees & has_label]
mismatch_indexes = mismatch_rows.index

# Remove those rows, then the helper column that is no longer needed.
train_df = train_df.drop(index=mismatch_indexes).drop(columns=['recal_sii'])

In [12]:
# Encode the season columns as integers.
SEASON_COLS = [
    "Basic_Demos-Enroll_Season",
    "CGAS-Season",
    "Physical-Season",
    "Fitness_Endurance-Season",
    "FGC-Season",
    "BIA-Season",
    "PAQ-Season",
    "SDS-Season",
    "PreInt_EduHx-Season",
]


def update(df):
    """Fill blank seasons with 'Missing' and make every season column categorical.

    The frame is modified in place and also returned.
    """
    for season_col in SEASON_COLS:
        df[season_col] = df[season_col].fillna('Missing').astype('category')
    return df


train_df = update(train_df)
test_df = update(test_df)

season_mapping = {
    'Spring': 0,
    'Summer': 1,
    'Fall': 2,
    'Winter': 3,
    'Missing': 4,
}
for col in SEASON_COLS:
    train_df[col], test_df[col] = (
        train_df[col].map(season_mapping),
        test_df[col].map(season_mapping),
    )

# 'PCIAT-Season' is not in SEASON_COLS, so its blanks were not replaced by
# 'Missing' and remain NaN after the mapping. Only train_df is mapped here.
train_df['PCIAT-Season'] = train_df['PCIAT-Season'].map(season_mapping)

In [13]:
# Impute missing values with per-column medians learned from the training data.
train_id = train_df['id']
test_id = test_df['id']
train_features = train_df.drop(columns=['id'])
test_features = test_df.drop(columns=['id'])

imputer = SimpleImputer(strategy='median')
train_features_imputed = pd.DataFrame(
    imputer.fit_transform(train_features),
    columns=train_features.columns,
    index=train_features.index,
)

# The test frame must be filled with the *training* medians: refitting on the
# test set would impute train and test with different statistics. The fitted
# `imputer` cannot be reused directly because train_features has columns the
# test frame lacks, so a second imputer is fitted on the shared columns only
# (the per-column medians are the same as in `imputer`).
# Assumes every test column also exists in train_df — a KeyError here means it does not.
shared_cols = test_features.columns
test_imputer = SimpleImputer(strategy='median').fit(train_features[shared_cols])
test_features_imputed = pd.DataFrame(
    test_imputer.transform(test_features),
    columns=shared_cols,
    index=test_features.index,
)

# Re-attach the id column that was set aside before imputing.
train_df = pd.concat([train_id, train_features_imputed], axis=1)
test_df = pd.concat([test_id, test_features_imputed], axis=1)


In [None]:
# Find the columns that correlate weakly with PCIAT_Total so they can be dropped.
corr_candidates = [
    'PCIAT-PCIAT_Total',
    'Basic_Demos-Age', 'Basic_Demos-Sex',
    'Physical-BMI', 'Physical-Height', 'Physical-Weight',
    'Physical-Waist_Circumference', 'Physical-Diastolic_BP',
    'Physical-Systolic_BP', 'Physical-HeartRate',
    'PreInt_EduHx-computerinternet_hoursday',
    'SDS-SDS_Total_T',
    'PAQ-PAQ_Total',
    'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins',
    'Fitness_Endurance-Time_Sec',
    'FGC-FGC_CU', 'FGC-FGC_GSND', 'FGC-FGC_GSD', 'FGC-FGC_PU',
    'FGC-FGC_SRL', 'FGC-FGC_SRR', 'FGC-FGC_TL',
    'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
    'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
    'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
    'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
    'BIA-BIA_TBW',
]

train_cor = train_df.drop(columns='id')
test_cor = test_df.drop(columns='id')

# Pairwise correlations of the candidate columns, computed on the training data only.
corr_matrix = train_cor[corr_candidates].corr()

# Correlation of every candidate with PCIAT_Total (its self-correlation removed).
sii_corr = corr_matrix['PCIAT-PCIAT_Total'].drop('PCIAT-PCIAT_Total')

# |r| > 0.1 counts as correlated; everything within [-0.1, 0.1] is weak.
corr_strength = sii_corr.abs()
filtered_corr = sii_corr[corr_strength > 0.1]
other_corr = sii_corr[corr_strength <= 0.1]
other_corr_columns = other_corr.index.tolist()

Basic_Demos-Sex               -0.094312
Physical-Diastolic_BP          0.066374
Physical-HeartRate            -0.035771
PAQ-PAQ_Total                 -0.042217
Fitness_Endurance-Max_Stage   -0.020330
Fitness_Endurance-Time_Mins   -0.038346
Fitness_Endurance-Time_Sec     0.001800
FGC-FGC_SRL                   -0.073663
FGC-FGC_SRR                   -0.064219
BIA-BIA_Activity_Level_num     0.075633
BIA-BIA_BMC                   -0.007859
BIA-BIA_BMR                    0.028779
BIA-BIA_DEE                    0.041886
BIA-BIA_ECW                    0.027491
BIA-BIA_FFM                    0.028779
BIA-BIA_FFMI                   0.085982
BIA-BIA_FMI                    0.066753
BIA-BIA_Fat                    0.031164
BIA-BIA_ICW                    0.041286
BIA-BIA_LDM                    0.019975
BIA-BIA_LST                    0.059496
BIA-BIA_SMM                    0.041344
BIA-BIA_TBW                    0.033559
Name: PCIAT-PCIAT_Total, dtype: float64


In [15]:
# Remove the weakly correlated columns from both frames.
train_df = train_df.drop(other_corr_columns, axis='columns')
test_df = test_df.drop(other_corr_columns, axis='columns')

In [16]:
# Load the per-id summary statistics computed from the parquet time series.
train_ts, test_ts = map(
    load_time_series,
    (
        "child-mind-institute-problematic-internet-use/series_train.parquet",
        "child-mind-institute-problematic-internet-use/series_test.parquet",
    ),
)

  0%|          | 0/996 [00:00<?, ?it/s]

100%|██████████| 996/996 [01:21<00:00, 12.22it/s]
100%|██████████| 2/2 [00:00<00:00, 11.38it/s]


In [17]:
# PCIAT columns (season, items 01-20 and the total) that are removed from the
# training frame; test_df is not touched here.
TARGET_COLS = (
    ["PCIAT-Season"]
    + [f"PCIAT-PCIAT_{item:02d}" for item in range(1, 21)]
    + ["PCIAT-PCIAT_Total"]
)
train_df = train_df.drop(columns=TARGET_COLS)

In [18]:
# Left-join the time-series statistics on 'id' so rows without a series are kept.
train_df = train_df.merge(train_ts, how="left", on='id')
test_df = test_df.merge(test_ts, how="left", on='id')

In [19]:
# 'id' was only needed as the join key; it is not a model feature.
train_df = train_df.drop(columns='id')
test_df = test_df.drop(columns='id')

In [20]:
# Replace every remaining missing value with 0 (e.g. the Stat_* columns of rows
# that had no matching time series in the left join).
train_df = train_df.fillna(value=0)
test_df = test_df.fillna(value=0)

In [None]:
from imblearn.over_sampling import SMOTE

# 'sii' is assumed to be the target column; everything else is a feature.
y_train = train_df['sii']
X_train = train_df.drop('sii', axis='columns')

# Oversample with SMOTE.
# NOTE(review): the displayed result contains fractional values in the
# integer-encoded season columns, so SMOTE interpolates them like numeric
# features — confirm this is intended.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [26]:
# Display the oversampled feature matrix returned by SMOTE.
X_train_resampled

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Systolic_BP,...,Stat_86,Stat_87,Stat_88,Stat_89,Stat_90,Stat_91,Stat_92,Stat_93,Stat_94,Stat_95
0,2.000000,5.000000,3.000000,51.000000,2.000000,16.877316,46.000000,50.800000,26.000000,114.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000e+00,0.0,0.0,0.000000
1,1.000000,9.000000,4.000000,65.000000,2.000000,14.035590,48.000000,46.000000,22.000000,122.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000e+00,0.0,0.0,0.000000
2,1.000000,10.000000,2.000000,71.000000,2.000000,16.648696,56.500000,75.600000,26.000000,117.000000,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000e+00,0.0,0.0,0.000000
3,3.000000,9.000000,2.000000,71.000000,1.000000,18.292347,56.000000,81.600000,26.000000,117.000000,...,1.546979,4.004276,89.751656,0.0,2633.250000,4188.500000,8.611000e+13,7.0,3.0,85.000000
4,0.000000,13.000000,3.000000,50.000000,1.000000,22.279952,59.500000,112.200000,26.000000,102.000000,...,1.146284,2.952888,89.476036,1.0,2597.800049,4175.000000,8.639500e+13,7.0,3.0,91.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6339,1.299801,12.299801,0.899402,77.001995,2.299801,27.994593,65.599601,171.102593,26.000000,130.303790,...,1.030342,1.382063,88.721179,1.0,2209.489687,4190.601596,8.639500e+13,7.0,1.0,70.213165
6340,0.353541,14.292917,1.000000,45.000000,1.707083,28.676977,66.918187,182.725328,26.000000,144.495192,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000e+00,0.0,0.0,0.000000
6341,0.974194,14.000000,3.948389,65.077417,1.077417,20.305357,61.332264,108.727772,26.000000,127.638722,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000e+00,0.0,0.0,0.000000
6342,1.847061,17.000000,3.000000,53.058783,2.000000,22.445850,66.464713,141.155331,26.000000,128.847061,...,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000e+00,0.0,0.0,0.000000


In [27]:
# Display the oversampled 'sii' targets returned by SMOTE.
y_train_resampled

0       2.0
1       0.0
2       0.0
3       1.0
4       1.0
       ... 
6339    3.0
6340    3.0
6341    3.0
6342    3.0
6343    3.0
Name: sii, Length: 6344, dtype: float64

In [28]:
X_train = X_train_resampled
y_train = y_train_resampled

# Select the features by training column name into a separate frame.
# `X_test = test_df` would only alias test_df, so writing the 'sii' predictions
# below would add a column to the feature matrix and break a re-run of this cell.
# Selecting by name also guarantees the same column order as at fit time.
X_test = test_df[X_train.columns]

model = RandomForestClassifier(random_state=0)
model.fit(X_train, y_train)

# Store the predicted class next to the test rows for the submission step.
test_df['sii'] = model.predict(X_test)

In [29]:
# Pair each test id with its predicted class and store the class as an integer.
# The two Series are aligned on their index labels.
submit_df = pd.concat([test_id, test_df['sii']], axis='columns').astype({'sii': int})

In [33]:
# Display the submission table (id and predicted sii).
submit_df

Unnamed: 0,id,sii
0,00008ff9,2
1,000fd460,0
2,00105258,0
3,00115b9f,1
4,0016bb22,2
5,001f3379,1
6,0038ba98,0
7,0068a485,0
8,0069fbed,0
9,0083e397,0


In [34]:
# Write the submission table to 'submission.csv' without the index column.
submit_df.to_csv('submission.csv', index=False)