In [169]:
from spark.utils import get_channel_names
import spark.params as params
from spark.preprocessor import load_q_data, load_timeseries_data, train_test_split_ids, Preprocess_Q
from tsfresh.feature_extraction import extract_features, MinimalFCParameters
from tsfresh.feature_selection import select_features
from sklearn.preprocessing import MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight, compute_sample_weight

import pandas as pd
import numpy as np

from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import balanced_accuracy_score, classification_report

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
raw_prep_dir  = '../processed_data/'

In [3]:
X_q_data, y_data, id_list = load_q_data(raw_prep_dir + 'merged_dfq.csv')

In [4]:
id_list

384      1
120      2
3        3
212      4
153      5
      ... 
51     465
430    466
397    467
96     468
23     469
Name: id, Length: 469, dtype: int64

In [5]:
X_q_data.head()

Unnamed: 0,id,age,age_at_diagnosis,bmi,height,weight,gender,handedness,appearance_in_kinship,01,...,21,22,23,24,25,26,27,28,29,30
384,1,56,56,26.061679,173,78,male,right,True,False,...,False,False,False,False,False,False,False,False,False,False
120,2,81,69,27.920213,193,104,male,right,False,True,...,True,True,True,False,True,False,True,False,True,False
3,3,45,45,26.989619,170,78,female,right,False,False,...,False,False,False,False,False,False,False,False,False,False
212,4,67,63,34.720883,161,90,female,right,False,False,...,True,True,True,False,False,True,True,True,False,False
153,5,75,65,29.069767,172,86,male,left,False,True,...,False,True,True,True,True,True,True,False,False,False


In [8]:
time_data = load_timeseries_data('../processed_data/')

In [16]:
time_data.shape

(469, 132, 976)

In [10]:
X_train_id, X_test_id, y_train, y_test = train_test_split_ids (id_list, y_data, test_size = 0.2, random_state = None, stratify = True)

In [11]:
print(X_train_id.shape, X_test_id.shape)

(375,) (94,)


In [12]:
X_train_time = time_data.sel(id = list(X_train_id))
X_test_time = time_data.sel(id = list(X_test_id))
X_train_q = X_q_data[X_q_data.id.isin(X_train_id)]
X_test_q = X_q_data[X_q_data.id.isin(X_test_id)]
print(X_train_time.to_numpy().shape, X_train_q.shape)

(375, 132, 976) (375, 39)


In [63]:
X_train_q

Unnamed: 0,id,age,age_at_diagnosis,bmi,height,weight,gender,handedness,appearance_in_kinship,01,...,21,22,23,24,25,26,27,28,29,30
384,1,56,56,26.061679,173,78,male,right,True,False,...,False,False,False,False,False,False,False,False,False,False
120,2,81,69,27.920213,193,104,male,right,False,True,...,True,True,True,False,True,False,True,False,True,False
212,4,67,63,34.720883,161,90,female,right,False,False,...,True,True,True,False,False,True,True,True,False,False
153,5,75,65,29.069767,172,86,male,left,False,True,...,False,True,True,True,True,True,True,False,False,False
301,6,72,60,39.328340,171,115,female,right,False,False,...,False,False,True,False,False,True,True,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
70,464,51,36,20.085066,184,68,male,right,False,True,...,True,False,True,False,False,False,False,False,False,True
51,465,65,62,26.122449,175,80,male,right,True,True,...,False,False,False,False,True,False,False,False,False,False
430,466,84,84,25.013521,172,74,female,right,True,False,...,False,False,False,False,False,False,False,False,False,False
397,467,57,55,27.700831,190,100,male,right,False,False,...,False,False,True,False,False,False,False,False,False,False


In [64]:
final_col = ['age_at_diagnosis','02','03','age','20','gender','17','appearance_in_kinship','13','09']
X_train_q_top = X_train_q[final_col]
X_test_q_top = X_test_q[final_col]



In [65]:
X_train_q_top

Unnamed: 0,age_at_diagnosis,02,03,age,20,gender,17,appearance_in_kinship,13,09
384,56,False,False,56,False,male,False,True,False,False
120,69,True,False,81,True,male,True,False,False,True
212,63,True,False,67,False,female,False,False,False,True
153,65,True,True,75,True,male,False,False,False,False
301,60,True,False,72,True,female,False,False,True,True
...,...,...,...,...,...,...,...,...,...,...
70,36,False,True,51,False,male,False,False,False,False
51,62,False,False,65,False,male,False,True,False,True
430,84,False,False,84,False,female,False,True,False,True
397,55,False,False,57,False,male,False,False,True,True


In [13]:
assert((X_train_time.id.to_numpy() == X_train_q.id.to_numpy()).any)

In [68]:
prep = Preprocess_Q(feature_importance = True)
prep.fit(X_train_q_top)
X_train_q_prep = prep.transform(X_train_q_top)

X_test_q_prep = prep.transform(X_test_q_top)
pd.DataFrame(X_train_q_prep)
#pd.DataFrame(X_test_q_prep)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.181818,-0.6250,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.606061,0.9375,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0
2,0.242424,0.0625,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
3,0.363636,0.5625,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0
4,0.060606,0.3750,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
370,-1.393939,-0.9375,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
371,0.181818,-0.0625,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
372,1.515152,1.1250,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
373,-0.242424,-0.5625,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0


In [24]:
N, F, T = X_train_time.shape
feature_names = [f"feat_{i}" for i in range(F)]

def make_long_df(batch_data: np.ndarray, id_offset: int = 0) -> pd.DataFrame:

    n_batch = batch_data.shape[0]


    x = np.transpose(batch_data, (0, 2, 1))
    values = x.reshape(-1)

    ids = np.repeat(np.arange(id_offset, id_offset + n_batch), T * F)


    times = np.tile(np.repeat(np.arange(T), F), n_batch)


    kinds_idx = np.tile(np.arange(F), n_batch * T)
    kinds = np.array(feature_names, dtype=object)[kinds_idx]

    return pd.DataFrame({"id": ids, "time": times, "kind": kinds, "value": values})

In [26]:
type(X_train_time)

xarray.core.dataarray.DataArray

In [45]:
df_long_train = make_long_df(X_train_time.values, id_offset=0)
df_long_train

Unnamed: 0,id,time,kind,value
0,0,0,feat_0,0.002823
1,0,0,feat_1,-0.003085
2,0,0,feat_2,0.001840
3,0,0,feat_3,-0.001343
4,0,0,feat_4,0.005593
...,...,...,...,...
48311995,374,975,feat_127,-0.009145
48311996,374,975,feat_128,-0.101842
48311997,374,975,feat_129,0.326674
48311998,374,975,feat_130,0.243035


In [31]:
X_train_features = extract_features(
    df_long_train,
    column_id="id",
    column_sort="time",
    column_kind="kind",
    column_value="value",
    default_fc_parameters=MinimalFCParameters(),
    n_jobs=0
)

Feature Extraction: 100%|██████████| 49500/49500 [00:44<00:00, 1118.86it/s]


In [32]:
print(X_train_features.shape)

(375, 1320)


In [46]:
X_train_features.head()

Unnamed: 0,feat_0__sum_values,feat_0__median,feat_0__mean,feat_0__length,feat_0__standard_deviation,feat_0__variance,feat_0__root_mean_square,feat_0__maximum,feat_0__absolute_maximum,feat_0__minimum,...,feat_99__sum_values,feat_99__median,feat_99__mean,feat_99__length,feat_99__standard_deviation,feat_99__variance,feat_99__root_mean_square,feat_99__maximum,feat_99__absolute_maximum,feat_99__minimum
0,0.158396,8.5e-05,0.000162,976.0,0.001933,4e-06,0.00194,0.006761,0.007114,-0.007114,...,-62.399082,-0.137665,-0.063933,976.0,4.14371,17.170334,4.144204,11.884709,11.884709,-10.194261
1,-0.223625,-0.000125,-0.000229,976.0,0.001828,3e-06,0.001842,0.004373,0.007245,-0.007245,...,-52.867386,-0.011079,-0.054167,976.0,0.574344,0.329871,0.576893,2.452682,2.452682,-1.717518
2,-0.234269,-0.0003,-0.00024,976.0,0.00777,6e-05,0.007774,0.065478,0.065478,-0.045651,...,-130.762421,-0.484346,-0.133978,976.0,3.186254,10.152214,3.18907,9.6893,9.6893,-7.546668
3,-0.103706,-9.3e-05,-0.000106,976.0,0.004223,1.8e-05,0.004224,0.017589,0.01762,-0.01762,...,74.149582,-0.256583,0.075973,976.0,1.296385,1.680613,1.298609,4.084694,4.084694,-3.069789
4,-0.082906,-5.6e-05,-8.5e-05,976.0,0.002252,5e-06,0.002254,0.00652,0.007024,-0.007024,...,42.740337,-0.747186,0.043791,976.0,3.920198,15.367954,3.920443,11.733592,11.733592,-8.596663


In [36]:
y_train.unique

<bound method Series.unique of 360    1
337    0
135    2
177    1
270    1
      ..
319    0
336    1
284    1
429    2
318    2
Name: label, Length: 375, dtype: int64>

In [49]:
type(y_train)
y_train = np.array(y_train)

In [147]:
y_series = pd.Series(y_train, index=X_train_features.index)
classes = np.unique(y_series)


selected_sets = []
for c in classes:
    y_bin = (y_series == c).astype(int)
    sel_c = select_features(X_train_features, y_bin, fdr_level=0.05)
    selected_sets.append(set(sel_c.columns))

selected_cols = sorted(set().union(*selected_sets))
X_train_selected = X_train_features[selected_cols]
print(f"Selected {len(selected_cols)} / {X_train_features.shape[1]} features")






Selected 254 / 1320 features


In [148]:
X_train_selected.head()

Unnamed: 0,feat_100__maximum,feat_100__minimum,feat_100__root_mean_square,feat_100__standard_deviation,feat_100__variance,feat_101__median,feat_101__minimum,feat_105__median,feat_106__maximum,feat_106__root_mean_square,...,feat_84__variance,feat_85__root_mean_square,feat_85__standard_deviation,feat_85__variance,feat_8__standard_deviation,feat_8__variance,feat_90__root_mean_square,feat_90__standard_deviation,feat_90__variance,feat_95__median
0,4.959395,-4.134089,2.345905,2.258628,5.101401,-0.119833,-6.955344,-0.097912,2.495438,1.542564,...,0.001622,0.077882,0.077858,0.006062,0.001635,3e-06,0.038242,0.038242,0.001462,-0.007335
1,1.285755,-1.667742,0.482909,0.480259,0.230649,-0.000534,-4.571373,-0.081427,2.667811,1.107798,...,0.00037,0.033488,0.03347,0.00112,0.001208,1e-06,0.017995,0.017983,0.000323,0.002526
2,8.514721,-4.03378,2.23057,2.199674,4.838568,-0.01158,-7.702114,-0.332327,3.652837,1.926051,...,0.005546,0.082859,0.082855,0.006865,0.0031,1e-05,0.052015,0.052011,0.002705,0.003183
3,2.912025,-2.009546,0.864633,0.859268,0.738342,-0.126767,-3.534592,-0.111969,2.96433,1.003664,...,0.011725,0.128854,0.128853,0.016603,0.005321,2.8e-05,0.125043,0.125042,0.015636,0.005562
4,4.726487,-2.703126,2.117576,2.045352,4.183465,0.001328,-4.741699,-0.650558,1.806423,2.088197,...,0.000591,0.03549,0.035466,0.001258,0.001449,2e-06,0.017838,0.017824,0.000318,-0.004158


In [149]:
df_long_test = make_long_df(X_test_time.values, id_offset=0)

In [150]:
X_test_features = extract_features(
    df_long_test,
    column_id="id",
    column_sort="time",
    column_kind="kind",
    column_value="value",
    default_fc_parameters=MinimalFCParameters(),
    n_jobs=0
)

Feature Extraction: 100%|██████████| 12408/12408 [00:09<00:00, 1248.49it/s]


In [151]:
X_test_selected = X_test_features[selected_cols]

In [152]:
X_test_selected

Unnamed: 0,feat_100__maximum,feat_100__minimum,feat_100__root_mean_square,feat_100__standard_deviation,feat_100__variance,feat_101__median,feat_101__minimum,feat_105__median,feat_106__maximum,feat_106__root_mean_square,...,feat_84__variance,feat_85__root_mean_square,feat_85__standard_deviation,feat_85__variance,feat_8__standard_deviation,feat_8__variance,feat_90__root_mean_square,feat_90__standard_deviation,feat_90__variance,feat_95__median
0,3.895046,-2.148582,1.227881,1.223989,1.498150,0.136628,-4.329083,0.020443,1.636675,0.598446,...,0.000811,0.020065,0.020062,0.000402,0.003819,0.000015,0.027523,0.027521,0.000757,0.007986
1,1.632197,-1.945521,0.895854,0.888957,0.790244,-0.024904,-2.880561,-0.021151,2.709891,1.070150,...,0.002131,0.059436,0.059432,0.003532,0.002448,0.000006,0.041259,0.041254,0.001702,0.005641
2,3.188351,-3.342804,1.980489,1.980464,3.922239,-0.091721,-2.378431,0.314838,2.798137,1.597738,...,0.000571,0.037766,0.037751,0.001425,0.004587,0.000021,0.034479,0.034470,0.001188,0.006722
3,3.258142,-1.811927,1.163121,1.110821,1.233922,-0.573627,-3.676279,-0.147878,1.883852,1.401903,...,0.002846,0.104306,0.104298,0.010878,0.002394,0.000006,0.056255,0.056247,0.003164,0.023946
4,3.423152,-3.202383,1.467839,1.460166,2.132083,-0.030701,-4.477514,-0.028446,2.366535,1.052879,...,0.002438,0.043885,0.043884,0.001926,0.005955,0.000035,0.043877,0.043876,0.001925,-0.002161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,2.478085,-1.632913,0.960623,0.918903,0.844382,0.034982,-3.430696,-0.007807,1.305238,0.630293,...,0.000494,0.036333,0.036332,0.001320,0.002679,0.000007,0.016984,0.016969,0.000288,0.010412
90,3.356393,-3.939065,2.049004,2.048906,4.198016,-0.107830,-1.768520,-0.127655,4.610572,2.159753,...,0.006855,0.127334,0.127316,0.016209,0.001199,0.000001,0.086868,0.086863,0.007545,0.007881
91,2.913451,-1.781711,1.081082,1.068100,1.140838,-0.126960,-3.801286,-0.295345,1.602012,1.095033,...,0.000755,0.048651,0.048650,0.002367,0.149617,0.022385,0.029258,0.029251,0.000856,0.003043
92,4.734429,-2.355870,1.308812,1.284223,1.649229,-0.328255,-4.556039,-0.317510,3.150414,1.196615,...,0.000782,0.041659,0.041659,0.001735,0.001403,0.000002,0.022044,0.022004,0.000484,0.008648


In [153]:
scaler = MinMaxScaler()
X_train_time_scaled = scaler.fit_transform(X_train_selected)

In [154]:
X_train_time_scaled = pd.DataFrame(X_train_time_scaled)

In [155]:
X_test_time_scaled = scaler.transform(X_test_selected)

In [156]:
X_test_time_scaled = pd.DataFrame(X_test_time_scaled)
X_test_time_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,244,245,246,247,248,249,250,251,252,253
0,0.452385,0.839070,0.280445,0.279751,0.081391,0.915886,0.545446,0.740097,0.171697,0.186204,...,0.002908,0.048459,0.048451,0.003471,0.007980,0.000105,0.035284,0.035292,0.001530,0.049171
1,0.184147,0.855333,0.202479,0.201017,0.042903,0.760406,0.702144,0.720421,0.301818,0.348863,...,0.007703,0.168055,0.168047,0.031647,0.004185,0.000039,0.055001,0.055005,0.003460,0.046193
2,0.368613,0.743429,0.457171,0.457525,0.213185,0.696093,0.756464,0.879355,0.312517,0.530791,...,0.002037,0.102229,0.102186,0.012678,0.010107,0.000154,0.045269,0.045267,0.002410,0.047567
3,0.376886,0.866032,0.265238,0.253156,0.067025,0.232242,0.616065,0.660475,0.201666,0.463261,...,0.010302,0.304359,0.304338,0.097781,0.004035,0.000037,0.076526,0.076527,0.006447,0.069443
4,0.396447,0.754675,0.336792,0.335253,0.115857,0.754826,0.529389,0.716970,0.260188,0.342907,...,0.008819,0.120817,0.120816,0.017185,0.013892,0.000264,0.058759,0.058769,0.003916,0.036283
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,0.284419,0.880368,0.217688,0.208055,0.045846,0.818048,0.642632,0.726733,0.131512,0.197186,...,0.001757,0.097874,0.097876,0.011732,0.004823,0.000048,0.020156,0.020145,0.000571,0.052253
90,0.388533,0.695677,0.473260,0.473609,0.228178,0.680587,0.822443,0.670041,0.532264,0.724591,...,0.024862,0.374312,0.374259,0.145777,0.000729,0.000004,0.120470,0.120473,0.015399,0.049039
91,0.336027,0.868451,0.245974,0.243116,0.061964,0.662174,0.602542,0.590718,0.167494,0.357443,...,0.002707,0.135294,0.135294,0.021156,0.411586,0.170652,0.037774,0.037775,0.001731,0.042894
92,0.551885,0.822469,0.299449,0.293906,0.089605,0.468420,0.520894,0.580233,0.355229,0.392472,...,0.002803,0.114055,0.114058,0.015472,0.001291,0.000008,0.027418,0.027373,0.000972,0.050012


In [157]:
X_combined_train_scaled = np.concatenate((X_train_time_scaled, X_train_q_prep), axis = 1)
X_combined_test_scaled = np.concatenate((X_test_time_scaled, X_test_q_prep), axis = 1)

In [158]:
X_combined_train = np.concatenate((X_train_selected, X_train_q_prep), axis = 1)
X_combined_test = np.concatenate((X_test_selected, X_test_q_prep), axis = 1)

In [159]:
X_combined_train_scaled.shape

(375, 264)

A base SVC


In [174]:
sv_model_combined = SVC(kernel = 'linear', class_weight = 'balanced', C = 10)
sv_model_combined.fit(X_combined_train_scaled, y_train)

0,1,2
,C,10
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,'balanced'


In [175]:
y_pred_combo_sv = sv_model_combined.predict(X_combined_test)

In [176]:
score1 = balanced_accuracy_score(y_test, y_pred_combo_sv)
score1

0.44021739130434784

In [170]:
sample_weights = compute_sample_weight(class_weight="balanced", y=y_train)


xgb = XGBClassifier()

xgb.fit(X_combined_train, y_train, sample_weight=sample_weights)

y_pred = xgb.predict(X_combined_test)



print("Balanced Accuracy:", balanced_accuracy_score(y_test, y_pred))

Balanced Accuracy: 0.5616765480895917


stacking models? 