In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
from tqdm import tqdm
tqdm.pandas()
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from scipy.stats.stats import pearsonr 
from sklearn.decomposition import KernelPCA

# Algorithm Description

At the dawn of time, the preprocessing consisted of computing the correlation matrix between the different $\texttt{ROIs}$ (SDS HW03 anyone?), excluding some unnecessary variables (like `var_1`, `var_2` and `var_3`), and training a good old `SVM`. 

The first big breakthrough came thanks to the *VEGA* algorithm, found in [this](https://doi.org/10.1101/2021.03.18.435935) paper. The idea is that we need a method to know which features are *really* relevant, above the noise level. This is done by computing the so-called *Ricci-Forman curvature* for each edge. This allows us, after some steps, to get a sense of which nodes ($\texttt{ROIs}$) in the correlation graph are most important for the cohesion of the network. 
In this way we obtain a correlation dataframe of shape (600, 1653), with one column per pair of selected ROIs.


After selecting this subset of ROIs, we aggregate the samples according to their labels (intelligence scores) in order to better approximate the correlation between the different ROIs of the brains of people sharing the same score.
This reduces the correlation dataframe to shape (99, 1653), where 99 is the number of unique labels.


In order to reduce the number of features for each sample even further, we use `KernelPCA`.
After the kernel PCA projection the resulting correlation dataframe has a shape of (99, 24).



In [None]:
# Load the training table used by every later cell of the notebook.
train_csv_path = '../input/statistical-learning-sapienza-spring-2021/train.csv'
train_df = pd.read_csv(train_csv_path)

In [None]:
# Distinct values of the target column `y`, in order of first appearance.
# Later cells build one aggregated feature row per entry and rely on this order.
target_values = train_df.loc[:, 'y'].unique()

In [None]:
# Per-subject, per-ROI curvature score.
#
# For every training row the values after the first five columns (presumably
# id, y and var1-3 -- confirm against the CSV header) are reshaped into a
# 116 x 115 matrix (one row per ROI), turned into a 116 x 116 correlation
# matrix, floored at `epsi`, and converted into an edge-level curvature matrix
# `Ric`. The score of a ROI is the mean of its row of `Ric`.
epsi=1/116  # lower bound applied to every correlation so all edge weights stay strictly positive

# Pull the signal block out of the DataFrame once: iterating a float matrix is
# far cheaper than `iterrows()` on a frame this wide.
roi_signals=train_df.iloc[:,5:].to_numpy(dtype=float)

Ric_subjects=[]
for signals in tqdm(roi_signals):
    subject = signals.reshape((116, 115))
    corr = np.corrcoef(subject)
    # np.fmax ignores NaN, so undefined correlations are floored to epsi as well.
    corr_pos=np.fmax(corr,epsi)
    # Closed form of the original pairwise loop. With c = corr_pos (> 0):
    #   Ric[i,j] = 2 - c_ij * (sum_k 1/sqrt(c_ij*c_ik) + sum_k 1/sqrt(c_ij*c_jk))
    #            = 2 - sqrt(c_ij) * (S_i + S_j),   where S_i = sum_k 1/sqrt(c_ik)
    # The two forms agree up to floating-point rounding.
    inv_sqrt_row_sum=(1/np.sqrt(corr_pos)).sum(axis=1)
    Ric=2-np.sqrt(corr_pos)*(inv_sqrt_row_sum[:,None]+inv_sqrt_row_sum[None,:])
    Ric_subjects.append(np.mean(Ric,axis=1))

In [None]:
# Average every ROI's curvature score over all subjects and keep the ROIs whose
# average is at least the median of those averages.
Ric_subjects_arr=np.array(Ric_subjects)
avg=Ric_subjects_arr.mean(axis=0)
usable_roi=avg>=np.median(avg)  # boolean mask with one entry per ROI

# Names of the retained ROIs (numbered ROI1..ROI116) and one 'ROIa-ROIb' label
# per unordered pair of retained ROIs, enumerated as (i, j) with i < j, i first.
ROI=np.array([f'ROI{i}' for i in range(1,117)])[usable_roi]
columns=[ROI[i]+'-'+ROI[j] for i in range(0,len(ROI)-1) for j in range(i+1,len(ROI))]

In [None]:
# One feature row per distinct target value: the subjects sharing that target
# are pooled, each retained ROI's samples are concatenated subject after
# subject, and the Pearson correlation is computed for every pair of retained
# ROIs (pairs enumerated as i < j, i first).
usable_idx=np.flatnonzero(usable_roi)
m_ricci = []
for target in tqdm(target_values):
    df_y = train_df[train_df['y']==target].iloc[:, 5:]
    group = df_y.to_numpy(dtype=float)
    # Slice and flatten each retained ROI block once, instead of re-slicing the
    # DataFrame for every pair inside the inner loop.
    roi_series = [group[:, i*115:(i+1)*115].ravel() for i in usable_idx]
    corr = [pearsonr(roi_series[a], roi_series[b])[0]
            for a in range(len(roi_series)-1)
            for b in range(a+1, len(roi_series))]
    m_ricci.append(corr)
m_ricci = np.array(m_ricci)


In [None]:
def get_corr(row,usable_roi,test=False):
    """Pairwise Pearson correlations between the retained ROIs of one subject.

    Parameters
    ----------
    row : pandas.Series or array-like
        One row of the train/test table. The leading entries (5 for a train
        row, 4 for a test row) are skipped; the remaining 116*115 values are
        read as 116 consecutive blocks of 115 samples, one block per ROI.
    usable_roi : sequence of bool
        Mask indexed by ROI position (0..115); only ROIs whose entry is truthy
        take part in the correlations.
    test : bool, default False
        Selects how many leading entries are skipped (4 if True, 5 otherwise).

    Returns
    -------
    list
        Correlation coefficient for every pair (i, j) of retained ROIs with
        i < j, ordered by i and then by j.

    Raises
    ------
    ValueError
        If the values left after skipping the leading entries cannot be
        converted to float or do not amount to exactly 116*115 numbers.
    """
    n_leading = 4 if test else 5
    # Slice by position explicitly; fall back to plain slicing for non-pandas input.
    values = row.iloc[n_leading:] if hasattr(row, 'iloc') else row[n_leading:]
    # Rows taken from a mixed-type DataFrame arrive as object dtype: cast once so
    # pearsonr always receives float arrays. The reshape also validates the length.
    brain = np.asarray(values, dtype=float).reshape(116, 115)
    kept = [brain[i] for i in range(116) if usable_roi[i]]
    return [pearsonr(kept[a], kept[b])[0]
            for a in range(len(kept)-1)
            for b in range(a+1, len(kept))]


In [None]:
# Training table: one row per distinct target value, one column per ROI pair,
# plus the target itself in column 'y'. Undefined correlations (NaN) become 0.
corr_df_train = (
    pd.DataFrame(data=m_ricci, columns=columns)
    .fillna(0)
    .assign(y=target_values)
)

In [None]:
# Sanity check: (rows, columns) of the aggregated training table.
corr_df_train.shape

In [None]:
# Display the aggregated training table for visual inspection.
corr_df_train

# Train

In [None]:
# Hold out 25% of the aggregated rows for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_all = corr_df_train.drop(columns='y')
y_all = corr_df_train['y']
X_train, X_test, y_train, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.25,
    random_state=42,
)

In [None]:
# Project the pairwise-correlation features onto 24 kernel principal components.
# The projection is fitted on the training split only and then applied to the
# hold-out split.
transformer = KernelPCA(
    n_components=24,
    kernel='poly',
    degree=3,
)
X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)

# (n_train_rows, n_components)
X_train_transformed.shape

## SVR

In [None]:
# Hyper-parameter grid explored for the polynomial-kernel SVR.
poly_svr_grid = {
    'kernel': ['poly'],
    'C': [1, 5, 10, 12, 15, 20, 25],
    'degree': [2, 3, 4],
    'coef0': [0.01, 0.02, 0.5],
    'gamma': ['auto', 'scale'],
    'epsilon': [0.02, 0.05, 0.1],
}
params = [poly_svr_grid]

In [None]:
# Exhaustive 5-fold cross-validated search over `params`, using all CPU cores.
# NOTE(review): no `scoring` is passed, so candidates are ranked with the
# estimator's default score, while the hold-out cell reports RMSE -- confirm
# that this mismatch is intended.
svr_reg = SVR()
grids = GridSearchCV(
    estimator=svr_reg,
    param_grid=params,
    cv=5,
    verbose=5,
    n_jobs=-1,
)
grids.fit(X_train_transformed, y_train)

In [None]:
# Hyper-parameter combination selected by the grid search.
grids.best_params_

In [None]:
# Root-mean-squared error of the selected model on the hold-out split.
# The square root is taken explicitly because the `squared=False` keyword of
# `mean_squared_error` is deprecated and removed in recent scikit-learn
# releases; this form works on old and new versions alike.
y_pred=grids.predict(X_test_transformed)
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Final model: refit the kernel PCA projection and an SVR with the selected
# hyper-parameters on every aggregated training row (no hold-out).
# `transformer` and `svr_reg` are rebound here and are the objects used on the
# test set below.
full_features = corr_df_train.drop(columns='y')
full_target = corr_df_train['y']

transformer = KernelPCA(
    n_components=24,
    kernel='poly',
    degree=3,
)
corr_df_train_transformed = transformer.fit_transform(full_features)

svr_reg = SVR(**grids.best_params_)
svr_reg.fit(corr_df_train_transformed, full_target)

# Test 

In [None]:
# Load the test table that the final model is applied to.
test_csv_path = '../input/statistical-learning-sapienza-spring-2021/test.csv'
test_df = pd.read_csv(test_csv_path)

In [None]:
def _test_row_features(row):
    """Correlation features of one test row, restricted to the retained ROIs."""
    return get_corr(row, usable_roi, test=True)


# One list of pairwise correlations per test subject, with a progress bar.
corr_test = test_df.progress_apply(_test_row_features, axis=1)

In [None]:
# Stack the per-subject correlation lists into a feature table. The columns get
# the same 'ROIa-ROIb' labels as the training table, so the fitted transformer
# sees matching feature names instead of anonymous integer labels. Passing
# `columns` also makes pandas raise if the number of features differs.
# Undefined correlations (NaN) become 0, as in the training table.
test_final_df=pd.DataFrame(np.array(corr_test.values.tolist()),columns=columns).fillna(0)

In [None]:
# Apply the projection fitted on the full training table, then predict the
# target with the final SVR.
test_final_df_transformed = transformer.transform(test_final_df)
y_pred_test = svr_reg.predict(test_final_df_transformed)

In [None]:
# Distribution of the predictions on the test set. The figure is labelled so it
# can be read on its own; the trailing semicolon hides the return-value repr.
fig, ax = plt.subplots()
ax.hist(y_pred_test)
ax.set(title='Predicted target on the test set', xlabel='predicted target', ylabel='count');

In [None]:
# Raw predictions for the test set.
y_pred_test

In [None]:
# Submission table: the test `id` column next to the predicted target.
df_leaderboard = pd.DataFrame(
    {
        'id': test_df['id'],
        'target': y_pred_test,
    }
)

In [None]:
# Write the submission without the index column.
# NOTE(review): the output name has no `.csv` extension -- confirm this is what
# the submission process expects.
df_leaderboard.to_csv(path_or_buf='G14_pca', index=False)