# Data analysis - `scikit-learn`

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import zscore

# Create dataset

In [None]:
def read_dataset(filename, outlier_thr=5):
    """Load a covariates CSV, discard outlier subjects and z-score the rest.

    Parameters
    ----------
    filename : str or path-like
        CSV file holding a ``PAC_ID`` column plus numeric columns.
    outlier_thr : float, optional
        A subject (row) is discarded when the absolute z-score of any of
        its values exceeds this threshold. Defaults to 5.

    Returns
    -------
    data : pandas.DataFrame
        Column-wise z-scored values of the retained subjects, with a fresh
        0..n-1 index. The columns at positions 0, 2 and 4 are re-coded as
        nominal values (1/2 for positions 0 and 2; 1..k for position 4).
    sub_id : pandas.Series
        ``PAC_ID`` of the retained subjects. It keeps the row labels of the
        CSV, so its index does not match the fresh index of ``data``.
    """
    # Read csv file
    df = pd.read_csv(filename)

    # Separate the subject identifier from the measurements.
    # `columns=` is used because the positional `axis` argument of
    # DataFrame.drop is no longer accepted by pandas >= 2.0.
    sub_id = df['PAC_ID']
    df = df.drop(columns='PAC_ID')
    header = df.columns
    values = df.to_numpy(dtype=float)

    # Clean dataset - drop subjects with values above `outlier_thr` STD
    outliers = (np.abs(zscore(values)) > outlier_thr).any(axis=1)
    print('%d outliers detected.' % outliers.sum())
    # Boolean masking selects by position, so it does not depend on the
    # frame's index labels the way dropping by label would.
    keep = ~outliers
    data = values[keep]
    sub_id = sub_id[keep]

    # zscore data (recomputed on the retained subjects only)
    data = zscore(data)

    # Reset the columns at positions 0, 2 and 4 to nominal values
    # (Label, Gender and Scanner in the PAC2018 covariates table).
    # Two-level columns: below/at the mean -> 1, above the mean -> 2.
    data[:, 0] = (data[:, 0] > 0) + 1
    data[:, 2] = (data[:, 2] > 0) + 1
    # Multi-level column: rank of each value among the sorted unique
    # values, shifted so that the codes start at 1.
    data[:, 4] = np.unique(data[:, 4], return_inverse=True)[1] + 1

    return pd.DataFrame(data, columns=header), sub_id

In [None]:
# Load the covariates table; subjects with any |z-score| above 5 are dropped
# and the remaining values are z-scored (see `read_dataset`).
data, sub_id = read_dataset('data/PAC2018_Covariates_detailed.csv', outlier_thr=5)
# Preview the first rows of the cleaned table
data.head()

8 outliers detected.


Unnamed: 0,Label,Age,Gender,TIV,Scanner,Tvoxels,Tmean,Tmedian,Tstd,Tmax,...,Right_Cerebral_White_Matter,Right_Cerebral_Cortex,Right_Lateral_Ventricle,Right_Thalamus,Right_Caudate,Right_Putamen,Right_Pallidum,Right_Hippocampus,Right_Amygdala,Right_Accumbens
0,1.0,1.610405,1.0,1.479924,2.0,-0.890455,1.091014,1.080367,1.034026,0.060878,...,1.09375,1.094903,1.037705,-0.896948,-0.25515,-0.72469,-0.269052,1.791264,1.591691,0.091529
1,1.0,-1.146076,1.0,-0.052883,1.0,1.134709,-0.118805,-0.147065,0.290905,0.904814,...,-0.060199,-0.013714,0.079673,0.634875,1.097503,0.176912,-0.143525,0.223282,0.283818,-0.11087
2,1.0,-0.200997,2.0,-0.322187,2.0,-0.910314,0.213228,0.466651,-0.390523,0.488056,...,0.167886,0.162364,0.443724,0.107536,-0.337023,-0.458773,-0.199329,0.879846,0.603997,0.182364
3,1.0,-0.200997,1.0,1.526994,3.0,-0.888128,1.178024,1.141739,1.173458,0.863138,...,1.022908,1.00087,2.157792,1.573471,2.83514,1.080137,0.815276,0.185643,0.730864,2.0102
4,1.0,2.004188,2.0,-0.93437,1.0,1.02757,-2.074238,-2.049584,-2.070608,-0.824733,...,-2.148225,-2.165083,-1.193547,-1.593558,-2.270355,-1.290505,-1.337832,-1.919742,-2.127827,-1.856324


# `sklearn`

In [None]:
from sklearn.model_selection import cross_val_score

In [None]:
def crossvalidation(data, clf):
    """Print mean cross-validated scores of a classifier on scanner subsets.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a ``Label`` column (prediction target) and a ``Scanner``
        column (used to select subsets). All other columns, ``Scanner``
        included, are used as features.
    clf : type
        Estimator class (not an instance). It is instantiated here with
        ``class_weight="balanced"``, so it must accept that keyword.

    Returns
    -------
    None
        Results are printed: for 3, 6 and 9 folds, one line per subset
        query with the mean score in percent, rounded to 3 decimals.
    """
    # Keep the class and the instance under separate names instead of
    # rebinding the `clf` argument.
    estimator = clf(class_weight="balanced")

    for n_folds in [3, 6, 9]:
        print('CV: %02d' % n_folds)

        # NOTE(review): if Scanner is coded from 1 upward, 'Scanner!=0'
        # keeps every row - confirm this is the intended "all data" case.
        for query in ['Scanner!=0', 'Scanner!=1', 'Scanner==1']:

            subset = data.query(query)
            y = subset['Label']
            # `columns=` is used because the positional `axis` argument of
            # DataFrame.drop is no longer accepted by pandas >= 2.0.
            X = subset.drop(columns='Label').to_numpy()

            # No `scoring` is passed, so the estimator's own default scorer
            # is used (accuracy, per the original comment).
            res = cross_val_score(estimator, X, y, cv=n_folds)
            print('\t%s\t%s' % (query, np.round(res.mean() * 100, 3)))

In [None]:
# Linear support-vector classifier, evaluated with the 3/6/9-fold
# scheme of `crossvalidation` (instantiated there with balanced class weights).
from sklearn.svm import LinearSVC
crossvalidation(data, LinearSVC)

CV: 03
	Scanner!=0	60.033
	Scanner!=1	54.704
	Scanner==1	57.04
CV: 06
	Scanner!=0	61.937
	Scanner!=1	56.561
	Scanner==1	58.496
CV: 09
	Scanner!=0	60.812
	Scanner!=1	53.763
	Scanner==1	56.915


In [None]:
# Nu-support-vector classifier, evaluated with the 3/6/9-fold
# scheme of `crossvalidation` (instantiated there with balanced class weights).
from sklearn.svm import NuSVC
crossvalidation(data, NuSVC)

CV: 03
	Scanner!=0	60.258
	Scanner!=1	54.914
	Scanner==1	65.483
CV: 06
	Scanner!=0	59.866
	Scanner!=1	55.329
	Scanner==1	65.843
CV: 09
	Scanner!=0	59.811
	Scanner!=1	55.225
	Scanner==1	65.486


In [None]:
# Logistic regression, evaluated with the 3/6/9-fold
# scheme of `crossvalidation` (instantiated there with balanced class weights).
from sklearn.linear_model import LogisticRegression
crossvalidation(data, LogisticRegression)

CV: 03
	Scanner!=0	59.978
	Scanner!=1	55.426
	Scanner==1	55.938
CV: 06
	Scanner!=0	61.601
	Scanner!=1	55.217
	Scanner==1	59.109
CV: 09
	Scanner!=0	61.318
	Scanner!=1	54.066
	Scanner==1	57.405


In [None]:
# Random forest, evaluated with the 3/6/9-fold scheme of `crossvalidation`
# (instantiated there with balanced class weights). No random_state is
# passed, so the printed scores may differ between runs.
from sklearn.ensemble import RandomForestClassifier
crossvalidation(data, RandomForestClassifier)

CV: 03
	Scanner!=0	60.594
	Scanner!=1	57.908
	Scanner==1	69.277
CV: 06
	Scanner!=0	60.983
	Scanner!=1	53.049
	Scanner==1	70.015
CV: 09
	Scanner!=0	61.541
	Scanner!=1	56.451
	Scanner==1	70.014
