# DATA2 -  DESCRIPTIVE: LONG - TRAIN - TEST SETS

**Resources**
1. DATA
    1. [data/data2](https://github.com/MMIV-ML/MCI-subgroups/tree/master/data/data2) - csv files downloaded from ADNI,
    1. [data/data2_FS](https://github.com/MMIV-ML/MCI-subgroups/tree/master/data/data2_FS) - FS result csv files,
    1. [data_zip/20201109_data2_file_versions](https://github.com/MMIV-ML/MCI-subgroups/blob/master/data/data_zip/20201109_data2_file_versions.pdf) - download ADNI web page screen shoot taken for downaloaded csv files (some of file neames are printed with dates),
1. RESULTS
    1. [results/20201110](https://github.com/MMIV-ML/MCI-subgroups/tree/master/results/20201110) - result folder,
    1. [results/20201120/README.md](https://github.com/MMIV-ML/MCI-subgroups/blob/master/results/20201110/README.md) - short description,
1. GOOGLE DRIVE
    1. [slides](https://docs.google.com/presentation/d/1aEK7M5BPe0RxWYskzQCEDbT4Mf-4mRRqQ5uJ_YuqRzg/edit#slide=id.p) - link to google slides (**OUT OF DATE, MUST BE UPDATED !!!**),
    1. [GoogleDrive](https://drive.google.com/drive/folders/1r8l2R88-0T8Xahk30iAgjBCWXvV1R2J-) - main google drive slide folder,
1. GIT HUB
    1. [GitHub](https://github.com/MMIV-ML/MCI-subgroups) - main repo folder,
1. FEATURES
    1. Included features ([local](../results/20201110/features_included.csv), [GitHub](https://github.com/MMIV-ML/MCI-subgroups/blob/master/results/20201110/features_included.csv)) - a file with a feature list included in each subproject (sMCI-cAD, baseline) 
    1. Features to include ([local](../results/20201110/features_to_include.csv), [GitHub](https://github.com/MMIV-ML/MCI-subgroups/blob/master/results/20201110/features_to_include.csv))- a least of featrues from different csv files to combine with ADNIMERGE  



The latest changes (ver. 0.02):

    - extracted the `train` set from loaded csv files
    
   *Created: 2021.03.09 / Updated: 2021.03.31

---

#### IMPORTS

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

#### MCI MODULES

In [3]:
# our imports 
import mci_get as mget
import mci_info as minfo
import mci_utils as mutils

# DEFAULT VALUE: 6
pd.options.display.precision = 2
#pd.set_option('display.max_columns', None)

#### GLOBAL VARIABLES

In [4]:
# Notebook folder
NB_DIR = %pwd
NB_DIR = Path(NB_DIR)
# Root MCI foler
ROOT_DIR = NB_DIR.parent
# Data folder
DATA_DIR = ROOT_DIR/'data/data2'
# Results folder
RESULTS_DIR = ROOT_DIR/'results'
#RESULTS_DIR.mkdir(parents=True, exist_ok=True)

#### INSTALLED PACKAGES INFO

In [5]:
if 1:
    packages = ['numpy', 'scipy', 'seaborn', 'pandas', 'matplotlib', 'statsmodels', 'ipywidgets', 'eli5', 'pdpbox']
    display(mutils.package_versions(installedOnly=False, theMostImportant=packages))



Computer name: sony
Operating system: Linux, 64bit

Python path: /home/marek/miniconda3/envs/py37/bin/python
Python version: 3.7.7 (default, Mar 26 2020, 15:48:22) 
[GCC 7.3.0]



Unnamed: 0,module,version
4,eli5,0.11.0
6,ipywidgets,7.6.3
9,matplotlib,3.3.4
11,numpy,1.19.2
13,pandas,1.2.0
14,pdpbox,0.2.1
18,scipy,1.5.2
19,seaborn,0.11.1
22,statsmodels,0.11.1


---

# LOAD FILES

In [6]:
long_all = pd.read_csv(RESULTS_DIR / '20201110' / 'long.csv', index_col=0, low_memory=False)
bl_all = pd.read_csv(RESULTS_DIR / '20201110' / 'bl.csv', index_col=0, low_memory=False)
print(f'Rows:\n- long_all:\t{long_all.shape[0]}\n- bl_all: \t{bl_all.shape[0]}')

Rows:
- long_all:	6327
- bl_all: 	708


### SELECT TRAIN SUBSET 

In [7]:
long = long_all.loc[long_all.Usage_ == 'train']
bl = bl_all.loc[bl_all.Usage_ == 'train']
print(f'Rows:\n- long:\t{long.shape[0]}\n- bl:  \t{bl.shape[0]}')

Rows:
- long:	5021
- bl:  	566


### COLUMN SELECTION FROM ADNIMERGE

In [8]:
cols_scores = ['RID', 'AGE', 'PTGENDER', 'PTEDUCAT', 'ADAS13_adni', 'TOTAL13_adas']
cols_nrs = [ 'ADAS13_adni_Nr_', 'TOTAL13_adas_Nr_', 'MRIs_Nr_', 'Visits_Nr_', 'Age_rounded_', 'Participation_length_yr_']

cols = cols_scores + cols_nrs

---

# LONG & BASELINE (BL)

In [9]:
print(long.shape)
print(long.columns)

(5021, 88)
Index(['RID', 'PTID', 'PTGENDER', 'PTEDUCAT', 'EXAMDATE', 'AGE', 'Years_bl',
       'Month', 'Month_bl', 'DX', 'DX_bl', 'ORIGPROT', 'Phase', 'IMAGEUID',
       'ABETA', 'ADAS13_adni', 'APOE4', 'CDRSB', 'FAQ', 'LDELTOTAL', 'MMSE',
       'PTAU', 'PTETHCAT', 'PTRACCAT', 'RAVLT_forgetting', 'RAVLT_immediate',
       'RAVLT_learning', 'RAVLT_perc_forgetting', 'TAU', 'TRABSCOR_adni',
       'MERGE_long_adas', 'Q10_adas', 'Q11_adas', 'Q12_adas', 'Q13_adas',
       'Q1_adas', 'Q2_adas', 'Q3_adas', 'Q4_adas', 'Q5_adas', 'Q6_adas',
       'Q7_adas', 'Q8_adas', 'Q9_adas', 'TOTAL13_adas', 'VISCODE3_adas',
       'ANARTERR_neuro', 'AVDEL30MIN_neuro', 'AVDELTOT_neuro', 'AVTOT6_neuro',
       'AVTOTB_neuro', 'CATANIMSC_neuro', 'CLOCKSCOR_neuro', 'COPYSCOR_neuro',
       'EXAMDATE_neuro', 'MERGE_long_neuro', 'Phase_neuro', 'TRAASCOR_neuro',
       'TRABSCOR_neuro', 'VISCODE2_neuro', 'EXAMDATE_gds', 'GDTOTAL_gds',
       'MERGE_long_gds', 'Phase_gds', 'VISCODE2_gds',
       'Left-Lateral-Ve

In [10]:
minfo.included_feature_info(long, pattern='adni-adas-neuro-gds-cross-long-_')

Total number of columns: 88



Unnamed: 0,adni (#30),adas (#16),neuro (#14),gds (#5),long (#4),cross (#4),ours (#15)
0,ABETA,MERGE_long_adas,ANARTERR_neuro,EXAMDATE_gds,Left-Lateral-Ventricle_long,Left-Lateral-Ventricle_cross,ABETA_
1,ADAS13_adni,Q10_adas,AVDEL30MIN_neuro,GDTOTAL_gds,Right-Lateral-Ventricle_long,Right-Lateral-Ventricle_cross,ADAS13_adni_Nr_
2,AGE,Q11_adas,AVDELTOT_neuro,MERGE_long_gds,eTIV_x_long,eTIV_x_cross,Age_at_scan_
3,APOE4,Q12_adas,AVTOT6_neuro,Phase_gds,eTIV_y_long,eTIV_y_cross,Age_at_scan_rounded_
4,CDRSB,Q13_adas,AVTOTB_neuro,VISCODE2_gds,,,Age_bin_
5,DX,Q1_adas,CATANIMSC_neuro,,,,Age_rounded_
6,DX_bl,Q2_adas,CLOCKSCOR_neuro,,,,Idx_
7,EXAMDATE,Q3_adas,COPYSCOR_neuro,,,,Imageuid_
8,FAQ,Q4_adas,EXAMDATE_neuro,,,,MRIs_Nr_
9,IMAGEUID,Q5_adas,MERGE_long_neuro,,,,Participation_length_yr_


---

### LONG

In [11]:
minfo.df_info(long, 0, "Loaded 'long' file")

LOADED 'LONG' FILE:
	Rows(exams): 5021,
	Columns (features): 88,
	Patients number (unique RID): 566,
		Patients with at least one MRI image (MRIs): 566,
		Patients without any MRI image (MRIs): 0,
	MRI images (IMAGEUID): 3135.




In [12]:
#minfo.iterate_patient_GUI(long[cols])

In [13]:
#long[cols].describe()

### BL

In [14]:
minfo.df_info(long, 0, "Loaded 'long' file")

LOADED 'LONG' FILE:
	Rows(exams): 5021,
	Columns (features): 88,
	Patients number (unique RID): 566,
		Patients with at least one MRI image (MRIs): 566,
		Patients without any MRI image (MRIs): 0,
	MRI images (IMAGEUID): 3135.




---

# SOME STATISTICS

In [15]:
bl[cols].head(5)

Unnamed: 0,RID,AGE,PTGENDER,PTEDUCAT,ADAS13_adni,TOTAL13_adas,ADAS13_adni_Nr_,TOTAL13_adas_Nr_,MRIs_Nr_,Visits_Nr_,Age_rounded_,Participation_length_yr_
0,4,67.5,Male,10,21.33,21.33,5,5,5,6,68,3.03
13,30,80.0,Female,19,22.0,22.0,6,6,6,7,80,3.1
20,33,83.3,Male,20,25.67,25.67,6,6,6,6,83,3.09
26,38,76.8,Male,12,16.33,16.33,3,3,3,3,77,0.98
29,41,70.9,Female,14,28.33,28.33,7,7,7,8,71,4.17


In [16]:
bl[cols].describe()

Unnamed: 0,RID,AGE,PTEDUCAT,ADAS13_adni,TOTAL13_adas,ADAS13_adni_Nr_,TOTAL13_adas_Nr_,MRIs_Nr_,Visits_Nr_,Age_rounded_,Participation_length_yr_
count,566.0,566.0,566.0,565.0,565.0,566.0,566.0,566.0,566.0,566.0,566.0
mean,2420.82,73.48,15.88,17.22,17.22,6.29,6.29,5.54,8.88,73.46,4.72
std,1777.29,7.28,2.85,6.67,6.67,2.42,2.42,1.6,3.78,7.28,2.77
min,4.0,55.0,6.0,3.0,3.0,2.0,2.0,3.0,3.0,55.0,0.53
25%,856.25,68.62,14.0,12.0,12.0,5.0,5.0,4.0,6.0,69.0,2.71
50%,2081.0,74.0,16.0,17.0,17.0,6.0,6.0,5.0,8.0,74.0,4.01
75%,4391.25,79.07,18.0,22.0,22.0,7.0,7.0,6.0,11.0,79.0,6.79
max,5099.0,91.4,20.0,38.0,38.0,15.0,15.0,12.0,21.0,91.0,13.85


### SUBGROUP

In [17]:
df1 = bl.groupby(['Subgroup_'])
df1[cols].mean()

Unnamed: 0_level_0,RID,AGE,PTEDUCAT,ADAS13_adni,TOTAL13_adas,ADAS13_adni_Nr_,TOTAL13_adas_Nr_,MRIs_Nr_,Visits_Nr_,Age_rounded_,Participation_length_yr_
Subgroup_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
cAD,2051.16,73.9,15.95,20.49,20.49,6.59,6.59,5.81,9.33,73.87,4.94
sMCI,2737.15,73.12,15.83,14.44,14.44,6.04,6.04,5.31,8.5,73.1,4.53


In [18]:
df1[cols].std()

Unnamed: 0_level_0,RID,AGE,PTEDUCAT,ADAS13_adni,TOTAL13_adas,ADAS13_adni_Nr_,TOTAL13_adas_Nr_,MRIs_Nr_,Visits_Nr_,Age_rounded_,Participation_length_yr_
Subgroup_,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
cAD,1789.5,6.92,2.77,6.16,6.16,2.23,2.23,1.59,3.84,6.93,2.68
sMCI,1707.09,7.57,2.92,5.77,5.77,2.55,2.55,1.58,3.68,7.55,2.83


In [19]:
df1.size()

Subgroup_
cAD     261
sMCI    305
dtype: int64

### SUBGROUP & GENDER

In [20]:
df2 = bl.groupby(['Subgroup_','PTGENDER'])
df2[cols].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,RID,AGE,PTEDUCAT,ADAS13_adni,TOTAL13_adas,ADAS13_adni_Nr_,TOTAL13_adas_Nr_,MRIs_Nr_,Visits_Nr_,Age_rounded_,Participation_length_yr_
Subgroup_,PTGENDER,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
cAD,Female,2305.05,72.49,15.35,21.3,21.3,6.31,6.31,5.54,8.88,72.47,4.78
cAD,Male,1885.66,74.81,16.34,19.96,19.96,6.77,6.77,5.98,9.63,74.78,5.04
sMCI,Female,2867.39,72.1,15.39,13.1,13.1,5.99,5.99,5.13,8.35,72.09,4.42
sMCI,Male,2650.33,73.8,16.12,15.34,15.34,6.07,6.07,5.43,8.59,73.78,4.6


In [21]:
df2[cols].std()

Unnamed: 0_level_0,Unnamed: 1_level_0,RID,AGE,PTEDUCAT,ADAS13_adni,TOTAL13_adas,ADAS13_adni_Nr_,TOTAL13_adas_Nr_,MRIs_Nr_,Visits_Nr_,Age_rounded_,Participation_length_yr_
Subgroup_,PTGENDER,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
cAD,Female,1790.61,7.26,2.69,6.54,6.54,1.85,1.85,1.37,3.41,7.3,2.49
cAD,Male,1774.91,6.55,2.76,5.86,5.86,2.44,2.44,1.7,4.09,6.55,2.8
sMCI,Female,1665.55,7.41,2.99,5.62,5.62,2.47,2.47,1.37,3.41,7.39,2.81
sMCI,Male,1733.32,7.61,2.84,5.71,5.71,2.61,2.61,1.71,3.86,7.61,2.86


In [22]:
df2.size()

Subgroup_  PTGENDER
cAD        Female      103
           Male        158
sMCI       Female      122
           Male        183
dtype: int64

---

### FIND MISSING VALUES. SMALLER THAN ZERO: (-1,-2,-3,-4)

In [23]:
df = long.select_dtypes(include=['float', 'int'])
for c in df.columns:
    mn = df[c].min()
    if mn < 0:
        print(c, df[c].min())

RAVLT_forgetting -35.0
RAVLT_learning -5.0
RAVLT_perc_forgetting -1166.67


In [30]:
#df.loc[df.TOTAL13_adas == -4, ['RID', 'TOTAL13_adas']]

In [31]:
#long.RAVLT_perc_forgetting.value_counts()