# Scaling and Encoding OASIS 1

##  Imports

In [1]:
# DATA MANIPULATION
import pandas as pd
import numpy as np

# DATA VISUALISATION
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

# MACHINE LEARNING
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

###  Dataset

In [2]:
oasis1_cs_path = '../raw_data/OASIS1/oasis_cross-sectional.csv'

In [3]:
oasis1 = pd.read_csv(oasis1_cs_path)

Note to self:


**CDR;** 0= nondemented; 0.5 – very mild dementia; 1 = mild dementia; 2 = moderate dementia

**eTIV;** Estimated total intracranial volume (eTIV)

**nWBV** Normalized whole brain volume

**ASF;** Atlas scaling factor


In [4]:
oasis1.sample(20)

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
249,OAS1_0277_MR1,M,R,22,,,,,1913,0.841,0.917,
105,OAS1_0113_MR1,F,R,83,2.0,2.0,29.0,0.0,1569,0.768,1.118,
297,OAS1_0329_MR1,F,R,80,2.0,3.0,29.0,0.5,1209,0.76,1.451,
206,OAS1_0227_MR1,F,R,26,,,,,1288,0.777,1.362,
153,OAS1_0165_MR1,F,R,74,2.0,3.0,29.0,0.0,1395,0.787,1.258,
240,OAS1_0267_MR1,M,R,80,5.0,2.0,28.0,0.5,1506,0.679,1.166,
314,OAS1_0348_MR1,F,R,22,,,,,1473,0.841,1.191,
302,OAS1_0335_MR1,F,R,80,1.0,4.0,27.0,0.5,1654,0.678,1.061,
331,OAS1_0367_MR1,F,R,46,2.0,2.0,28.0,0.0,1161,0.841,1.511,
404,OAS1_0446_MR1,F,R,80,2.0,4.0,30.0,0.0,1390,0.748,1.263,


## Scaling Numerical Features

 Note: nWBV already normalized
 
**Columns to scale:**

Standard Scale: age, ASF

Robust Scale: eTIV

In [5]:
# Instantiating scalers

standard_scaler = StandardScaler()
robust_scaler = RobustScaler()

In [6]:
features_standard = ["Age"]
features_robust = ["eTIV", "MMSE"]

In [7]:
# Creating a copy of the original dataset
oasis1_scaled = oasis1.copy()

In [8]:
oasis1_scaled[features_standard] = standard_scaler.fit_transform(oasis1_scaled[features_standard])
oasis1_scaled

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
0,OAS1_0001_MR1,F,R,0.897045,2.0,3.0,29.0,0.0,1344,0.743,1.306,
1,OAS1_0002_MR1,F,R,0.144298,4.0,1.0,29.0,0.0,1147,0.810,1.531,
2,OAS1_0003_MR1,F,R,0.857427,4.0,3.0,27.0,0.5,1454,0.708,1.207,
3,OAS1_0004_MR1,M,R,-0.925396,,,,,1588,0.803,1.105,
4,OAS1_0005_MR1,M,R,-1.321579,,,,,1737,0.848,1.010,
...,...,...,...,...,...,...,...,...,...,...,...,...
431,OAS1_0285_MR2,M,R,-1.242342,,,,,1469,0.847,1.195,2.0
432,OAS1_0353_MR2,M,R,-1.163106,,,,,1684,0.790,1.042,40.0
433,OAS1_0368_MR2,M,R,-1.163106,,,,,1580,0.856,1.111,89.0
434,OAS1_0379_MR2,F,R,-1.242342,,,,,1262,0.861,1.390,2.0


In [9]:
oasis1_scaled[features_robust] = standard_scaler.fit_transform(oasis1_scaled[features_robust])

## Scaling Categorical Features

In [10]:
# This code is precautionary and drops NAN in CDR columns. However, all NAN are supposed to have been dropped already.
oasis1.dropna(subset = ["CDR"], inplace=True)
oasis1_scaled.dropna(subset = ["CDR"], inplace=True)

In [11]:
ohe = OneHotEncoder(drop='if_binary', sparse = False)

ohe.fit(oasis1[['CDR']])

oasis1_scaled['nondemented'], oasis1_scaled['very_mild_dementia'], oasis1_scaled['mild_dementia'], oasis1_scaled['moderate_dementia'] = ohe.transform(oasis1_scaled[['CDR']]).T

In [12]:
oasis1.CDR.unique()

array([0. , 0.5, 1. , 2. ])

## Transforming other features

In [13]:
# Female: 0
# Male: 1
oasis1_scaled['M/F'].replace('F', 0,inplace=True)
oasis1_scaled['M/F'].replace('M', 1,inplace=True)

In [14]:
oasis1_scaled

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay,nondemented,very_mild_dementia,mild_dementia,moderate_dementia
0,OAS1_0001_MR1,0,R,0.897045,2.0,3.0,0.524850,0.0,-0.869834,0.743,1.306,,1.0,0.0,0.0,0.0
1,OAS1_0002_MR1,0,R,0.144298,4.0,1.0,0.524850,0.0,-2.112276,0.810,1.531,,1.0,0.0,0.0,0.0
2,OAS1_0003_MR1,0,R,0.857427,4.0,3.0,-0.017303,0.5,-0.176084,0.708,1.207,,0.0,1.0,0.0,0.0
8,OAS1_0010_MR1,1,R,0.897045,5.0,2.0,0.795927,0.0,0.971755,0.689,1.073,,1.0,0.0,0.0,0.0
9,OAS1_0011_MR1,0,R,0.025443,3.0,2.0,0.795927,0.0,-1.014890,0.827,1.329,,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,OAS1_0453_MR1,0,R,0.738572,1.0,4.0,0.524850,0.5,-1.178868,0.748,1.355,,0.0,1.0,0.0,0.0
412,OAS1_0454_MR1,0,R,0.857427,3.0,2.0,-1.101609,0.5,0.341074,0.730,1.142,,0.0,1.0,0.0,0.0
413,OAS1_0455_MR1,0,R,0.382008,2.0,4.0,0.253774,0.0,-0.806766,0.825,1.297,,1.0,0.0,0.0,0.0
414,OAS1_0456_MR1,1,R,0.382008,5.0,2.0,0.795927,0.0,0.978062,0.780,1.072,,1.0,0.0,0.0,0.0


In [15]:
oasis1_scaled.SES.value_counts()

2.0    65
1.0    50
3.0    49
4.0    49
5.0     3
Name: SES, dtype: int64