## Can Alzheimer be predicted?

### 1. Sourcing and Loading 


#### 1.1. Importing Libraries

In [22]:
# import the pandas, numpy libraries as pd, and np respectively. 
import pandas as pd
import numpy as np

# Load the pyplot collection of functions from matplotlib, as plt 
import matplotlib.pyplot as plt

#### 1.2.  Loading the data
This MRI data sets has been taken from Open Access Series of Imaging Studies (OASIS)
which is a project aimed at making MRI data sets of the brain freely available to the
scientific community. OASIS is made available by the Washington University Alzheimer’s
Disease Research Center, Dr. Randy Buckner at the Howard Hughes Medical Institute (HHMI)
(at Harvard University, the Neuroinformatics Research Group (NRG) at Washington University
School of Medicine,and the Biomedical Informatics Research Network (BIRN).

In [23]:
# First, make a variable called Cross-sec, and assign it to cross-sectional 
# collection of 416 subjects
Cross_sec = pd.read_csv('Data/oasis_cross-sectional.csv', index_col= None)
# Second, make a second variable called Long-sec, and assign it to longitudinal
#collection of 150 subjects
Long_sec = pd.read_csv('Data/oasis_longitudinal.csv', index_col= None)

### 2. Cleaning, transforming, and visualizing

**2.1. Exploring the data** 


In [24]:
Cross_sec.head()

Unnamed: 0,ID,M/F,Hand,Age,Educ,SES,MMSE,CDR,eTIV,nWBV,ASF,Delay
0,OAS1_0001_MR1,F,R,74,2.0,3.0,29.0,0.0,1344,0.743,1.306,
1,OAS1_0002_MR1,F,R,55,4.0,1.0,29.0,0.0,1147,0.81,1.531,
2,OAS1_0003_MR1,F,R,73,4.0,3.0,27.0,0.5,1454,0.708,1.207,
3,OAS1_0004_MR1,M,R,28,,,,,1588,0.803,1.105,
4,OAS1_0005_MR1,M,R,18,,,,,1737,0.848,1.01,


In [25]:
Long_sec.head()

Unnamed: 0,Subject ID,MRI ID,Group,Visit,MR Delay,M/F,Hand,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,ASF
0,OAS2_0001,OAS2_0001_MR1,Nondemented,1,0,M,R,87,14,2.0,27.0,0.0,1987,0.696,0.883
1,OAS2_0001,OAS2_0001_MR2,Nondemented,2,457,M,R,88,14,2.0,30.0,0.0,2004,0.681,0.876
2,OAS2_0002,OAS2_0002_MR1,Demented,1,0,M,R,75,12,,23.0,0.5,1678,0.736,1.046
3,OAS2_0002,OAS2_0002_MR2,Demented,2,560,M,R,76,12,,28.0,0.5,1738,0.713,1.01
4,OAS2_0002,OAS2_0002_MR3,Demented,3,1895,M,R,80,12,,22.0,0.5,1698,0.701,1.034


**2.2. Cleaning the data**

In [26]:
Cross_sec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 436 entries, 0 to 435
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   ID      436 non-null    object 
 1   M/F     436 non-null    object 
 2   Hand    436 non-null    object 
 3   Age     436 non-null    int64  
 4   Educ    235 non-null    float64
 5   SES     216 non-null    float64
 6   MMSE    235 non-null    float64
 7   CDR     235 non-null    float64
 8   eTIV    436 non-null    int64  
 9   nWBV    436 non-null    float64
 10  ASF     436 non-null    float64
 11  Delay   20 non-null     float64
dtypes: float64(7), int64(2), object(3)
memory usage: 41.0+ KB


In [27]:
Long_sec.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 373 entries, 0 to 372
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Subject ID  373 non-null    object 
 1   MRI ID      373 non-null    object 
 2   Group       373 non-null    object 
 3   Visit       373 non-null    int64  
 4   MR Delay    373 non-null    int64  
 5   M/F         373 non-null    object 
 6   Hand        373 non-null    object 
 7   Age         373 non-null    int64  
 8   EDUC        373 non-null    int64  
 9   SES         354 non-null    float64
 10  MMSE        371 non-null    float64
 11  CDR         373 non-null    float64
 12  eTIV        373 non-null    int64  
 13  nWBV        373 non-null    float64
 14  ASF         373 non-null    float64
dtypes: float64(5), int64(5), object(5)
memory usage: 43.8+ KB


#### Check if data are all for right hand or left hand or both? 

In [28]:
Cross_sec['Hand'].unique()

array(['R'], dtype=object)

In [29]:
Long_sec['Hand'].unique()

array(['R'], dtype=object)

So all the data are from right hand poeple, thefore we do not need to keep this column

In [30]:
Cross_sec = Cross_sec.drop(['Hand'], axis = 1)

In [31]:
Long_sec = Long_sec.drop(['Hand'], axis = 1)

In [32]:
Long_sec['M/F'].value_counts()

F    213
M    160
Name: M/F, dtype: int64

#### Counts the number of subjects scanned for Longitudinal and cross sectional 

In [33]:
Cross_sec['ID'].value_counts().sum()

436

In [34]:
Long_sec['Subject ID'].value_counts().sum()

373

#### Renaming the similar columns in both group to the same name 

In [35]:
Cross_sec.columns

Index(['ID', 'M/F', 'Age', 'Educ', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF',
       'Delay'],
      dtype='object')

In [36]:
Long_sec.columns

Index(['Subject ID', 'MRI ID', 'Group', 'Visit', 'MR Delay', 'M/F', 'Age',
       'EDUC', 'SES', 'MMSE', 'CDR', 'eTIV', 'nWBV', 'ASF'],
      dtype='object')

In [42]:
Cross_sec = Cross_sec.rename(columns={'ID':'Subject ID'})

In [44]:
Long_sec = Long_sec.rename(columns={'EDUC':'Educ'})

#### Finding the number of Null values for differnt features 

In [48]:
Cross_sec.isna().sum()

Subject ID      0
M/F             0
Age             0
Educ          201
SES           220
MMSE          201
CDR           201
eTIV            0
nWBV            0
ASF             0
Delay         416
dtype: int64

In [49]:
Long_sec.isna().sum()

Subject ID     0
MRI ID         0
Group          0
Visit          0
MR Delay       0
M/F            0
Age            0
Educ           0
SES           19
MMSE           2
CDR            0
eTIV           0
nWBV           0
ASF            0
dtype: int64

#### Filling the NAN value with the median for each column

In [58]:
Cross_sec['Educ'].fillna(Cross_sec['Educ'].median(), inplace=True)

In [60]:
Cross_sec['SES'].fillna(Cross_sec['SES'].median(), inplace=True)

In [62]:
Long_sec['SES'].fillna(Long_sec['SES'].median(), inplace=True)

In [61]:
Cross_sec = Cross_sec.drop(['Delay'], axis = 1)