In [81]:
import pandas as pd
from sklearn.model_selection import train_test_split

#### Data Pre-processing
1. Select only data in the United States from the original dataset
2. Reset index, rename columns, and look through the dataframe
3. Store the data in the United States into a csv file
4. Split the U.S. dataset into training and testing dataset (1000:250), others stored to use for demo

##### 1. Select only data in the United States from the original dataset

In [82]:
data = pd.read_csv('alzheimers_prediction_dataset.csv')
usa = data[data['Country'] == 'USA']
print(len(data[data['Country'] == 'USA']))

3616


##### 2. Reset index, rename columns, and look through the dataframe

In [83]:
# Check dataset
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.width', 100)
print('--------- Head of the dataframe ---------\n', usa.head())
print('\n-------- The shape of the usa dataset --------\n', usa.shape)
print('\n-------- Columns of the usa dataset --------\n',usa.columns)

--------- Head of the dataframe ---------
    Country  Age  Gender  Education Level   BMI Physical Activity Level Smoking Status Alcohol Consumption Diabetes Hypertension Cholesterol Level Family History of Alzheimer’s  Cognitive Test Score Depression Level Sleep Quality Dietary Habits Air Pollution Exposure Employment Status Marital Status Genetic Risk Factor (APOE-ε4 allele) Social Engagement Level Income Level Stress Levels Urban vs Rural Living Alzheimer’s Diagnosis
16     USA   61  Female               13  33.1                  Medium         Former               Never       No          Yes              High                            No                    51             High          Good        Healthy                   High          Employed        Married                                   No                    High          Low           Low                 Urban                    No
75     USA   55    Male               16  29.9                  Medium         Former        

In [84]:
# Reset index
usa.reset_index(drop = True, inplace = True)
usa.index = range(1, len(usa) + 1) 

In [85]:
# Rename column: 'Alzheimer’s Diagnosis'
usa.rename(columns = {"Alzheimer’s Diagnosis": "Alzheimer's Diagnosis"}, inplace = True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  usa.rename(columns = {"Alzheimer’s Diagnosis": "Alzheimer's Diagnosis"}, inplace = True)


##### 3. Store the data in the United States into a csv file

In [86]:
usa.to_csv('alzheimers_prediction_dataset_usa.csv') # replace the old csv with the new one if the file already exists

##### 4. Split the U.S. dataset into training and testing dataset (1000:250), others stored to use for demo

In [87]:
print(usa.groupby("Alzheimer's Diagnosis").size())

Alzheimer's Diagnosis
No     2211
Yes    1405
dtype: int64


In [95]:
train_df, remaining_df = train_test_split(usa, train_size = 1000, stratify = usa['Alzheimer\'s Diagnosis'], random_state = 33)
test_df, demo_df = train_test_split(remaining_df, train_size = 250, stratify = remaining_df['Alzheimer\'s Diagnosis'], random_state = 33)

In [96]:
print("-------- Original dataset's yes/no count --------")
print(usa['Alzheimer\'s Diagnosis'].value_counts(normalize=True))

print("\n-------- Training dataset's yes/no count --------")
print(train_df['Alzheimer\'s Diagnosis'].value_counts(normalize=True))

print("\n-------- Testing dataset's yes/no count --------")
print(test_df['Alzheimer\'s Diagnosis'].value_counts(normalize=True))

-------- Original dataset's yes/no count --------
Alzheimer's Diagnosis
No     0.611449
Yes    0.388551
Name: proportion, dtype: float64

-------- Training dataset's yes/no count --------
Alzheimer's Diagnosis
No     0.611
Yes    0.389
Name: proportion, dtype: float64

-------- Testing dataset's yes/no count --------
Alzheimer's Diagnosis
No     0.612
Yes    0.388
Name: proportion, dtype: float64


In [97]:
print("Size of training set", train_df.shape[0])
print("Size of testing set:", test_df.shape[0])
print("Size of remaining dataset for demo", demo_df.shape[0])

Size of training set 1000
Size of testing set: 250
Size of remaining dataset for demo 2366


In [98]:
train_df.to_csv('training_dataset.csv')
test_df.to_csv('testing_dataset.csv')
demo_df.to_csv('demo_dataset.csv')