# Import libraries

In [1]:
# General
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Classification
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier

# Regression
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV,ElasticNet,LogisticRegression
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

# Modelling Helpers:
from sklearn.preprocessing import Imputer, Normalizer, scale
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score, ShuffleSplit, cross_validate

# Preprocessing
from sklearn.preprocessing import MinMaxScaler, StandardScaler, Imputer, LabelEncoder

# Evaluation metrics for Regression 
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score, mean_absolute_error
# Evaluation metrics for Classification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Reached only if every import above succeeded, so this doubles as a quick environment check.
print("Setup complete...")

Setup complete...


# Load data

In [6]:
# Read the dataset from a CSV given by a relative path (resolved against the working directory).
raw_csv_data = pd.read_csv('Toddler Autism dataset July 2018.csv')
# Display (rows, columns) as a quick sanity check on the load.
raw_csv_data.shape

(1054, 19)

# Data Preparation

In [7]:
# Work on a copy so the frame loaded from disk stays untouched for reference.
data = raw_csv_data.copy()

In [9]:
# Summarize column names, non-null counts and dtypes before any cleaning.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1054 entries, 0 to 1053
Data columns (total 19 columns):
Case_No                   1054 non-null int64
A1                        1054 non-null int64
A2                        1054 non-null int64
A3                        1054 non-null int64
A4                        1054 non-null int64
A5                        1054 non-null int64
A6                        1054 non-null int64
A7                        1054 non-null int64
A8                        1054 non-null int64
A9                        1054 non-null int64
A10                       1054 non-null int64
Age_Mons                  1054 non-null int64
Qchat-10-Score            1054 non-null int64
Sex                       1054 non-null object
Ethnicity                 1054 non-null object
Jaundice                  1054 non-null object
Family_mem_with_ASD       1054 non-null object
Who completed the test    1054 non-null object
Class/ASD Traits          1054 non-null object
dtypes: int64

### Remove unnecessary columns

In [10]:
# Remove the case identifier, the respondent field and the Q-CHAT-10 total
# from the working frame.
# NOTE(review): 'Qchat-10-Score' is presumably derived from A1..A10, which
# would make it redundant with those columns — confirm against the dataset docs.
#
# The frame is rebuilt from the untouched raw data instead of being dropped
# in place: an in-place drop raises KeyError when the cell is re-run because
# the columns are already gone, whereas this form is idempotent.
data = raw_csv_data.drop(columns=['Case_No', 'Who completed the test', 'Qchat-10-Score'])

In [11]:
# Preview the first rows to confirm the columns were removed.
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Sex,Ethnicity,Jaundice,Family_mem_with_ASD,Class/ASD Traits
0,0,0,0,0,0,0,1,1,0,1,28,f,middle eastern,yes,no,No
1,1,1,0,0,0,1,1,0,0,0,36,m,White European,yes,no,Yes
2,1,0,0,0,0,0,1,1,0,1,36,m,middle eastern,yes,no,Yes
3,1,1,1,1,1,1,1,1,1,1,24,m,Hispanic,no,no,Yes
4,1,1,0,1,1,1,1,1,1,1,20,f,White European,no,yes,Yes


### Ethnicity

In [13]:
# List the distinct ethnicity labels present in the data.
data['Ethnicity'].unique()

array(['middle eastern', 'White European', 'Hispanic', 'black', 'asian',
       'south asian', 'Native Indian', 'Others', 'Latino', 'mixed',
       'Pacifica'], dtype=object)

In [14]:
# Count rows per ethnicity label to see how the categories are distributed.
data['Ethnicity'].value_counts()

White European    334
asian             299
middle eastern    188
south asian        60
black              53
Hispanic           40
Others             35
Latino             26
Pacifica            8
mixed               8
Native Indian       3
Name: Ethnicity, dtype: int64

In [16]:
# One-hot encode Ethnicity: one indicator column per distinct label.
ethnicity_columns = pd.get_dummies(data['Ethnicity'])
ethnicity_columns.head()

Unnamed: 0,Hispanic,Latino,Native Indian,Others,Pacifica,White European,asian,black,middle eastern,mixed,south asian
0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0


In [17]:
# Sanity check: every row should have exactly one active dummy.
# The per-row total is stored as a temporary 'check' column on
# ethnicity_columns; re-running this cell while 'check' is still present
# would add 'check' itself into the total.
ethnicity_columns['check'] = ethnicity_columns.sum(axis='columns')
# First line printed: grand total of the per-row sums.
# Second line printed: the distinct per-row sums.
print(
    ethnicity_columns['check'].sum(axis=0),
    ethnicity_columns['check'].unique(),
    sep='\n',
)

1054
[1]


In [18]:
# Remove the temporary 'check' helper so only the dummy columns remain.
ethnicity_columns = ethnicity_columns.drop(columns=['check'])
ethnicity_columns.head()

Unnamed: 0,Hispanic,Latino,Native Indian,Others,Pacifica,White European,asian,black,middle eastern,mixed,south asian
0,0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,0


In [19]:
# Rebuild the dummies with drop_first=True, which omits the first category's
# column so that k labels are represented by k-1 indicators. This replaces
# whatever ethnicity_columns held before.
ethnicity_columns = pd.get_dummies(data['Ethnicity'], drop_first = True)
ethnicity_columns.head()

Unnamed: 0,Latino,Native Indian,Others,Pacifica,White European,asian,black,middle eastern,mixed,south asian
0,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,1,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0


In [20]:
# Drop the raw categorical Ethnicity column from the working frame; its
# information is carried by the indicator columns in ethnicity_columns.
data = data.drop(columns=['Ethnicity'])
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,Age_Mons,Sex,Jaundice,Family_mem_with_ASD,Class/ASD Traits
0,0,0,0,0,0,0,1,1,0,1,28,f,yes,no,No
1,1,1,0,0,0,1,1,0,0,0,36,m,yes,no,Yes
2,1,0,0,0,0,0,1,1,0,1,36,m,yes,no,Yes
3,1,1,1,1,1,1,1,1,1,1,24,m,no,no,Yes
4,1,1,0,1,1,1,1,1,1,1,20,f,no,yes,Yes


In [21]:
# Append the ethnicity indicator columns to the right of the existing
# columns; rows are matched on the index.
data = pd.concat([data, ethnicity_columns], axis='columns')
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,Latino,Native Indian,Others,Pacifica,White European,asian,black,middle eastern,mixed,south asian
0,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,1,0,0
1,1,1,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,1,0,0
3,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,1,1,1,1,1,1,1,...,0,0,0,0,1,0,0,0,0,0


### Label encoding

In [22]:
# Integer-encode the remaining string-valued columns.
# A single LabelEncoder is re-fitted for each column, so after the loop `le`
# only remembers the classes of the last column processed.
le = LabelEncoder()
columns = [
    'Family_mem_with_ASD',
    # NOTE(review): the trailing space is part of the column key used here —
    # presumably it mirrors the CSV header; do not strip it without confirming.
    'Class/ASD Traits ',
    'Sex',
    'Jaundice',
]
for col in columns:
    encoded = le.fit_transform(data[col])
    data[col] = encoded
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,Latino,Native Indian,Others,Pacifica,White European,asian,black,middle eastern,mixed,south asian
0,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,1,0,0
1,1,1,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,1,0,0
3,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,1,1,1,1,1,1,1,...,0,0,0,0,1,0,0,0,0,0


### Standardize data (z-score scaling)

In [28]:
# Standardize Age_Mons with StandardScaler (centred on the mean, scaled to
# unit variance). StandardScaler is already imported in the notebook's first
# cell, so it is not re-imported here.
#
# NOTE(review): the scaler is fitted on every row of `data`. If a train/test
# split happens later, fit on the training rows only to avoid leaking test
# statistics — confirm against the downstream modelling code.
scaler = StandardScaler()
# Double brackets keep a 2-D single-column frame, which is what the scaler expects.
# Despite its name, x_scaled holds the *unscaled* input.
x_scaled = data[['Age_Mons']]
# fit_transform learns the mean/std and applies them in one call
# (equivalent to fit() followed by transform() on the same data).
scaled_features = scaler.fit_transform(x_scaled)
scaled_features[0:5]

array([[ 0.01665219],
       [ 1.01958993],
       [ 1.01958993],
       [-0.48481667],
       [-0.98628554]])

In [29]:
# Replace the raw ages with their standardized values. scaled_features is
# 2-D, so its first (only) column is taken to obtain a 1-D vector.
data = data.assign(Age_Mons=scaled_features[:, 0])
data.head()

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,...,Latino,Native Indian,Others,Pacifica,White European,asian,black,middle eastern,mixed,south asian
0,0,0,0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,1,0,0
1,1,1,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,0,0,0,0,1,1,0,1,...,0,0,0,0,0,0,0,1,0,0
3,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,1,1,0,1,1,1,1,1,1,1,...,0,0,0,0,1,0,0,0,0,0


In [36]:
# Persist the preprocessed frame to a CSV given by a relative path.
# NOTE(review): with the default settings the row index is written as the
# first column; a plain pd.read_csv on this file will surface it as an extra
# unnamed column. Confirm how downstream code reads the file before switching
# to index=False.
data.to_csv('df-preprocessed.csv')