In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for folder, _, files_here in os.walk('/kaggle/input'):
    for file_name in files_here:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

from IPython.core.interactiveshell import InteractiveShell 
# Display the value of every top-level expression in a cell, not only the last
# one; several cells below rely on this to show two results from a single cell.
InteractiveShell.ast_node_interactivity = "all"

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split,KFold,cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,classification_report

# Seed NumPy's global random generator so a fresh top-to-bottom run is repeatable
np.random.seed(21)

## Loading Train, Validate and Test Data 

In [None]:
# Locations of the labelled training file and the unlabelled competition test file
train_csv = '../input/autism-prediction/Autism-prediction/train.csv'
test_csv = '../input/autism-prediction/Autism-prediction/test.csv'

data_train = pd.read_csv(train_csv)
data_test = pd.read_csv(test_csv)

## Detailed EDA

In [None]:
#Test and Train Dataset Shape
# (rows, columns) of each frame; both values are displayed because
# ast_node_interactivity is set to "all" in the setup cell.
data_train.shape
data_test.shape

There are 800 datapoints with 21 features and 1 target variable in the train dataset.
There are 200 datapoints with 21 features in the test dataset

In [None]:
# Preview the first five rows of each frame to compare their columns
data_train.head()
data_test.head()

In [None]:
# Number of columns per dtype in the training data
data_train.dtypes.value_counts()
# Count of missing values in each training column
data_train.isna().sum()

From above we can see that, there are -
- 12 int type features
- 8 object type features
- 2 float type features

**Class/ASD** is the Target Variable.
And no NaN Values in the dataset.

In [None]:
# Class balance of the target: number of training rows per 'Class/ASD' value
class_counts = data_train['Class/ASD'].value_counts()
class_counts.plot.bar()

From above output, we can say that the data is biased.

As we have 615 datapoints for Class '0' (No) and only 185 datapoints for Class '1' (Yes), we can say that the data is Imbalanced.

And before building the model we will have to oversample the minority Class or apply SMOTE techniques. (I will cover this in my next notebook along with Model Building)


Lets look at the top and bottom 5 rows of the train data, to understand the features 
and there possible impact on the model.

In [None]:
# First and last five rows of the training data. The accompanying text asks for
# both the top and the bottom rows, so tail() is shown alongside head(); both are
# displayed because ast_node_interactivity is set to "all" in the setup cell.
data_train.head()
data_train.tail()

nan

In [None]:
# First five rows of the int64-typed columns of the training data
data_train.select_dtypes(include=['int64']).head()

From above we can see that, all the 'int64' type features have binary values (0 or 1), except for ID, which is unique identification for patient. 
Since it is not required for model building. Let us drop it from both train and test datasets.


In [None]:
# 'ID' only identifies a row, so it is removed from the model inputs.
# drop() is used without inplace=True, and errors='ignore' makes the cell safe
# to re-run: a second execution no longer raises KeyError for the missing column.
data_train = data_train.drop(columns='ID', errors='ignore')
# data_test itself keeps 'ID' because the submission cell reads data_test['ID'].
test_df = data_test.drop(columns='ID')

Let's look at the A1 to A10 Scores and there possible relation with target column 'Class/ASD'.

In [None]:
# Mean of 'Class/ASD' (the fraction of class-1 rows) for each A1_Score value
asd_rate_by_a1 = data_train.groupby('A1_Score')['Class/ASD'].mean()
asd_rate_by_a1.plot.bar()

Patients with higer score are more likely to have Autism.

In [None]:
nan

In [None]:
# First five rows of the object-typed (string) columns of the training data
data_train.select_dtypes(include=['object']).head()

From above output we can see that- 

**gender,used_app_before** doesn't seem to be features with any importance, lets drop these columns.

**jaundice, autism** are the binary variables - We will have to encode these before bulding the model.

**ethnicity,contry_of_res,age_desc,relation** are nominal variables



In [None]:
# Remove the two columns judged uninformative. Non in-place drops with
# errors='ignore' keep the cell re-runnable: the original raised KeyError on a
# second execution because the columns were already gone.
unused_columns = ['gender', 'used_app_before']
data_train = data_train.drop(columns=unused_columns, errors='ignore')
test_df = test_df.drop(columns=unused_columns, errors='ignore')

Let us analyze the distribution of **ethnicity, contry_of_res and age_desc**

In [None]:
# Number of training rows for each ethnicity value
data_train['ethnicity'].value_counts().plot(kind='bar')

In [None]:
# Mean of 'Class/ASD' per ethnicity, sorted ascending so the bars are ordered
asd_rate_by_ethnicity = data_train.groupby('ethnicity')['Class/ASD'].mean()
asd_rate_by_ethnicity.sort_values().plot.bar()

There are 151 rows with '?' as the value. We will have to impute this before building the model. 
Also, Majority of the 'White-European' are diagnosed with Autism.

In [None]:
# Training rows per country of residence
data_train['contry_of_res'].value_counts()
# Rows whose country is the '?' placeholder, counted per distinct row
unknown_country = data_train['contry_of_res'] == '?'
data_train[unknown_country].value_counts()

In [None]:
# Number of training rows for each age_desc value
data_train['age_desc'].value_counts().plot(kind='bar')

**age_desc** column has 0 variance. Hence, this doesn't add any value to model building and can be dropped before building the model. 

In [None]:
# Remove 'age_desc' from both frames. Non in-place drops with errors='ignore'
# keep the cell re-runnable: the original raised KeyError on a second execution
# because the column was already gone.
data_train = data_train.drop(columns='age_desc', errors='ignore')
test_df = test_df.drop(columns='age_desc', errors='ignore')

In [None]:
# Number of training rows for each 'relation' value
relation_counts = data_train['relation'].value_counts()
relation_counts.plot.bar()

In [None]:
# Mean of 'Class/ASD' for each 'relation' value, with a title on the current axes
data_train.groupby('relation')['Class/ASD'].mean().plot(kind='bar')
plt.title('Relation of patient who completed the test')

There are 77 rows with '?' as the value. We will have to impute this (with mode) before building the model. Let us impute '?' with Others.
Note - I think this feature is not important for model building, as we already have 'austim' column to checkl for immediate family member. There fore, once I will build the model including this feature and then will check my model performance without this feature.

In [None]:
# Disabled experiment, kept for reference: fold the '?' / 'others' ethnicity
# labels into 'Others'. It has no effect on the model while the 'ethnicity'
# column is dropped further down in this notebook.
#data_train.loc[data_train.ethnicity == '?', 'ethnicity'] = 'Others'
#test_df.loc[test_df.ethnicity == 'others', 'ethnicity'] = 'Others'

In [None]:
# Mean of 'Class/ASD' for each value of the 'austim' column
asd_rate_by_family_history = data_train.groupby('austim')['Class/ASD'].mean()
asd_rate_by_family_history.plot.bar()
plt.title('Imediate family member diagnosed')

In [None]:
# Mean of 'Class/ASD' for each value of the 'jaundice' column
data_train.groupby('jaundice')['Class/ASD'].mean().plot(kind='bar')
plt.title('Whether the patient had jaundice at the time of birth')

In [None]:
# Remove the remaining nominal columns from both frames so only numeric and
# yes/no features are left for the model. Non in-place drops with
# errors='ignore' keep the cell re-runnable: the original raised KeyError on a
# second execution because the columns were already gone.
nominal_columns = ['ethnicity', 'contry_of_res', 'relation']
data_train = data_train.drop(columns=nominal_columns, errors='ignore')
test_df = test_df.drop(columns=nominal_columns, errors='ignore')

nan

From the above EDA it is clear that -
- Person with higher scores are likely to have autism.
- Person who had jaundice at the time of brith are more likely to have autism.
- Person whoes immediate family member has autism is more likely to have autism.

## Modelling 


nan

#### First we will create a model with the imbalanced data

In [None]:
# Turn the yes/no text columns into 1.0 / 0.0 in both the training and test frames
mapping = {'yes': 1, 'no': 0}

for data in (data_train, test_df):
    for column in ('jaundice', 'austim'):
        encoded = data[column].replace(mapping)
        data[column] = encoded.astype(float)

In [None]:
# Features are every remaining column except the label; the target is the label
target_column = 'Class/ASD'
X = data_train.drop(columns=[target_column])
y = data_train[target_column]

In [None]:
# Report the dimensions of the feature matrix and the target vector
shape_message = "Shape of X is {}, and shape of y is {}"
print(shape_message.format(X.shape, y.shape))

In [None]:
# Hold out 30% of the labelled rows for evaluation.
# stratify=y keeps the class ratio of 'Class/ASD' the same in both parts, which
# matters because the classes are imbalanced (see the class-count plot above).
# random_state pins the split so re-running this cell gives the same rows
# instead of depending on how much of the global NumPy RNG was already consumed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=21
)

In [None]:
# Random forest with 150 trees. random_state is set explicitly (same value as
# the global seed in the setup cell) so the fitted forest is identical every
# time this cell and the fit cell are re-run, regardless of earlier RNG usage.
rfc_model = RandomForestClassifier(n_estimators=150, random_state=21)

In [None]:
# Train the forest on the training part of the split
rfc_model.fit(X=X_train, y=y_train)

In [None]:
# Predicted classes for the held-out part of the split
y_pred_test = rfc_model.predict(X=X_test)

In [None]:
# Evaluate on the held-out split: accuracy first, then the confusion matrix
# and the per-class precision/recall/F1 report
holdout_accuracy = accuracy_score(y_test, y_pred_test)
print("Accuracy score of RFC model on test dataset is : ")
print(holdout_accuracy)
for summary in (confusion_matrix, classification_report):
    print(summary(y_test, y_pred_test))

In [None]:
# Predict the competition test set.
# - The result gets its own name instead of overwriting y_pred_test, so the
#   evaluation cell can still be re-run against the hold-out predictions.
# - test_df is re-indexed with X.columns so the features are passed in exactly
#   the order the model was fitted on (a missing column fails loudly here).
submission_pred = rfc_model.predict(test_df[X.columns])

# Submission frame: the original IDs (still present in data_test) next to the
# predicted class, built in a single step.
pred_df = pd.DataFrame({
    'ID': data_test['ID'],
    'Class/ASD': submission_pred,
})

# Write the submission file into the working directory, without the row index
pred_df.to_csv('submission.csv', index=False)