In [None]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

In [None]:
# Install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib

In [None]:
# Import dependencies
import pandas as pd
import matplotlib.pyplot as plt

 ## Read the CSV and Perform Basic Data Cleaning    

In [None]:
# Load the CSV, discard columns in which every value is null,
# then discard every row that still contains a null.
df = (
    pd.read_csv("../Resources/Autism.csv")
    .dropna(axis="columns", how="all")
    .dropna()
)
df.head()

In [None]:
# Do not truncate the column list when a frame is displayed
pd.set_option("display.max_columns", None)

# Replace string-valued columns with indicator (dummy) columns
df = pd.get_dummies(df)
df.head()

## Select ML features    

In [None]:
# Features: start with every column except 'Case No', 'Score'
# and the two 'Class' indicator columns (those are the target).
X = df.drop(['Class_NO', 'Class_YES', 'Case No', 'Score'], axis=1)

# Target: both 'Class' indicator columns
y = df.loc[:, ['Class_NO', 'Class_YES']]
print(X.shape, y.shape)

In [None]:
# Reduced feature set: run this cell after the grid search, editing the column list as needed.
# NOTE: these are the features ultimately selected after running ExtraTreesClassifier
# (in the cell below the grid search).
# NOTE(review): the train/test split further down is built from X, not Xsel -- confirm which is intended.
Xsel = df.loc[:, ['A5', 'A9', 'A6', 'A3', 'A4', 'A1', 'A2', 'A7', 'A10', 'A8']]
y = df.loc[:, ['Class_NO', 'Class_YES']]
print(Xsel.shape, y.shape)

## Create a Train Test Split     
*Use 'Class' for the y values*

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split into train and test sets, stratified on y, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)

## Pre-processing     
*Scale the data using the MinMaxScaler*

In [None]:
# Scale the data
from sklearn.preprocessing import MinMaxScaler

# Features: fit the scaler on the training split only, then apply it to both splits
X_scaler = MinMaxScaler()
X_train_scaled = X_scaler.fit_transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Targets: same procedure
y_scaler = MinMaxScaler()
y_train_scaled = y_scaler.fit_transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [None]:
# Loop through different k values to see which has the highest accuracy

# KNN
from sklearn.neighbors import KNeighborsClassifier

# Candidate neighbour counts, defined once so the loop and the plot cannot drift apart
k_values = range(1, 100, 10)

# Training and testing scores, one entry per candidate k
train_scores = []
test_scores = []

for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train_scaled, y_train_scaled)
    train_score = knn.score(X_train_scaled, y_train_scaled)
    test_score = knn.score(X_test_scaled, y_test_scaled)
    train_scores.append(train_score)
    test_scores.append(test_score)

# Label both curves; without a legend the training and testing lines cannot be told apart
plt.plot(k_values, train_scores, marker='o', label="Training")
plt.plot(k_values, test_scores, marker="x", label="Testing")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy score")
plt.legend()
plt.show()

In [None]:
# k=21 looked like the best choice for this dataset in the sweep above

# Build the classifier with that neighbour count
knn = KNeighborsClassifier(n_neighbors=21)

# Train it; the value returned by fit() is kept in `gs` for later cells
gs = knn.fit(X_train_scaled, y_train_scaled)

# Score on the held-out split
k21_test_accuracy = knn.score(X_test_scaled, y_test_scaled)
print('k=21 Test Acc: %.3f' % k21_test_accuracy)

In [None]:
# Score the fitted k=21 classifier on each split, then report both numbers
knn_train_accuracy = knn.score(X_train_scaled, y_train_scaled)
knn_test_accuracy = knn.score(X_test_scaled, y_test_scaled)
print(f"Training Data Score: {knn_train_accuracy}")
print(f"Testing Data Score: {knn_test_accuracy}")

In [None]:
# Print off parameters for the KNN model
# `gs` holds the value returned by knn.fit(...) in the k=21 cell; despite the
# name it is not the GridSearchCV object (that one is `grid`, created below).
print(gs)

In [None]:
# Create the GridSearchCV model
from sklearn.model_selection import GridSearchCV

# Candidate values for each KNN hyperparameter to search over
param_grid = {
    'leaf_size': [1, 2, 5, 30, 36],
    'n_neighbors': [11, 19, 21, 23, 51],
    'p': [1, 2],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
}

# Wrap the KNN estimator in a grid search with verbose output
grid = GridSearchCV(estimator=knn, param_grid=param_grid, verbose=3)

In [None]:
# Train the model with GridSearch
# Only the scaled training split is passed in; the test split is not used here.
# `grid` was built with verbose=3, so expect verbose output while this cell runs.
grid.fit(X_train_scaled,y_train_scaled)

In [None]:
# Gridsearch Results
# best_params_ is the parameter combination that scored best during the search.
print(grid.best_params_)
print(grid.best_score_) # mean cross-validated score of the best parameter set on the TRAINING data (not the held-out test score)

In [None]:
# Look at feature importances
# KNN doesn't have a feature importances, so we'll use ExtraTreesClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Fix the seed (same value as the train/test split) so the importance ranking
# is reproducible from run to run; the forest is randomized otherwise.
model = ExtraTreesClassifier(random_state=1)
model.fit(X_train_scaled, y_train_scaled)

# Importances are returned in column order, so label them with the columns of
# the frame the split was built from (X). If the split is switched to the
# reduced feature set, this index must be switched to match.
feat_importances = pd.Series(model.feature_importances_, index=X.columns)
feat_importances.nlargest(60)

In [None]:
# Classification Report
from sklearn.metrics import classification_report

# NOTE(review): `gs` is the value returned by knn.fit(...) for the k=21 model,
# not the GridSearchCV object `grid` -- confirm the report is meant to describe
# the untuned model.
predictions = gs.predict(X_test_scaled)
report = classification_report(y_test_scaled, predictions)
print(report)

# OBSERVATIONS

Before doing any feature selection, performed pd.get_dummies and kept all inputs except 'Score', 'Class', and 'Case No'. K=21 appears to be where the graph levels off, so using that value, got an accuracy score on the testing data of .925     
Training data came back at ~.947  After gridsearch, best parameters were: {'algorithm': 'auto', 'leaf_size': 1, 'n_neighbors': 51, 'p': 1, 'weights': 'distance'}       

'Best testing score': 0.942652329749104     

CLASSIFICATION REPORT:      
              
                  precision    recall  f1-score   support        

           0       0.97      0.92      0.94       190
           1       0.85      0.93      0.89        90      

       micro avg       0.93      0.93      0.93       280       
       macro avg       0.91      0.93      0.92       280       
    weighted avg       0.93      0.93      0.93       280         
     samples avg       0.93      0.93      0.93       280       


--------
Second go-round, going to keep questions plus age, sex, family history, and jaundice (ranked in that order on feature selection).  Ethnicity ranks high, but I'm going to leave that out.  I did look at the percentages of autism within the various ethnicities, which can be found in 'Stats_Notebooks', 'Other_Stats'.  For feature selection, I'm only going to focus on questions, age, jaundice, family history, and sex.       

So with only questions, age, sex, family history, and jaundice.  Got better results.  Again, graph seems to even out around k=21.  Accuracy improved to 0.954  Training score of ~.9618     
Best parameters: {'algorithm': 'auto', 'leaf_size': 1, 'n_neighbors': 51, 'p': 2, 'weights': 'distance'}       

'Best testing score': 0.965352449223417     

CLASSIFICATION REPORT:      

                 precision    recall  f1-score   support

           0       1.00      0.93      0.96       190
           1       0.87      1.00      0.93        90

       micro avg       0.95      0.95      0.95       280
       macro avg       0.94      0.97      0.95       280
    weighted avg       0.96      0.95      0.95       280
     samples avg       0.95      0.95      0.95       280     
     
 --------   
Third go-round: now going to remove jaundice.  K still seems to level off around 21.  Accuracy of .964, so a full percent higher than with jaundice.  Training score only slightly higher at .965  With gridsearch, again only a small boost:     
{'algorithm': 'auto', 'leaf_size': 1, 'n_neighbors': 19, 'p': 2, 'weights': 'distance'}     

'Best testing score': 0.966547192353644     

CLASSIFICATION REPORT:  
             
                 precision    recall  f1-score   support

           0       1.00      0.95      0.97       190
           1       0.90      1.00      0.95        90

        micro avg       0.96      0.96      0.96       280
       macro avg       0.95      0.97      0.96       280
     weighted avg       0.97      0.96      0.96       280
      samples avg       0.96      0.96      0.96       280     
 -------     


Fourth go-round: removing family history now.  Again, k levels off around 21.  No change in testing accuracy, still at .964, the same number with family history.  Training data made a 'significant' jump to ~.976  After gridsearch:     
{'algorithm': 'ball_tree', 'leaf_size': 30, 'n_neighbors': 19, 'p': 2, 'weights': 'distance'}     

'Best testing score': 0.973715651135006 -- jumped from .964, so not bad.  Interesting to note that the algorithm changed to ball_tree, and leaf_size to 30. Neighbors also changed to 19 (from 51) for the last two.       

CLASSIFICATION REPORT:     
              
                 precision    recall  f1-score   support

           0       0.99      0.95      0.97       190
           1       0.91      0.99      0.95        90

       micro avg       0.96      0.96      0.96       280
       macro avg       0.95      0.97      0.96       280
    weighted avg       0.97      0.96      0.96       280
     samples avg       0.96      0.96      0.96       280

---------
Fifth go-round: removing sex. K at 21, testing accuracy at .968 so not a huge difference from the 3rd and 4th go-rounds. But training data jumped to ~.986 which again is another full percent from when sex was an included input.  After gridsearch:  {'algorithm': 'brute', 'leaf_size': 1, 'n_neighbors': 19, 'p': 2, 'weights': 'distance'}     

'Best testing score': 0.986857825567503 --testing score jumped roughly 2% from before the hyperparameter tuning.  Algorithm changed to 'brute', which is a first.  Notice weights has never changed.      

CLASSIFICATION REPORT:     
             
                  precision    recall  f1-score   support

           0       1.00      0.95      0.98       190
           1       0.91      1.00      0.95        90

       micro avg       0.97      0.97      0.97       280
       macro avg       0.95      0.98      0.96       280
    weighted avg       0.97      0.97      0.97       280
     samples avg       0.97      0.97      0.97       280
--------     
Final go-round, removing age.  K again at 21, testing accuracy at .979 which is the highest of all tests.  Training data at ~.981  After gridsearch:  {'algorithm': 'auto', 'leaf_size': 5, 'n_neighbors': 21, 'p': 2, 'weights': 'distance'}     

'Best testing score': 0.986857825567503 --same testing score with and without age. Change in algorithm, leaf size, neighbors.     

CLASSIFICATION REPORT:       
                   
                   precision    recall  f1-score   support

           0       0.98      0.98      0.98       190
           1       0.97      0.97      0.97        90

      micro avg       0.98      0.98      0.98       280
      macro avg       0.98      0.98      0.98       280
    weighted avg       0.98      0.98      0.98       280
    samples avg       0.98      0.98      0.98       280



## Observations on the classification reports above
DEFINITIONS:     
<b>precision</b> is the ability of the classifier not to label as positive a sample that is negative     
<b>recall</b> is the ability of the classifier to find all the positive samples     
<b>F1 score</b> also known as balanced F-score or F-measure.  It's the harmonic mean of the precision and recall scores, with a best score of 1, worst of 0.     
      
*Note that in binary classification, recall of the positive class is also known as “sensitivity”; recall of the negative class is “specificity”.*

NOTE: I don't fully understand the various averages.  Here's what I have so far:     
<b>macro average</b> (averaging the unweighted mean per label);     
<b>weighted average</b> (averaging the support-weighted mean per label);     
<b>sample average</b> (only for multilabel classification);     
<b>micro average</b> (averaging the total true positives, false negatives and false positives) is only shown for multi-label or multi-class with a subset of classes, because it corresponds to accuracy otherwise.        
     
---------     
### <u>F1 score</u> ###       
The gap in F1 score between label '0' and '1' was smallest (1%) in the final go-round, where only the questions were kept.  The largest gap came before any feature selection, where I only removed 'Case No', 'Class', and 'Score'.  That gap was 5%.  All other input changes resulted in a gap of 2-3%. Also, the F1 scores were always highest for the '0' label.  Class 'NO' (meaning no autism) has 190 instances versus 90 for Class 'YES', so perhaps that explains why the F1 score is higher for the '0' label across the board.

### <u>Precision</u> ### 
Precision is the ability of the classifier not to label a negative sample as positive (of all samples predicted as a class, the fraction that truly belong to it).  Precision was higher across the board for a classification of No (=0), which makes sense since that sample size is double the yes pool.  The precision for the 'No' classification was pretty steady, varying between .97 and 1, whereas the precision for the 'Yes' classification ranged from .85 to .97.      

The precision of the '1' label didn't improve until the last feature selection tweak.  It jumped from 0.91 to .97 in the last feature selection, where we removed all except for the questions.  Can we draw any conclusions from this?  Can we safely assume that the discrepancy between labels is due to sample size? I'm unsure.     

### <u>Recall</u> ###
Precision and recall are related, recall being the ability to find all the samples that truly belong to a class.  For recall, the 'yes' classification scored higher in every go-round except the final one (0.97 for 'yes' versus 0.98 for 'no').  So in most runs the model was better at catching the cases where a person DOES have autism.   

### <u>Averages</u> ###
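Doing gridsearch, the best weights parameter came back as 'distance.' Does this have any relation to the averages in the classification report?  No: `weights='distance'` only controls how KNN weights each neighbor's vote (closer neighbors count more), whereas the macro/weighted/micro/samples averages are computed afterwards from the per-label precision, recall and F1 values.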