In [43]:
import pandas as pd
import numpy as np
from plotnine import *
%matplotlib inline
from sklearn.neighbors import KNeighborsRegressor as knn
from sklearn.model_selection import train_test_split

In [44]:
# Load the housing data and keep only the rows that have no missing values.
df = pd.read_csv("HousingData.csv").dropna()
df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
5,0.02985,0.0,2.18,0.0,0.458,6.430,58.7,6.0622,3,222,18.7,394.12,5.21,28.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,0.17783,0.0,9.69,0.0,0.585,5.569,73.5,2.3999,6,391,19.2,395.77,15.10,17.5
500,0.22438,0.0,9.69,0.0,0.585,6.027,79.7,2.4982,6,391,19.2,396.90,14.33,16.8
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9


In [45]:
# Hold out 20% of the housing rows as the test split; the rows are shuffled
# with a fixed seed so the split is the same on every run.
train, test = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

# Report how many rows ended up in the full frame and in each split.
print("The size of the housing dataset", len(df))
print("The size of the train dataset", len(train))
print("The size of the test dataset", len(test))

The size of the housing dataset 394
The size of the train dataset 315
The size of the test dataset 79


In [46]:
# general function for fitting testing and training data and scoring each parameter
def test_and_train(columnName: str, neighbors=range(5, 79)):
    """Score single-feature KNN regressors over a range of neighbor counts.

    For every ``k`` in ``neighbors`` a KNN regressor is fit on
    ``train[[columnName]]`` against ``train['MEDV']`` and then scored with the
    estimator's ``score`` method on both the ``train`` and ``test`` frames
    (module-level globals created by the split cell). The two score curves
    are plotted against ``k``, and the best test score together with the
    matching row(s) of neighbor counts is printed.

    Note: the "best" ``k`` is picked by looking at the test split, so the
    printed score is an optimistic estimate of out-of-sample performance.

    Parameters
    ----------
    columnName : str
        Name of the single predictor column to use.
    neighbors : iterable of int, optional
        Values of ``n_neighbors`` to evaluate. Defaults to ``range(5, 79)``,
        the range that was previously hard-coded.

    Returns
    -------
    None
        Results are displayed/printed rather than returned.

    Raises
    ------
    ValueError
        If ``neighbors`` is empty.
    """
    # Imported locally: the notebook already depends on IPython (it uses the
    # %matplotlib magic), and display() is what makes the plot render here.
    from IPython.display import display

    neighbor_counts = list(neighbors)
    if not neighbor_counts:
        raise ValueError("neighbors must contain at least one value")

    columnScore = columnName + "score"
    features_train, target_train = train[[columnName]], train['MEDV']
    features_test, target_test = test[[columnName]], test['MEDV']

    scoretrain = []
    scoretest = []
    for k in neighbor_counts:
        # Fit once per k and reuse the fitted model for both splits; the
        # previous version refit an identical model for each score.
        model = knn(n_neighbors=k).fit(features_train, target_train)
        scoretrain.append(model.score(features_train, target_train))
        scoretest.append(model.score(features_test, target_test))

    trainscore = pd.DataFrame({columnScore: scoretrain,
                               'datsplit': 'Training',
                               'neighbors': neighbor_counts})
    testscore = pd.DataFrame({columnScore: scoretest,
                              'datsplit': 'Testing',
                              'neighbors': neighbor_counts})

    score_df = pd.concat([trainscore, testscore], axis=0)

    score_plot = (ggplot(score_df, aes(x='neighbors', y=columnScore, color='datsplit'))
                  + geom_line()
                  + scale_color_manual(values=['darkorange', 'steelblue'])
                  + theme_bw())
    # A bare expression inside a function is never auto-rendered by the
    # notebook, so the figure has to be displayed explicitly.
    display(score_plot)

    best_score = testscore[columnScore].max()
    print(best_score)
    # Prints every row (index label and neighbor count) that ties for the best score.
    print(testscore.loc[testscore[columnScore] == best_score, 'neighbors'])



In [47]:
# Run the train/test scoring helper once for every column of the housing frame.
for col in df.columns:
    # MEDV is the value being predicted, so it is skipped as a predictor.
    if col == "MEDV":
        continue
    print(col)
    print("Score:")
    test_and_train(col)
    print()
    
    

CRIM
Score:
0.0899974304372847
25    30
Name: neighbors, dtype: int64

ZN
Score:
0.1821375168483852
2    7
Name: neighbors, dtype: int64

INDUS
Score:
0.168892755861085
39    44
Name: neighbors, dtype: int64

CHAS
Score:
-0.0062210639736766105
14    19
Name: neighbors, dtype: int64

NOX
Score:
0.20289382077450124
11    16
Name: neighbors, dtype: int64

RM
Score:
0.49355856382443675
6    11
Name: neighbors, dtype: int64

AGE
Score:
0.047869097113035
38    43
Name: neighbors, dtype: int64

DIS
Score:
0.03228823742219944
0    5
Name: neighbors, dtype: int64

RAD
Score:
0.01720385985625239
8    13
Name: neighbors, dtype: int64

TAX
Score:
0.16613552666279718
2    7
Name: neighbors, dtype: int64

PTRATIO
Score:
0.14659558791086713
26    31
Name: neighbors, dtype: int64

B
Score:
0.09446814346576782
24    29
Name: neighbors, dtype: int64

LSTAT
Score:
0.4565072688915064
18    23
Name: neighbors, dtype: int64



## Final Results
Based on these test-set scores, the average number of rooms per dwelling (RM) is the best single-feature predictor of MEDV (test score ≈ 0.49), and the best-performing hyperparameter for it is 11 neighbors.

In [49]:
# Read the Parkinson's data file into its own frame and display it.
df1 = pd.read_csv(filepath_or_buffer="parkinsons.data")
df1

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,0.00007,0.00370,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.400,148.650,113.819,0.00968,0.00008,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.335590,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.01050,0.00009,0.00544,0.00781,0.01633,0.05233,...,0.08270,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,0.00009,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.10470,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.332180,0.410335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,phon_R01_S50_2,174.188,230.978,94.261,0.00459,0.00003,0.00263,0.00259,0.00790,0.04087,...,0.07008,0.02764,19.517,0,0.448439,0.657899,-6.538586,0.121952,2.657476,0.133050
191,phon_R01_S50_3,209.516,253.017,89.488,0.00564,0.00003,0.00331,0.00292,0.00994,0.02751,...,0.04812,0.01810,19.147,0,0.431674,0.683244,-6.195325,0.129303,2.784312,0.168895
192,phon_R01_S50_4,174.688,240.005,74.287,0.01360,0.00008,0.00624,0.00564,0.01873,0.02308,...,0.03804,0.10715,17.883,0,0.407567,0.655683,-6.787197,0.158453,2.679772,0.131728
193,phon_R01_S50_5,198.764,396.961,74.904,0.00740,0.00004,0.00370,0.00390,0.01109,0.02296,...,0.03794,0.07223,19.020,0,0.451221,0.643956,-6.744577,0.207454,2.138608,0.123306
