In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score,  GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

In [None]:
df =  pd.read_csv('possum.csv')

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   case      104 non-null    int64  
 1   site      104 non-null    int64  
 2   Pop       104 non-null    object 
 3   sex       104 non-null    object 
 4   age       102 non-null    float64
 5   hdlngth   104 non-null    float64
 6   skullw    104 non-null    float64
 7   totlngth  104 non-null    float64
 8   taill     104 non-null    float64
 9   footlgth  103 non-null    float64
 10  earconch  104 non-null    float64
 11  eye       104 non-null    float64
 12  chest     104 non-null    float64
 13  belly     104 non-null    float64
dtypes: float64(10), int64(2), object(2)
memory usage: 11.5+ KB


In [None]:
df[df['age'].isnull()]

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
43,44,2,Vic,m,,85.1,51.5,76.0,35.5,70.3,52.6,14.4,23.0,27.0
45,46,2,Vic,m,,91.4,54.4,84.0,35.0,72.8,51.2,14.4,24.5,35.0


In [None]:
df['age'].fillna(df[df['sex']=='m']['age'].median(), inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['age'].fillna(df[df['sex']=='m']['age'].median(), inplace = True)


In [None]:
df['age'].isnull().sum()

0

In [None]:
df[df['footlgth'].isnull()]

Unnamed: 0,case,site,Pop,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
40,41,2,Vic,f,5.0,88.4,57.0,83.0,36.5,,40.3,15.9,27.0,30.5


In [None]:
df['footlgth'].fillna(df[df['sex']=='f']['footlgth'].median(), inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['footlgth'].fillna(df[df['sex']=='f']['footlgth'].median(), inplace = True)


In [None]:
df['footlgth'].isnull().sum()

0

In [None]:
df['Pop'].unique()

array(['Vic', 'other'], dtype=object)

In [None]:
df['Pop'] = df.Pop.map({'Vic':1,'other':0})

In [None]:
df['Pop'].unique()

array([1, 0])

In [None]:
df['sex'].unique()

array(['m', 'f'], dtype=object)

In [None]:
df['sex'] = df.sex.map({'m':1,'f':0})

In [None]:
df['sex'].unique()

array([1, 0])

In [None]:
df.drop('case', axis = 1, inplace = True)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 104 entries, 0 to 103
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   site      104 non-null    int64  
 1   Pop       104 non-null    int64  
 2   sex       104 non-null    int64  
 3   age       104 non-null    float64
 4   hdlngth   104 non-null    float64
 5   skullw    104 non-null    float64
 6   totlngth  104 non-null    float64
 7   taill     104 non-null    float64
 8   footlgth  104 non-null    float64
 9   earconch  104 non-null    float64
 10  eye       104 non-null    float64
 11  chest     104 non-null    float64
 12  belly     104 non-null    float64
dtypes: float64(10), int64(3)
memory usage: 10.7 KB


In [None]:
X = df.drop('footlgth',axis = 1)

In [None]:
Y = df['footlgth']

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 42)

In [None]:
KNN_R = KNeighborsRegressor()
cross_val = cross_val_score(KNN_R, X_train, Y_train, cv = 5)
print("the accuracy of training model:",np.mean(cross_val))
fit_model = KNN_R.fit(X_train, Y_train)
Y_pred = fit_model.predict(X_test)
print("the accuracy of testing model:",fit_model.score(X_test,Y_test))


the accuracy of training model: 0.6491603640426657
the accuracy of testing model: 0.7682523045601072


In [None]:
df_result1 = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred})
df_result1

Unnamed: 0,Actual,Predicted
30,68.0,73.0
65,63.2,64.36
64,68.2,66.18
53,71.3,67.86
45,72.8,72.16
93,64.2,64.88
91,67.6,64.64
47,66.9,67.78
10,77.2,73.82
0,74.5,72.3


In [None]:
grid = {
    'n_neighbors':np.arange(1,50),
    'metric':['euclidean', 'manhattan', 'minkowski' ],
    'p':np.arange(1,3),
    'algorithm':['auto','ball_tree','kd_tree','brute']
}

knn_model = KNeighborsRegressor()
knn_cv = GridSearchCV(knn_model,grid, cv= 3)
Trained = knn_cv.fit(X_train,Y_train)
print("Hyperparameters:",knn_cv.best_params_)
print("Best traing score:",knn_cv.best_score_)
print("Best testing score:",Trained.score(X_test,Y_test))
Y_pred = Trained.predict(X_test)
df_result2 = pd.DataFrame({'Actual2': Y_test, 'Predicted2': Y_pred})
df_result2

Hyperparameters: {'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 13, 'p': 1}
Best traing score: 0.7329602159295803
Best testing score: 0.7407732542192409


Unnamed: 0,Actual2,Predicted2
30,68.0,72.853846
65,63.2,64.215385
64,68.2,65.192308
53,71.3,68.0
45,72.8,70.984615
93,64.2,65.269231
91,67.6,64.730769
47,66.9,67.169231
10,77.2,72.823077
0,74.5,72.807692


With scaling

In [None]:
pd.concat([df_result1,df_result2], axis = 1)

Unnamed: 0,Actual,Predicted,Actual2,Predicted2
30,68.0,73.0,68.0,72.853846
65,63.2,64.36,63.2,64.215385
64,68.2,66.18,68.2,65.192308
53,71.3,67.86,71.3,68.0
45,72.8,72.16,72.8,70.984615
93,64.2,64.88,64.2,65.269231
91,67.6,64.64,67.6,64.730769
47,66.9,67.78,66.9,67.169231
10,77.2,73.82,77.2,72.823077
0,74.5,72.3,74.5,72.807692


In [None]:
def scalling(data,scaler):
  scaler = scaler()
  scaler.fit(data)
  return scaler.transform(data)

In [None]:
X_train = scalling(X_train,StandardScaler)
X_test = scalling(X_test, StandardScaler)

In [None]:
KNN_R = KNeighborsRegressor()
cross_val = cross_val_score(KNN_R,X_train,Y_train,cv = 5)
print("the accuracy of training model:",np.mean(cross_val))
fit_model = KNN_R.fit(X_train,Y_train)
Y_pred = fit_model.predict(X_test)
print("the accuracy of testing model:",fit_model.score(X_test,Y_test))
df_result = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred})
df_result

the accuracy of training model: 0.6491603640426657
the accuracy of testing model: 0.7682523045601072


Unnamed: 0,Actual,Predicted
30,68.0,73.0
65,63.2,64.36
64,68.2,66.18
53,71.3,67.86
45,72.8,72.16
93,64.2,64.88
91,67.6,64.64
47,66.9,67.78
10,77.2,73.82
0,74.5,72.3


In [None]:
grid = {
    'n_neighbors':np.arange(1,50),
    'metric':['euclidean', 'manhattan', 'minkowski' ],
    'p':np.arange(1,3),
    'algorithm':['auto','ball_tree','kd_tree','brute']
}

knn_model = KNeighborsRegressor()
knn_cv = GridSearchCV(knn_model,grid, cv= 3)
knn_cv.fit(X_train,Y_train)
print("Hyperparameters:",knn_cv.best_params_)
print("Best traing score:",knn_cv.best_score_)
print("Best testing score:",knn_cv.score(X_test,Y_test))

Hyperparameters: {'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 13, 'p': 1}
Best traing score: 0.7329602159295803
Best testing score: 0.7407732542192409


In [None]:
X_train = scalling(X_train,MinMaxScaler)
X_test = scalling(X_test, MinMaxScaler)

In [None]:
KNN_R = KNeighborsRegressor()
cross_val = cross_val_score(KNN_R,X_train,Y_train,cv = 5)
print("the accuracy of training model:",np.mean(cross_val))
fit_model = KNN_R.fit(X_train,Y_train)
Y_pred = fit_model.predict(X_test)
print("the accuracy of testing model:",fit_model.score(X_test,Y_test))
df_result = pd.DataFrame({'Actual': Y_test, 'Predicted': Y_pred})
df_result

the accuracy of training model: 0.6529344823784479
the accuracy of testing model: 0.7818206496126078


Unnamed: 0,Actual,Predicted
30,68.0,73.0
65,63.2,63.28
64,68.2,65.84
53,71.3,70.26
45,72.8,70.38
93,64.2,64.2
91,67.6,65.26
47,66.9,68.1
10,77.2,73.82
0,74.5,72.4


In [None]:
grid = {
    'n_neighbors':np.arange(1,50),
    'metric':['euclidean', 'manhattan', 'minkowski' ],
    'p':np.arange(1,3),
    'algorithm':['auto','ball_tree','kd_tree','brute']
}

knn_model = KNeighborsRegressor()
knn_cv = GridSearchCV(knn_model,grid, cv= 3)
trained = knn_cv.fit(X_train,Y_train)
print("Hyperparameters:",knn_cv.best_params_)
print("Best traing score:",knn_cv.best_score_)
print("Best testing score:",trained.score(X_test,Y_test))

Hyperparameters: {'algorithm': 'auto', 'metric': 'manhattan', 'n_neighbors': 11, 'p': 1}
Best traing score: 0.7272859890136395
Best testing score: 0.7300789086615378


  _data = np.array(data, dtype=dtype, copy=copy,


In [None]:
0.7300789086615378
0.7300789086615378