In [None]:
#import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import neighbors
from sklearn.metrics import mean_squared_error 
from math import sqrt
from sklearn.metrics import r2_score
from pyearth import Earth
from sklearn.ensemble import GradientBoostingRegressor

In [None]:
# Load the volumetric feature table and split it into predictors (X) and target (y).
# 'S.No' and 'dataset' are dropped from the predictors together with the target
# column 'Age' itself.
df = pd.read_csv(r'Volumetric_features.csv')
y = df['Age']
X = df.drop(['S.No', 'dataset', 'Age'], axis=1)

# Preview the first ten rows of the raw table
df.head(10)

In [None]:
# Histogram of the target variable (Age) across all samples, using 10 bins.
# Series.hist returns the Axes it drew on, so the labels are set on that object
# directly instead of going through the pyplot state machine.
age_axes = df['Age'].hist(bins=10)
age_axes.set_ylabel('Number of Samples')
age_axes.set_xlabel('Age')
age_axes.set_title('Age Distribution')
plt.show()

In [None]:
# Build the pairwise correlation matrix of Age and all candidate features
# (the 'S.No' and 'dataset' columns are excluded first).
df_new = df.drop(columns=['S.No', 'dataset'])
correlation_matrix = df_new.corr()

# Show how each column correlates with Age. A bare expression in the middle of
# a cell is evaluated and thrown away (Jupyter only renders the last
# expression), so the column has to be printed explicitly to be visible.
print(correlation_matrix['Age'])

# Heatmap of the full matrix; annotations are off, the cells are only colored.
dataplot = sns.heatmap(correlation_matrix, cmap="YlGnBu", annot=False)
plt.show()

In [None]:
# Correlation-based feature selection: keep only the features whose absolute
# correlation with Age is above a threshold.
# NOTE(review): correlation_matrix is computed on the whole data set, i.e.
# before any train/test split, so this selection has already seen the rows that
# later end up in the test set. Consider computing it on training rows only.
CORRELATION_THRESHOLD = 0.2  # minimum |correlation with Age| for a feature to be kept

cor_target = abs(correlation_matrix['Age'])
relevant_features = cor_target[cor_target > CORRELATION_THRESHOLD]
print(relevant_features)

# Select exactly the columns reported above, minus the target itself.
# Building the keep-list from `relevant_features` (instead of dropping the
# columns that compare `< threshold`) guarantees that the printed list and the
# model input agree: features sitting exactly on the threshold, or with an
# undefined (NaN) correlation, are excluded rather than silently kept.
selected_columns = [name for name in relevant_features.index if name != 'Age']
if not selected_columns:
    raise ValueError(
        f'No feature has |correlation with Age| > {CORRELATION_THRESHOLD}'
    )

y = df['Age'].values
x = df[selected_columns]

In [None]:
# Train/test data for the model without (X) and with (x) feature selection.
#
# Both feature sets are split in ONE seeded call so that
#   * the same rows land in the test set for both models, which makes their
#     scores directly comparable, and
#   * re-running the notebook reproduces the same partition.
RANDOM_STATE = 42
X_train, X_test, X_train2, X_test2, y_train, y_test = train_test_split(
    X, x, y, test_size=0.25, random_state=RANDOM_STATE
)
# The targets are identical for both models; the *2 names are kept because the
# feature-selection cells refer to them.
y_train2, y_test2 = y_train, y_test

# Standardize the features. Each scaler is fitted on its training portion only
# and then applied unchanged to the matching test portion.
scaler = StandardScaler()
scaler2 = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train2 = scaler2.fit_transform(X_train2)
X_test2 = scaler2.transform(X_test2)

In [None]:
# Elbow curve for the model without feature selection: RMSE on the test set
# for every neighbor count from 1 up to 80.
# NOTE(review): the curve is evaluated on the test set, so choosing k from it
# tunes the model on test data; a validation split or cross-validation would
# give an unbiased estimate.
max_k = min(80, len(X_train))  # KNN cannot query more neighbors than it was fitted on
a = list(range(1, max_k + 1))  # candidate neighbor counts (x axis)
rmse_val = []  # RMSE for each entry of `a` (y axis)
for K in a:
    model = neighbors.KNeighborsRegressor(n_neighbors=K)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    error = sqrt(mean_squared_error(y_test, pred))  # root of the MSE -> RMSE
    rmse_val.append(error)

plt.plot(a, rmse_val)
plt.ylabel('RMSE')
plt.xlabel('Number of Neighbors')
plt.title('Neighbors Optimization w/o Feature Selection')
plt.show()

In [None]:
# Elbow curve for the model with feature selection: RMSE on the test set
# for every neighbor count from 1 up to 80.
# NOTE(review): the curve is evaluated on the test set, so choosing k from it
# tunes the model on test data; a validation split or cross-validation would
# give an unbiased estimate.
max_k2 = min(80, len(X_train2))  # KNN cannot query more neighbors than it was fitted on
a2 = list(range(1, max_k2 + 1))  # candidate neighbor counts (x axis)
rmse_val2 = []  # RMSE for each entry of `a2` (y axis)
for K in a2:
    model2 = neighbors.KNeighborsRegressor(n_neighbors=K)
    model2.fit(X_train2, y_train2)
    pred2 = model2.predict(X_test2)
    error2 = sqrt(mean_squared_error(y_test2, pred2))  # root of the MSE -> RMSE
    rmse_val2.append(error2)

plt.plot(a2, rmse_val2)
plt.ylabel('RMSE')
plt.xlabel('Number of Neighbors')
plt.title('Neighbors Optimization w/ Feature Selection')
plt.show()

In [None]:
# Final KNN models, with and without feature selection.
def _fit_and_score_knn(n_neighbors, X_tr, y_tr, X_te, y_te):
    """Fit a KNN regressor and score it on a held-out set.

    Parameters
    ----------
    n_neighbors : int
        Number of neighbors used by the regressor.
    X_tr, y_tr
        Training features and targets the model is fitted on.
    X_te, y_te
        Held-out features and targets the model is scored on.

    Returns
    -------
    tuple
        ``(fitted_model, predictions, mse)`` where ``mse`` is the mean squared
        error of ``predictions`` against ``y_te`` (NOT its square root).
    """
    knn = neighbors.KNeighborsRegressor(n_neighbors=n_neighbors)
    knn.fit(X_tr, y_tr)
    predictions = knn.predict(X_te)
    return knn, predictions, mean_squared_error(y_te, predictions)


# NOTE(review): 12 and 7 are presumably the neighbor counts read off the elbow
# curves; re-check them whenever the split or the selected features change.
model, pred, error = _fit_and_score_knn(12, X_train, y_train, X_test, y_test)

print('Model accuracies without feature selection')
print(f'RSQ: {r2_score(y_test,pred):.3f}')
print(f'MSE: {error:.3f}')

model2, pred2, error2 = _fit_and_score_knn(7, X_train2, y_train2, X_test2, y_test2)

print('Model accuracies with feature selection')
print(f'RSQ: {r2_score(y_test2,pred2):.3f}')
print(f'MSE: {error2:.3f}')