# TKO2096 
## EXERCISE II Prediction of the metal ion content from multi-parameter data
YUE MA 520790

In [1]:
import numpy as np
import pandas as pd
import pdb
import operator
import random

## Load and preprocess data

In [2]:
water_data=pd.read_csv("/Users/mayue/Desktop/TKO-2096/Exe/Water_data.csv")

In [3]:
display(water_data)

Unnamed: 0,c_total,Cd,Pb,Mod1,Mod2,Mod3
0,0,0.0,0.0,9945,119,72335
1,0,0.0,0.0,10786,117,82977
2,0,0.0,0.0,10812,120,98594
3,14,0.0,14.0,9742,127,154323
4,14,0.0,14.0,10566,108,136416
5,14,0.0,14.0,8495,120,131672
6,14,2.8,11.2,10400,134,96528
7,14,2.8,11.2,8298,113,99239
8,14,2.8,11.2,8563,130,113979
9,14,5.6,8.4,9879,130,87882


In [4]:
"""split the data into attributes and targets"""
y=water_data.iloc[:,0:3]
X=water_data.iloc[:,3:6]


In [5]:
"""normalize the data"""
from scipy.stats import zscore
y=zscore(y)
X=zscore(X)

In [6]:
# """split data into train set and test set"""
# n=water_data.shape[0]
# m=150 #the number of train instances
# X_train=X[0:m,:]
# y_train=y[0:m,:]
# X_test=X[m:n,:]
# y_test=y[m:n,:]
# print(X_train.shape[0],X_test.shape[0])

In [7]:
"""randomly split data into train set and test set"""
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9)

## Define the model

In [8]:
class KNN_Regression():
    """set the hyper parameter K"""
    def __init__(self,num_neighbors=3):
        self.num_neighbors=num_neighbors
        
    def fit(self,attributes,targets):
        """let the classfier 'memorize' the trainset"""
        """attributes and targets are arrays"""
        self.attributes=attributes
        self.all_targets=targets #store the copy of all different kinds of labels
        self.targets=targets#labels we want to predict for now
        
        
    def predict(self,inX):
        """predict an input instance's label """
        """inX is an 1-d array which represents an instance"""
        size=self.attributes.shape[0]
        diffMat=np.tile(inX, (size,1))-self.attributes
#     pdb.set_trace()
        sqDiffMat=diffMat**2
        sqDistances=sqDiffMat.sum(axis=1)
        distances=sqDistances**0.5
        sortedDistanceIndices=distances.argsort()
        valueCount=[]
        for i in range(self.num_neighbors):
            voteIvalue=self.targets[sortedDistanceIndices[i]]
            valueCount.append(voteIvalue)
        predictedValue=np.mean(valueCount)
        return predictedValue

In [9]:
class KNN_Regression_Multi(KNN_Regression):


    def select_label(self,indice):
        """fit the model using selected labels"""
        self.targets=self.all_targets[:,indice]
        
    def predict_multi(self,inX,label_index):
        """predict for multi label values,return a list of predicted values"""
        """label_indice: a list of index of labels to be predicted"""
        n=self.all_targets.shape[1]
        pred=[]
        for i in label_index:
            self.select_label(i)#change the label to be predicted
            pred.append(self.predict(inX))
        return pred
    
    def predict_ion(self,inX):
        """this function is to solve the specific requirement of the case, where one lable c_total is the sum of other 2 labels"""
        pred=[]
        temp=self.predict_multi(inX,[1,2])
        pred.append(temp[0])
        pred.append(temp[1])
#         pdb.set_trace()
        pred.append(sum(pred))
        pred=[pred[2],pred[0],pred[1]] #arrange the order of the values
        return pred

To make the KNN predictor works on predicting multi values, I have created a subclass of `KNN_Regression_Multi`. It can produce multi predicted values at a meanwhile.

Another tricky problem I have met is that the case is special which include a label `c_total` as the sum of other labels. So I have to build a new special function to get the better predictions

In [10]:
"""simplely test the model"""
a=KNN_Regression_Multi(3)
a.fit(X_train,y_train)
print(a.predict_ion(X_train[1]))
print("----------")
print(a.predict_multi(X_train[1],[0,1,2]))

[1.6998895486271492, 2.120376322042721, -0.420486773415572]
----------
[1.0794694930823088, 2.120376322042721, -0.420486773415572]


We can easily find that the differences of two predicting functions.

## Implementation of the C-index. 

In [11]:
def C_index(pred,true_labels):
    """pred and true_labels are sequences which have same length"""
    n=0
    h_sum=0.0
    for i in range(0,len(true_labels)):
        t=true_labels[i]
        p=pred[i]
        for j in range(i+1,len(true_labels)):
            nt=true_labels[j]
            np=pred[j]
            if t!=nt:
                n=n+1
#                 pdb.set_trace()
                if ((p<np)&(t<nt))|((p>np)&(t>nt)):
                    h_sum+=1.0
                else:
                    if p==np:
                        h_sum+=0.5
    return float(h_sum/n)

In [12]:
"""test the C_index function,the result should be 0.75"""
C_index([0.60,0.80,0.75,0.75,0.7],[-1,1,1,-1,1])

0.75

The function works because it return correct values over given data.

In [13]:
def C_index_scorer(predictor,X_test,y_test):
    """evaluate prediction performance using C—index over the data set"""
#     results=[]
    scores=[]
    for inX,iny in zip(X_test,y_test):
        pred=predictor.predict_ion(inX) #should consider the methods of the object therefore are defective
#         pdb.set_trace()
#         results+=pred
        scores.append(C_index(pred,iny))
    return scores

## Define CVs

In [14]:
class LOO_CV():
    
    def load_data(self,attributes,targets):
        """load the train data before validating"""
#         random.shuffle(attributes)
#         random.shuffle(targets)
        self.attributes=attributes
        self.targets=targets
        
    def CV_eval(self,predictor_class,scorer,num_neighbor):
        """predictor_class is the predictor CLASS used to validate"""
        """scorer is a function used to evaluate the performance over dataset"""
        """num_neighbor is a contemporary parameter for this exercise, which is used to set the KNN model"""
        """the results of each validation are represented in a dataframe"""
        m=self.attributes.shape[0]
        all_results={}
        all_scores={}
        for i in range(0,m):
            #split train set and validate set
            X_validate=self.attributes[i]
            y_validate=self.targets[i]
            X_validate=X_validate.reshape(1,X_validate.shape[0])
            y_validate=y_validate.reshape(1,y_validate.shape[0])#make sure they are 2-d array
            X_train=np.delete(self.attributes,i,0)
            y_train=np.delete(self.targets,i,0)
#             pdb.set_trace()
#             hp=hyper_parameter_values.keys()[0]#get the name of the parameter
            predictor=predictor_class(num_neighbor)
            predictor.fit(X_train,y_train)
            pred=predictor.predict_ion(X_validate)
            all_results[str(i)]=pred
            all_scores[str(i)]= scorer(predictor,pred,y_validate)
#         self.all_results=pd.DataFrame.from_dict(all_results,orient='index',columns=['c_total','cd','pb'])
        self.all_scores=pd.DataFrame.from_dict(all_scores,orient='index',columns=['c-index'])
        self.all_results=pd.DataFrame.from_dict(all_results,orient='index',columns=['c_total','cd','pb'])
        return self.all_scores
            

In [15]:
class L3O_CV(LOO_CV):
    
    def CV_eval(self,predictor_class,scorer,num_neighbor):
        """predictor_class is the predictor CLASS used to validate"""
        """scorer is a function used to evaluate the performance over dataset"""
        """num_neighbor is a contemporary parameter for this exercise, which used to set the KNN model"""
        """the results of each validation are represented in a dataframe"""
        m=self.attributes.shape[0]
        all_results={}
        all_scores={}
        for i in range(0,m,3):
            #split train set and validate set
            X_validate=self.attributes[i:i+3]
            y_validate=self.targets[i:i+3]          
            X_train=np.delete(self.attributes,[i,i+1,i+2],0)
            y_train=np.delete(self.targets,[i,i+1,i+2],0)
#             pdb.set_trace()
            predictor=predictor_class(num_neighbor)
            predictor.fit(X_train,y_train)
#             all_results[str(i)]=scorer(predictor,X_validate,y_validate)
#         self.mean_score=np.mean(all_results)
#         self.eval_results=pd.DataFrame.from_dict(all_results,orient='index',columns=['c-index_1','c-index_2','c-index_3'])
#         return self.eval_results

            all_scores[str(i)]= scorer(predictor,X_validate,y_validate)# this step is special for solving this case

                
#         self.mean_score=np.mean(all_results)
        self.all_scores=pd.DataFrame.from_dict(all_scores,orient='index',columns=['c_total','cd','pb'])
#         self.all_scores=pd.DataFrame.from_dict(all_scores,orient='index',columns=['c-index'])
        return self.all_scores
            

## Calculate the C-index based on LOO CV and L3O CV

In [16]:
"""Calculate c-index for different k values using LOO CV"""
loo=LOO_CV()
loo.load_data(X_train,y_train)

namelist=[]
for i in range(1,6):
    name='loo_score_'+str(i)
    namelist.append(name)
    locals()[name]=loo.CV_eval(KNN_Regression_Multi,C_index_scorer,i)

In [17]:
loo_scores=pd.concat([loo_score_1,loo_score_2,loo_score_3,loo_score_4,loo_score_5],axis=1)
loo_scores.columns=['k=1','k=2','k=3','k=4','k=5']
loo_scores


Unnamed: 0,k=1,k=2,k=3,k=4,k=5
11,0.333333,0.0,0.0,0.0,0.0
10,0.333333,1.0,1.0,1.0,1.0
13,1.0,0.333333,0.333333,0.333333,0.333333
12,0.333333,1.0,1.0,1.0,1.0
15,0.666667,0.666667,0.666667,0.666667,0.666667
14,0.0,0.333333,0.333333,0.333333,0.333333
17,1.0,0.666667,1.0,1.0,0.666667
16,0.666667,1.0,0.666667,0.333333,0.333333
19,0.666667,1.0,0.333333,0.333333,0.333333
18,0.333333,1.0,1.0,1.0,1.0


In [19]:
loo_scores.mean()

k=1    0.500000
k=2    0.750000
k=3    0.633333
k=4    0.633333
k=5    0.550000
dtype: float64

firstly output the all c-index results using LOO. When k=2 there should a best performance for the given KNN regression model. To get the unbiased result, I set the k=1 and test this model over new data, which is the test set

In [22]:
loo_best=KNN_Regression_Multi(num_neighbors=2)
loo_best.fit(X_train,y_train)
np.mean(C_index_scorer(loo_best,X_test,y_test))#the performance of best model over test set

0.7569060773480663

In [23]:
"""Calculate c-index for different k values using L3O CV"""
l3o=L3O_CV()
l3o.load_data(X_train,y_train)

namelist=[]
for i in range(1,6):
    name='l3o_score_'+str(i)
    namelist.append(name)
    locals()[name]=l3o.CV_eval(KNN_Regression_Multi,C_index_scorer,i)

l3o_scores=pd.concat([l3o_score_1,l3o_score_2,l3o_score_3,l3o_score_4,l3o_score_5],axis=1)
l3o_scores.columns=[['k=1','k=1','k=1','k=2','k=2','k=2','k=3','k=3','k=3','k=4','k=4','k=4','k=5','k=5','k=5'],['C1','C2','C3','C1','C2','C3','C1','C2','C3','C1','C2','C3','C1','C2','C3']]
display(l3o_scores)

  from ipykernel import kernelapp as app
  app.launch_new_instance()


Unnamed: 0_level_0,k=1,k=1,k=1,k=2,k=2,k=2,k=3,k=3,k=3,k=4,k=4,k=4,k=5,k=5,k=5
Unnamed: 0_level_1,C1,C2,C3,C1,C2,C3,C1,C2,C3,C1,C2,C3,C1,C2,C3
12,0.333333,0.666667,0.333333,0.5,0.666667,0.333333,0.666667,0.666667,0.333333,1.0,0.666667,0.333333,1.0,0.666667,0.333333
15,1.0,1.0,0.666667,1.0,1.0,0.666667,1.0,0.333333,0.666667,1.0,0.333333,0.666667,1.0,0.333333,0.666667
18,0.666667,1.0,,0.666667,1.0,,0.666667,0.333333,,0.666667,0.333333,,1.0,0.666667,
0,1.0,0.333333,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.333333,1.0,1.0,0.333333,1.0,1.0
3,1.0,0.333333,1.0,0.666667,0.333333,1.0,0.666667,0.333333,1.0,0.666667,0.0,1.0,0.666667,0.0,0.333333
6,1.0,0.0,0.666667,0.333333,0.0,0.666667,0.333333,0.333333,0.666667,0.333333,0.333333,0.666667,0.333333,0.333333,0.666667
9,1.0,0.666667,0.333333,1.0,0.666667,0.333333,1.0,0.666667,0.0,1.0,0.666667,0.333333,0.333333,0.666667,0.0


In [24]:
l3o_scores_mean=l3o_scores.mean(axis=1,level=0).mean()
print(l3o_scores_mean)

k=1    0.706349
k=2    0.698413
k=3    0.626984
k=4    0.611111
k=5    0.579365
dtype: float64


In [25]:
l3o_scores_mean.idxmax()

'k=1'

then produce the results of L3O. We can find that averagely this CV gets better performance on train set than LOO. And for this evaluation approach, the best k should be 1, the same as LOO. 

In [26]:
l3o_best=KNN_Regression_Multi(1)
l3o_best.fit(X_train,y_train)
np.mean(C_index_scorer(l3o_best,X_test,y_test))#the performance of best model over test set

0.7808471454880295

- A tricky problem for me here is that how I should use the C-index. This is to say, Should I calculate predicted labels and true labels for each instance, or should I calculte all the predicted values and all true values as for EACH SINGLE label? I have tried both, and the present version may be the solution most suited to the question.
- Firstly I want to make CV as more non-specifical as possible, but it is very difficult. So some steps here have not been encapsulated perfectly. I will try to imporve this point in the follow-up exercises.
-  I have guessed that L3O should generalize better. Actually LOO gets worse perfermance on train data than L3O. The reason here might be that validating model on three samples once will make the score more biased than on only one sample.
- For test data, LOO's best model has worse performance than L3O's best model. This means for this case L3O generalize better than LOO
- But I can' t simply conclude that who generalize better because the data is not enough. I have tried different proportion of test set and results differ.
- Intuitively I think the evaluation of L3O might be more stable because it validate on more instances thefore avoids contingency more, which will also make scores optimisticly biased sometimes. For this exercise there are copies for samples, this may make the results influenced ,too.