# kNN
## Multilabel, imbalanced
## Finding the optimal k

In [None]:
import pandas as pd #import libraries
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

# load the combined SCADA and downtime records
df = pd.read_csv(
    'C:/Users/nithi/Google Drive/Python/Student Data/SCADA_and_downtime.csv',
    skip_blank_lines=True,
)

# turbine ids found in the data, in ascending order
list1 = sorted(df['turbine_id'].unique(), key=int)

# turbine category ids: keep the non-negative ones (the comparison also
# discards NaN), sort ascending, then leave out the excluded categories
excluded = (1, 12, 13, 14, 15, 17, 21, 22)
category_ids = [g for g in df['TurbineCategory_id'].unique() if g >= 0]
list2 = [m for m in sorted(category_ids, key=int) if m not in excluded]

list4 = list(range(14))  # positions 0..13

# per-turbine results, filled in by the loop over list1
kp, kr, kf = [], [], []
pre, rec, f1s = [], [], []

# For every turbine: derive one "hours before fault" class column per fault
# category, then score kNN classifiers over a range of k with time-series
# cross-validation and record the best k (and score) for precision, recall
# and F1.
#
# Class codes written to each hours_<category> column:
#   10      the row itself is a fault of that category
#   11..18  fault follows within 1-6, 7-12, ..., 43-48 hours
#   19      no fault within 48 hours ("normal")
#   20      "normal" row flagged as curtailed or unusual
for x in list1:
    dfx = df[df['turbine_id'] == x].copy()  # data for turbine x only

    for y in list2:
        # fault marker: 0 where the turbine category id is y, 1 elsewhere
        dfx['mins'] = (dfx['TurbineCategory_id'] != y).astype(np.int64)

        # newest row first, so that row i-1 is the row following row i in time
        dfx = dfx.sort_values(by="timestamp", ascending=False)
        dfx.reset_index(drop=True, inplace=True)

        # Turn the 0/1 marker into a running count: every non-fault row is the
        # value of the row after it in time plus 10 (presumably minutes, one
        # SCADA record every 10 min - TODO confirm). Rows later than the last
        # fault start from a huge sentinel so they end up labelled "normal".
        mins = dfx['mins'].tolist()
        if mins[0] != 0:
            mins[0] = 999999999
        for i in range(1, len(mins)):
            if mins[i] == 1:
                mins[i] = mins[i - 1] + 10
        dfx['mins'] = mins  # kept as a column so it follows the re-sort below

        dfx = dfx.sort_values(by="timestamp")  # back to ascending order
        dfx.reset_index(drop=True, inplace=True)

        # convert to hours, rounded to the nearest hour;
        # more than 48 hours away from a fault is labelled normal (9999)
        hours = (dfx['mins'].astype(np.int64) / 60).round().astype(np.int64)
        hours = hours.mask(hours > 48, 9999)

        # Curtailment: a normal-labelled row whose pitch is outside
        # 0deg <= pitch <= 3.5deg, unless power is in the bottom or top 10% of
        # this turbine's maximum. NaN compares False everywhere, exactly as in
        # a scalar comparison.
        pitch = dfx['pitch']
        power = dfx['ap_av']
        max_power = power.max()  # computed once per turbine, not once per row
        pitch_normal = (pitch >= 0) & (pitch <= 3.5)
        pitch_abnormal = (pitch > 3.5) | (pitch < 0)
        power_extreme = (power <= .1 * max_power) | (power >= .9 * max_power)
        curtailed = ~(pitch_normal | (hours != 9999)
                      | (pitch_abnormal & power_extreme))

        # Unusual readings among normal-labelled rows: at operating wind
        # speeds, power <= 0, runtime < 600 or another downtime category is
        # active; or power > 100 below cut-in wind speed.
        wind = dfx['ws_av']
        category = dfx['TurbineCategory_id']
        other_downtime = ((power <= 0)
                          | (dfx['runtime'] < 600)
                          | (dfx['EnvironmentalCategory_id'] > 1)
                          | (dfx['GridCategory_id'] > 1)
                          | (dfx['InfrastructureCategory_id'] > 1)
                          | (dfx['AvailabilityCategory_id'] == 2)
                          | ((category >= 12) & (category <= 15))
                          | ((category >= 21) & (category <= 22)))
        operating = (wind > 3) & (wind < 25)
        unusual = (hours == 9999) & ((operating & other_downtime)
                                     | ((wind < 3) & (power > 100)))

        # 6-hour classes: (hours+5)//6 is the integer ceil(hours/6), giving
        # 0 -> 10, 1-6 -> 11, ..., 43-48 -> 18; everything else is 19
        bucket = (10 + (hours + 5) // 6).where(hours <= 48, 19)

        # unusual and curtailed rows get their own label (20)
        label = bucket.mask(unusual | curtailed, 20)
        dfx['hours_%s' % y] = label.astype(np.int64)

        dfx = dfx.drop('mins', axis=1)  # drop the helper column

    # separate features from classes for classification
    features = ['ap_av', 'ws_av', 'wd_av', 'pitch', 'ap_max', 'ap_dev',
                'reactive_power', 'rs_av', 'gen_sp', 'nac_pos']
    classes = [col for col in dfx.columns if 'hours' in col]
    list6 = features + classes  # list of columns to copy into new df
    df2 = dfx[list6].dropna()  # drop rows containing NaNs
    # preprocessing.normalize rescales each sample (row) to unit norm; it does
    # not map each feature to the 0..1 range
    X = preprocessing.normalize(df2[features])
    Y = df2[classes].values  # pd dataframe -> np array (as_matrix was removed)
    n_labels = Y.shape[1]  # score every label column actually present
    tscv = TimeSeriesSplit(n_splits=5)  # cross validation using time series split
    neighbours = list(range(1, 200, 2))  # odd values of k to evaluate

    p, r, f = [], [], []
    for k in neighbours:  # one kNN classifier per candidate k
        knn = KNeighborsClassifier(weights='distance', n_jobs=-1, n_neighbors=k)
        p1, r1, f1 = [], [], []
        for train_index, test_index in tscv.split(X):  # each cross validation fold
            X_train, X_test = X[train_index], X[test_index]
            Y_train, Y_test = Y[train_index], Y[test_index]
            knn1 = knn.fit(X_train, Y_train)
            Yp = knn1.predict(X_test)
            p2, r2, f2 = [], [], []
            for m in range(n_labels):  # score each label column separately
                Yt = Y_test[:, m]
                Ypr = Yp[:, m]
                p2.append(precision_score(Yt, Ypr, average='weighted'))
                r2.append(recall_score(Yt, Ypr, average='weighted'))
                f2.append(f1_score(Yt, Ypr, average='weighted'))
            # fold score = mean over label columns
            p1.append(np.mean(p2))
            r1.append(np.mean(r2))
            f1.append(np.mean(f2))
        # score for this k = mean over folds
        p.append(np.mean(p1))
        r.append(np.mean(r1))
        f.append(np.mean(f1))

    # best score per metric and the (first) k that achieved it
    best_p, best_r, best_f = max(p), max(r), max(f)
    kp.append(neighbours[p.index(best_p)])
    kr.append(neighbours[r.index(best_r)])
    kf.append(neighbours[f.index(best_f)])
    pre.append(best_p)
    rec.append(best_r)
    f1s.append(best_f)
    
# summary table, one row per turbine: the k that gave the best precision,
# recall and F1, followed by those best scores
d = pd.DataFrame({'turbine': list1})
for column, values in (('kp', kp), ('kr', kr), ('kf', kf),
                       ('precision', pre), ('recall', rec), ('f1', f1s)):
    d[column] = values

In [2]:
d

Unnamed: 0,turbine,kp,kr,kf,precision,recall,f1
0,1,169,175,175,0.791794,0.822825,0.798513
1,2,167,185,185,0.852676,0.877657,0.860082
2,3,5,15,5,0.886624,0.893357,0.886841
3,4,1,9,9,0.886866,0.89536,0.886279
4,5,3,19,5,0.891104,0.910093,0.892272


In [4]:
d

Unnamed: 0,turbine,kp,kr,kf,precision,recall,f1
0,6,7,15,7,0.875538,0.886068,0.870467
1,7,197,193,193,0.831667,0.827263,0.820586
2,8,21,33,33,0.845641,0.861834,0.843192
3,9,3,5,3,0.874436,0.879842,0.87096
4,10,3,47,13,0.881604,0.890181,0.877306


In [6]:
d

Unnamed: 0,turbine,kp,kr,kf,precision,recall,f1
0,11,7,35,111,0.862236,0.872568,0.858012
1,12,41,41,41,0.86023,0.877434,0.862543
2,13,39,69,69,0.849221,0.867934,0.850684
3,14,3,17,5,0.913416,0.927325,0.9135
4,15,7,13,13,0.80095,0.824893,0.804398


In [8]:
d

Unnamed: 0,turbine,kp,kr,kf,precision,recall,f1
0,16,1,5,1,0.847872,0.841106,0.829046
1,17,15,167,167,0.84492,0.862531,0.849888
2,18,5,7,3,0.883764,0.886982,0.866699
3,19,5,7,5,0.903697,0.914291,0.903025
4,20,7,7,3,0.839797,0.852354,0.822786


In [10]:
d

Unnamed: 0,turbine,kp,kr,kf,precision,recall,f1
0,21,1,7,3,0.913511,0.924246,0.910914
1,22,5,7,7,0.91818,0.926138,0.917425
2,23,13,13,13,0.90385,0.912259,0.899484
3,24,1,5,1,0.859175,0.839408,0.825961
4,25,3,13,5,0.908905,0.923823,0.910643
