In [18]:
import pandas as pd
import seaborn as sb
import numpy as np
import warnings
warnings.simplefilter("ignore")
import sklearn.metrics
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer 
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
from sklearn.linear_model import LogisticRegression
import importlib.util
from sklearn.metrics import confusion_matrix
from collections import defaultdict

In [22]:
dataset = pd.read_csv('newHIV-1_data/1625Data.txt', names = ['Peptides', 'Output'])
dataset.head(5)

Unnamed: 0,Peptides,Output
0,SLNLRETN,1
1,AECFRIFD,1
2,HLVEALYL,1
3,TQIMFETF,1
4,AEELAEIF,1


In [23]:
# Seperate all amino acids
peptides = np.array([[dataset["Peptides"][i][j] for i in range(dataset.shape[0])] for j in range(8)])
peptides.shape

(8, 1625)

In [24]:
# Store the seperated amino acids into a dataframe
dataset1= pd.DataFrame(peptides.T, columns=list('ABCDEFGH'))
dataset1.shape

(1625, 8)

In [25]:
# dataset = dataset.join(dataset_One)
dataset = pd.concat([dataset, dataset1], axis=1)
dataset.head()

Unnamed: 0,Peptides,Output,A,B,C,D,E,F,G,H
0,SLNLRETN,1,S,L,N,L,R,E,T,N
1,AECFRIFD,1,A,E,C,F,R,I,F,D
2,HLVEALYL,1,H,L,V,E,A,L,Y,L
3,TQIMFETF,1,T,Q,I,M,F,E,T,F
4,AEELAEIF,1,A,E,E,L,A,E,I,F


In [26]:
dataset = dataset.drop(columns="Peptides", axis=0)
dataset.head()

Unnamed: 0,Output,A,B,C,D,E,F,G,H
0,1,S,L,N,L,R,E,T,N
1,1,A,E,C,F,R,I,F,D
2,1,H,L,V,E,A,L,Y,L
3,1,T,Q,I,M,F,E,T,F
4,1,A,E,E,L,A,E,I,F


In [28]:
# rearrange col dataset
dataset = dataset[['A','B','C','D','E','F','G','H','Output']]
dataset.head()

Unnamed: 0,A,B,C,D,E,F,G,H,Output
0,S,L,N,L,R,E,T,N,1
1,A,E,C,F,R,I,F,D,1
2,H,L,V,E,A,L,Y,L,1
3,T,Q,I,M,F,E,T,F,1
4,A,E,E,L,A,E,I,F,1


In [29]:
#separate dataset into x_train and y_train
from sklearn.model_selection import train_test_split
train,test=train_test_split(dataset,test_size=0.20,random_state=0)

In [30]:
test.to_csv('test.csv',index=False,encoding='utf-8')

In [31]:
#split the train data into train and cross validation
train_data,cv_data=train_test_split(train,test_size=0.20,random_state=0)

In [32]:
train_data.shape,cv_data.shape

((1040, 9), (260, 9))

In [33]:
x_train=train_data.iloc[:,:-1].values
y_train=train_data.iloc[:,8].values

In [34]:
x_train.shape,y_train.shape

((1040, 8), (1040,))

In [35]:
x_cv=cv_data.iloc[:,:-1].values
y_cv=cv_data.iloc[:,8].values

In [36]:
x_train = pd.DataFrame(x_train)
type(x_train)

pandas.core.frame.DataFrame

In [37]:
x_cv.shape,y_cv.shape

((260, 8), (260,))

In [38]:
x_cv=pd.DataFrame(x_cv)

In [39]:
from sklearn.impute import SimpleImputer 
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
class KNN:

  
    def onehotcode(self,x_data):
        # from collections import defaultdict
        d = defaultdict(LabelEncoder)
        # Encoding the variable
        fit = x_data.apply(lambda x: d[x.name].fit_transform(x))
        # Inverse the encoded
        fit.apply(lambda x: d[x.name].inverse_transform(x))
        # Using the dictionary to label future data
        x_data.apply(lambda x: d[x.name].transform(x))
        one_hot_encode = OneHotEncoder()
        one_hot_encode.fit(x_data)
        x_data=one_hot_encode.transform(x_data).toarray()
        return x_data
    
    def label_encoding(sel,y_train):
        #Encoding the dependent variable
        labelencoder_y=LabelEncoder()
        # print(labelencoder_y)
        y_train=labelencoder_y.fit_transform(y_train)
        # print(y_train)
        return y_train
    
    def feature_scaling(self,dataset):
        #feature scaling
        sc=StandardScaler()
        x_train=sc.fit_transform(dataset)
        return x_train,sc

In [40]:
obj=KNN()
dataset=obj.onehotcode(x_train)
# print(dataset)
y_train=obj.label_encoding(y_train)
# print("label data on y_train",y_train)
x_train,sc_train=obj.feature_scaling(dataset)
# print(x_train)

In [41]:
x_cv=obj.onehotcode(x_cv)
# print(x_cv)
y_cv=obj.label_encoding(y_cv)
# print("label data on y_cv data",y_cv)
x_cv,sc_cv=obj.feature_scaling(x_cv)
# print(x_train)

In [42]:
#fitting the LogisticRegression to the training set 
from sklearn.neighbors import KNeighborsClassifier
classifier=KNeighborsClassifier(n_neighbors=5,metric='minkowski',p=2)
#that the classifier learns corr between x_train and y_train
classifier.fit(x_train,y_train)

KNeighborsClassifier()

In [43]:
#predicting the train set result
y_pred=classifier.predict(x_train)
dataset=pd.DataFrame({'Actual':y_train,'Predicted':y_pred})
dataset.head()

Unnamed: 0,Actual,Predicted
0,0,0
1,0,1
2,0,1
3,0,0
4,0,1


In [44]:
#predicting the cross validation result
y_predict=classifier.predict(x_cv)
dataset=pd.DataFrame({'Actual':y_cv,'Predicted':y_predict})
dataset.head()

Unnamed: 0,Actual,Predicted
0,1,1
1,0,1
2,0,1
3,0,1
4,0,1


In [45]:
#making the confusion matrix
from sklearn.metrics import confusion_matrix
cm=confusion_matrix(y_train,y_pred)

In [46]:
#making the confusion matrix
from sklearn.metrics import confusion_matrix
cm1=confusion_matrix(y_cv,y_predict)
cm1

array([[143,  69],
       [  5,  43]], dtype=int64)

In [47]:
#check accuracy for X train data
Acc_Train=sklearn.metrics.balanced_accuracy_score(y_pred,y_train)*100
print("accurancy for train data:=",Acc_Train)

accurancy for train data:= 82.42527055641705
