In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import average_precision_score
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import KFold

In [5]:
class Model:
    """K-fold cross-validation harness with simple filter-based feature selection.

    The CSV read by ``loadData`` must contain a ``Label`` column, which is used
    as the target; every other column is used as a feature. For each fold the
    features are filtered in three steps (constant, quasi-constant, duplicate),
    always fitting the filter on the training part of the fold only, and a
    classifier is then trained and scored with plain accuracy.

    Attributes set by ``loadData``:
        fold: number of folds.
        kFold: the shuffled ``KFold`` splitter (fixed ``random_state=1``).
        data, X, Y: full frame, feature frame and label series.
        avg_accuracy: per-fold accuracies (in percent) of the most recent run.
    """

    def __init__(self, location, numOfFold):
        """Load the dataset at ``location`` and configure ``numOfFold`` folds."""
        self.loadData(location, numOfFold)

    def loadData(self, location, numOfFold):
        """(Re)load the CSV at ``location`` and reset all cross-validation state.

        Args:
            location: path of a CSV file containing a ``Label`` column.
            numOfFold: number of folds for ``KFold``.
        """
        self.fold = numOfFold
        # shuffle/random_state are keyword-only in current scikit-learn releases;
        # passing them positionally (KFold(n, True, 1)) raises a TypeError there.
        self.kFold = KFold(n_splits=numOfFold, shuffle=True, random_state=1)
        self.avg_accuracy = []
        self.data = pd.read_csv(location)
        # Impute missing values with the column mean. numeric_only=True keeps this
        # working on pandas >= 2, where mean() raises on non-numeric columns.
        self.data = self.data.fillna(self.data.mean(numeric_only=True))
        self.X = self.data.drop('Label', axis=1)
        self.Y = self.data['Label']
        print('X shape:', str(self.X.shape))
        print('Y shape:', str(self.Y.shape))

    def removeContantFeature(self):
        """Drop zero-variance features.

        Reads ``X_train``/``X_test`` and writes ``X_train_filter``/``X_test_filter``.
        The filter is fitted on the training fold only and then applied to both.
        """
        constant_filter = VarianceThreshold(threshold=0)
        constant_filter.fit(self.X_train)
        self.X_train_filter = constant_filter.transform(self.X_train)
        self.X_test_filter = constant_filter.transform(self.X_test)
        print('Shape of the dataset after removal of constant features')
        print(self.X_train_filter.shape, self.X_test_filter.shape, self.X_train.shape, '\n')

    def removeQuasiConstant(self):
        """Drop features whose training-fold variance is at most 0.01.

        Reads ``X_train_filter``/``X_test_filter`` and writes
        ``X_train_quasi_filter``/``X_test_quasi_filter``.
        """
        quasi_constant_filter = VarianceThreshold(threshold=0.01)
        quasi_constant_filter.fit(self.X_train_filter)
        self.X_train_quasi_filter = quasi_constant_filter.transform(self.X_train_filter)
        self.X_test_quasi_filter = quasi_constant_filter.transform(self.X_test_filter)
        print('Shape of the dataset after removal of quasi constant features')
        print(self.X_train_quasi_filter.shape, self.X_test_quasi_filter.shape, self.X_train.shape, '\n')

    def removeDuplicateFeature(self):
        """Drop feature columns that duplicate an earlier column in the training fold.

        Reads ``X_train_quasi_filter``/``X_test_quasi_filter`` and writes the
        DataFrames ``X_train_unique``/``X_test_unique``. Duplicates are detected
        on the training data only; the same columns are removed from the test data.
        """
        # Transpose so that every feature becomes a row; duplicated() then marks
        # every feature that repeats an earlier one.
        X_train_T = pd.DataFrame(self.X_train_quasi_filter.T)
        X_test_T = pd.DataFrame(self.X_test_quasi_filter.T)
        # Positional boolean mask, so it applies to the test frame without
        # relying on index alignment.
        features_to_keep = (~X_train_T.duplicated()).to_numpy()
        self.X_train_unique = X_train_T.loc[features_to_keep].T
        self.X_test_unique = X_test_T.loc[features_to_keep].T
        print('Shape of the dataset after removal of duplicate features')
        print(self.X_train_unique.shape, self.X_test_unique.shape, self.X_train.shape, '\n')

    def _crossValidate(self, makeClassifier, scale=False, verbose=False):
        """Run one full K-fold evaluation and return the mean accuracy in percent.

        Args:
            makeClassifier: zero-argument callable returning a fresh, unfitted
                estimator exposing ``fit`` and ``predict``.
            scale: standardise the filtered features before fitting.
            verbose: print the accuracy of every fold as it completes.

        Side effects: overwrites the per-fold attributes (``X_train``, ``X_test``,
        ``y_train``, ``y_test``, the filtered matrices, ``y_pred``) and replaces
        ``avg_accuracy`` with the accuracies of this run.
        """
        # Start from an empty list: otherwise a second run on the same instance
        # would append to the previous run's folds and skew the reported average.
        self.avg_accuracy = []
        for count, (train_index, test_index) in enumerate(self.kFold.split(self.data), start=1):
            self.X_train, self.X_test = self.X.iloc[train_index], self.X.iloc[test_index]
            self.y_train, self.y_test = self.Y.iloc[train_index], self.Y.iloc[test_index]
            self.removeContantFeature()
            self.removeQuasiConstant()
            self.removeDuplicateFeature()

            X_fit, X_eval = self.X_train_unique, self.X_test_unique
            if scale:
                # Fit the scaler on the training fold only and reuse its statistics
                # for the test fold; re-fitting on the test fold would scale the two
                # sets inconsistently and leak test information.
                scaler = StandardScaler()
                X_fit = scaler.fit_transform(X_fit)
                X_eval = scaler.transform(X_eval)

            clf = makeClassifier()
            clf.fit(X_fit, self.y_train)
            self.y_pred = clf.predict(X_eval)
            accuracy = accuracy_score(self.y_test, self.y_pred) * 100
            if verbose:
                print('Accuracy of fold ', str(count), ': ', accuracy)
            self.avg_accuracy.append(accuracy)

        accDF = pd.DataFrame(self.avg_accuracy, columns=['Accuracy per fold'], index=None)
        print(accDF)
        return sum(self.avg_accuracy) / len(self.avg_accuracy)

    def runRandomForest(self):
        """Cross-validate a 100-tree random forest and print the accuracies."""
        average = self._crossValidate(
            lambda: RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
        )
        print('Average accuracy of Random forest ', average)

    def runSVM(self, kernelTrick):
        """Cross-validate an SVM (``C=1``) on standardised features.

        Args:
            kernelTrick: value passed to ``SVC`` as ``kernel``.
        """
        average = self._crossValidate(
            lambda: SVC(kernel=kernelTrick, C=1),
            scale=True,
            verbose=True,
        )
        print('Average accuracy of SVM with', kernelTrick, ' : ', average)

    def runDecisionTree(self, Criterion):
        """Cross-validate a depth-limited decision tree.

        Args:
            Criterion: value passed to ``DecisionTreeClassifier`` as ``criterion``.
        """
        average = self._crossValidate(
            lambda: DecisionTreeClassifier(criterion=Criterion, random_state=100,
                                           max_depth=30, min_samples_leaf=5)
        )
        print('Average accuracy of Decision Tree with ', Criterion, ' as criterion: ', average)

    def showData(self):
        """Return the first rows of the loaded (mean-imputed) data frame."""
        return self.data.head()

In [10]:
# Machine-specific absolute path of the first-level feature CSV; adjust when running elsewhere.
location = r'/home/mirsahib/Desktop/Project-Andromeda/Dataset/1st_Level_Feature_Extracted/1st_Level_Dataset.csv'
# Build the model with 5 folds; the constructor reads the CSV and prints the X / Y shapes.
FilterModel = Model(location=location, numOfFold=5)

X shape: (2835, 12)
Y shape: (2835,)


In [7]:
# Cross-validate the random forest; prints the filtered shapes for every fold, then the accuracy table.
FilterModel.runRandomForest()

Shape of the dataset after removal of constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of quasi constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of duplicate features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of quasi constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of duplicate features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of quasi constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of duplicate features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of quasi constant features
(2268, 12) (567, 

In [9]:
# Cross-validate the decision tree with 'gini' as the split criterion.
FilterModel.runDecisionTree('gini')

Shape of the dataset after removal of constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of quasi constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of duplicate features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of quasi constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of duplicate features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of quasi constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of duplicate features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of quasi constant features
(2268, 12) (567, 

In [11]:
# Cross-validate the decision tree with 'entropy' as the split criterion.
FilterModel.runDecisionTree('entropy')

Shape of the dataset after removal of constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of quasi constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of duplicate features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of quasi constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of duplicate features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of quasi constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of duplicate features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of constant features
(2268, 12) (567, 12) (2268, 12) 

Shape of the dataset after removal of quasi constant features
(2268, 12) (567, 