In [13]:
'''
Authors: Balachander S, Prahalad Srinivas C G, Yogesh Chandra Singh Samant, B Varshin Hariharan
'''
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
import pandas as pd

#import scikit learn packages
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

class FeatureClassifier:
  def __init__(self,reqAcc=0.01,classifier='DesicionTree',bias=[],control=None,n_jobs=None,random_state=None):
    self.featureClassifiers=[] #list of all the classifiers of all the selected features
    self.reqAcc=reqAcc #user specified cutoff value
    self.indexLs=[] # list of mapped index values to featureClassifiers
    self.flag=0
    self.bias=bias # list of biases for each and every label
    self.control=control #overfitting control for decision trees
    self.classifier=classifier #the classifier which is preferred by the user
    self.dic={'DecisionTree':0,'LinearRegression':1,'SVM':2,'LogisticRegression':3} #a dictionary which maps the classifier to its index
    self.n_jobs=n_jobs
    self.random_state=random_state
    self.num_lables = None

  def finIndex(self):
    #finds the index where the reqAcc condition fails and also created the indexLs[] for mapping
    for i in range(len(self.featureClassifiers)):
      if self.featureClassifiers[i][1] < self.reqAcc:
        return i
      self.indexLs.append(self.featureClassifiers[i][2])
    self.flag=1
    return i

  def fit(self,x,y):
    #applied the model to the dataset. The model is trained and saved for further prediction
    self.num_lables=len(set(y.flatten()))
    bestfeatures = SelectKBest(score_func=chi2,k=1)
    fit = bestfeatures.fit(x,y)
    print(len(x[0]))
    print(x[0])
    print(x)
    for i in range(len(x[0])):
      clf=[DecisionTreeClassifier(max_depth=self.control,random_state=self.random_state),LinearRegression(n_jobs=self.n_jobs),SVC(gamma=self.control,random_state=self.random_state), LogisticRegression(penalty=self.control,random_state=self.random_state)][self.dic[self.classifier]]
      X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33,random_state=self.random_state)
      clf.fit(X_train[:,i:i+1],y_train)
      self.featureClassifiers.append((clf,fit.scores_[i],i))
    self.featureClassifiers.sort(key=lambda x:x[1],reverse=True)
    index=self.finIndex()
    if self.flag==0:
      self.featureClassifiers=self.featureClassifiers[:index]
    return

  def predict(self,x):
    #given a list of inputs, predicts the possible outputs
    if not self.bias:
      self.bias=np.zeros(self.num_lables)
    if len(self.bias)<self.num_lables:
      raise AttributeError('Please check the lenth of bias list')
    yPred=[]
    for i in range(len(x)):
      pred_arr=np.zeros(self.num_lables)
      for j in range(len(self.indexLs)):
        pred=np.round(self.featureClassifiers[j][0].predict([[x[i][self.indexLs[j]]]]))
        pred_arr[pred]+=self.featureClassifiers[j][1]+self.bias[pred[0]]
      yPred.append(np.argmax(pred_arr))
    return yPred

In [14]:
#import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

#import and read data
from sklearn.datasets import load_iris
iris = datasets.load_iris()

#split dataset into training set and test set
#test_size: in this case it is 70% training and 30% testing
#random_state: sets a seed for a random number generator that splits the data
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.3, random_state=109)

#train the model using the training sets
clf1=FeatureClassifier(0,classifier='DecisionTree',control=3)
clf1.fit(X_train,y_train.reshape(-1,1))

#predict the response for the test dataset
#model accuracy (how often the classifier is correct)
print("Accuracy:",metrics.accuracy_score(y_test,clf1.predict(X_test)))

print(clf1.indexLs)
clf1.featureClassifiers

4
[6.5 3.  5.8 2.2]
[[6.5 3.  5.8 2.2]
 [7.6 3.  6.6 2.1]
 [5.9 3.  4.2 1.5]
 [5.1 3.8 1.6 0.2]
 [6.3 3.3 6.  2.5]
 [4.6 3.6 1.  0.2]
 [7.3 2.9 6.3 1.8]
 [5.7 3.8 1.7 0.3]
 [5.2 2.7 3.9 1.4]
 [6.3 2.5 5.  1.9]
 [6.  2.2 5.  1.5]
 [4.8 3.1 1.6 0.2]
 [6.7 3.3 5.7 2.1]
 [5.5 2.4 3.8 1.1]
 [4.3 3.  1.1 0.1]
 [4.6 3.1 1.5 0.2]
 [6.2 2.2 4.5 1.5]
 [7.2 3.2 6.  1.8]
 [6.9 3.1 5.4 2.1]
 [5.8 2.8 5.1 2.4]
 [5.6 2.7 4.2 1.3]
 [5.4 3.  4.5 1.5]
 [5.8 2.7 4.1 1. ]
 [5.  2.3 3.3 1. ]
 [5.7 2.9 4.2 1.3]
 [6.4 2.8 5.6 2.2]
 [6.2 2.9 4.3 1.3]
 [5.2 4.1 1.5 0.1]
 [5.1 2.5 3.  1.1]
 [5.4 3.7 1.5 0.2]
 [6.8 2.8 4.8 1.4]
 [7.9 3.8 6.4 2. ]
 [4.9 2.4 3.3 1. ]
 [6.4 3.1 5.5 1.8]
 [5.4 3.9 1.7 0.4]
 [6.  3.4 4.5 1.6]
 [5.1 3.4 1.5 0.2]
 [6.7 3.  5.  1.7]
 [6.5 3.2 5.1 2. ]
 [6.  2.9 4.5 1.5]
 [6.1 3.  4.9 1.8]
 [6.8 3.  5.5 2.1]
 [5.1 3.5 1.4 0.3]
 [5.1 3.5 1.4 0.2]
 [5.7 2.5 5.  2. ]
 [6.4 2.7 5.3 1.9]
 [4.8 3.4 1.9 0.2]
 [5.  3.5 1.6 0.6]
 [5.  3.  1.6 0.2]
 [6.1 2.8 4.  1.3]
 [5.  3.3 1.4 0.2]
 [6.3 2.9 5

[(DecisionTreeClassifier(max_depth=3), 89.95458378677623, 2),
 (DecisionTreeClassifier(max_depth=3), 50.45132845021668, 3),
 (DecisionTreeClassifier(max_depth=3), 9.576129321593516, 0),
 (DecisionTreeClassifier(max_depth=3), 2.686082466827127, 1)]

In [6]:
print(FeatureClassifier(0,classifier='DecisionTree',control=3))

<__main__.FeatureClassifier object at 0x00000207F222D7D0>
