In [8]:
'''
Authors: Balachander S, Prahalad Srinivas C G, Yogesh Chandra Singh Samant, B Varshin Hariharan
'''
import numpy as np
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from sklearn import datasets
import pandas as pd

#import scikit learn packages
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

class FeatureClassifier:
  def __init__(self,reqAcc=0.01,classifier='DesicionTree',bias=[],control=None,n_jobs=None,random_state=None):
    self.featureClassifiers=[] #list of all the classifiers of all the selected features
    self.reqAcc=reqAcc #user specified cutoff value
    self.indexLs=[] # list of mapped index values to featureClassifiers
    self.flag=0
    self.bias=bias # list of biases for each and every label
    self.control=control #overfitting control for decision trees
    self.classifier=classifier #the classifier which is preferred by the user
    self.dic={'DecisionTree':0,'LinearRegression':1,'SVM':2,'LogisticRegression':3} #a dictionary which maps the classifier to its index
    self.n_jobs=n_jobs
    self.random_state=random_state
    self.num_lables = None

  def finIndex(self):
    #finds the index where the reqAcc condition fails and also created the indexLs[] for mapping
    for i in range(len(self.featureClassifiers)):
      if self.featureClassifiers[i][1] < self.reqAcc:
        return i
      self.indexLs.append(self.featureClassifiers[i][2])
    self.flag=1
    return i

  def fit(self,x,y):
    #applied the model to the dataset. The model is trained and saved for further prediction
    self.num_lables=len(set(y.flatten()))
    bestfeatures = SelectKBest(score_func=chi2,k=1)
    fit = bestfeatures.fit(x,y)

    for i in range(len(x[0])):
      clf=[DecisionTreeClassifier(max_depth=self.control,random_state=self.random_state),LinearRegression(n_jobs=self.n_jobs),SVC(gamma=self.control,random_state=self.random_state), LogisticRegression(penalty=self.control,random_state=self.random_state)][self.dic[self.classifier]]
      X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33,random_state=self.random_state)
      clf.fit(X_train[:,i:i+1],y_train)
      self.featureClassifiers.append((clf,fit.scores_[i],i))
    self.featureClassifiers.sort(key=lambda x:x[1],reverse=True)
    index=self.finIndex()
    if self.flag==0:
      self.featureClassifiers=self.featureClassifiers[:index]
    return

  def predict(self,x):
    #given a list of inputs, predicts the possible outputs
    if not self.bias:
      self.bias=np.zeros(self.num_lables)
    if len(self.bias)<self.num_lables:
      raise AttributeError('Please check the lenth of bias list')
    yPred=[]
    for i in range(len(x)):
      pred_arr=np.zeros(self.num_lables)
      for j in range(len(self.indexLs)):
        pred=np.round(self.featureClassifiers[j][0].predict([[x[i][self.indexLs[j]]]]))
        pred_arr[pred]+=self.featureClassifiers[j][1]+self.bias[pred[0]]
      yPred.append(np.argmax(pred_arr))
    return yPred


In [10]:
#This is what I implemented --Fabi
#load dataset
data = np.loadtxt("uniform_large_d_1.tex")

# Creating NumPy array
array = np.array(data)

# Converting to Pandas DataFrame
df_table = pd.DataFrame(array)

# Displaying the table
#print(df_table)

In [12]:
# From the dataset, change 25 columns to 'categorical'
#Loop, converts floats to ints and then those ints to category
for i in range(26):
    df_table.iloc[:,i] = df_table.iloc[:,i].round()
    df_table.iloc[:,i] = df_table.iloc[:,i].astype(int)
    df_table.iloc[:,i] = df_table.iloc[:,i].astype("category")

df_table.iloc[:, 150] = df_table.iloc[:, 150].astype("category")

df_table.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,2.0,3.0,2.0,2.0,3.0,3.0,2.0,3.0,2.0,3.0,...,2.646452,2.997843,2.54626,2.423437,2.825879,2.566278,2.403595,2.396183,2.684211,1.0
1,3.0,2.0,2.0,2.0,3.0,2.0,2.0,3.0,3.0,3.0,...,2.024768,2.424598,2.349128,2.325874,2.524994,2.862275,2.060383,2.505475,2.334364,1.0
2,2.0,2.0,2.0,3.0,3.0,2.0,2.0,2.0,2.0,3.0,...,2.625961,2.962051,2.420763,2.411974,2.361735,2.667143,2.073825,2.388143,2.831569,1.0
3,0.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,...,0.864361,0.083862,0.616211,0.898456,0.117597,0.664931,0.813385,0.573604,0.117329,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.455617,0.51657,0.956458,0.97284,0.550108,0.503513,0.337278,0.735706,0.284006,0.0


In [14]:
#split dataset into training set and test set
#test_size: in this case it is 70% training and 30% testing
#random_state: sets a seed for a random number generator that splits the data
X_train, X_test, y_train, y_test = train_test_split(df_table.iloc[:,0:150], df_table.iloc[:,-1], test_size=0.2, random_state=52)

In [16]:
#train the model using the training sets
clf1=FeatureClassifier(0,classifier='DecisionTree',control=3)
# clf1.fit(X_train,y_train.reshape(-1,1))
clf1.fit(np.array(X_train), np.array(y_train)[:,np.newaxis].astype(int))

#predict the response for the test dataset
#model accuracy (how often the classifier is correct)
print("Accuracy:",metrics.accuracy_score(np.array(y_test).astype(int),clf1.predict(np.array(X_test))))

print(clf1.indexLs)
clf1.featureClassifiers

Accuracy: 1.0
[11, 85, 15, 81, 3, 23, 13, 43, 55, 1, 147, 137, 83, 29, 59, 124, 88, 93, 108, 0, 141, 42, 73, 133, 52, 112, 149, 136, 28, 27, 48, 95, 109, 91, 110, 116, 56, 117, 20, 138, 143, 7, 129, 44, 105, 90, 111, 113, 119, 132, 75, 19, 61, 89, 103, 38, 131, 80, 125, 49, 115, 121, 6, 118, 71, 130, 102, 104, 32, 120, 79, 45, 16, 94, 54, 139, 77, 68, 34, 126, 25, 36, 62, 66, 107, 98, 101, 65, 37, 144, 41, 50, 4, 9, 21, 67, 72, 100, 122, 30, 106, 18, 40, 60, 46, 78, 10, 96, 51, 145, 135, 64, 123, 53, 146, 92, 148, 69, 76, 87, 86, 134, 142, 57, 33, 39, 97, 58, 24, 35, 63, 31, 26, 70, 140, 84, 127, 128, 99, 74, 2, 114, 82, 22, 12, 5, 8, 47, 17, 14]


[(DecisionTreeClassifier(max_depth=3), 294.14355321629307, 11),
 (DecisionTreeClassifier(max_depth=3), 291.0553717510584, 85),
 (DecisionTreeClassifier(max_depth=3), 286.7686810697876, 15),
 (DecisionTreeClassifier(max_depth=3), 282.4690133841282, 81),
 (DecisionTreeClassifier(max_depth=3), 281.41787370587656, 3),
 (DecisionTreeClassifier(max_depth=3), 280.3528898344379, 23),
 (DecisionTreeClassifier(max_depth=3), 277.53767152259604, 13),
 (DecisionTreeClassifier(max_depth=3), 276.9569014542024, 43),
 (DecisionTreeClassifier(max_depth=3), 276.7148441422138, 55),
 (DecisionTreeClassifier(max_depth=3), 276.58381881593834, 1),
 (DecisionTreeClassifier(max_depth=3), 275.88500606823686, 147),
 (DecisionTreeClassifier(max_depth=3), 275.84830358956737, 137),
 (DecisionTreeClassifier(max_depth=3), 275.2023106378025, 83),
 (DecisionTreeClassifier(max_depth=3), 275.2018896623747, 29),
 (DecisionTreeClassifier(max_depth=3), 274.99522313121605, 59),
 (DecisionTreeClassifier(max_depth=3), 274.91650