In [37]:
'''
Authors: Balachander S, Prahalad Srinivas C G, Yogesh Chandra Singh Samant, B Varshin Hariharan
'''
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
import pandas as pd

#import scikit learn packages
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

class FeatureClassifier:
  """Weighted-vote ensemble made of one small model per input feature.

  ``fit`` scores every feature with the chi-squared statistic, trains one
  estimator per single feature column, and keeps the features whose score is
  not below ``reqAcc``. ``predict`` lets every kept estimator vote for a
  label; each vote is weighted by the feature's chi-squared score plus the
  bias of the label it voted for, and the label with the largest total wins.

  Parameters
  ----------
  reqAcc : float
      Cut-off compared against the chi-squared score of each feature (the
      comparison in ``finIndex`` is ``score < reqAcc``).
  classifier : str
      One of the keys of ``self.dic``: 'DecisionTree', 'LinearRegression',
      'SVM' or 'LogisticRegression'. The historical misspelling
      'DesicionTree' is accepted as an alias for 'DecisionTree'.
  bias : sequence of float, optional
      One additive bias per label (in sorted label order). Empty/None means
      no bias.
  control : optional
      Forwarded as ``max_depth`` (decision tree), ``gamma`` (SVM) or
      ``penalty`` (logistic regression).
  n_jobs : int, optional
      Forwarded to ``LinearRegression``.
  random_state : int, optional
      Forwarded to the train/test split and to the estimators that accept it.

  Notes
  -----
  The chi-squared scorer only accepts non-negative feature values.
  """

  # Misspelled name that used to be the constructor default; still accepted.
  _CLASSIFIER_ALIASES={'DesicionTree':'DecisionTree'}

  def __init__(self,reqAcc=0.01,classifier='DecisionTree',bias=None,control=None,n_jobs=None,random_state=None):
    self.featureClassifiers=[] #list of (estimator, chi2 score, feature index) for the selected features
    self.reqAcc=reqAcc #user specified cutoff value
    self.indexLs=[] # feature indices, aligned position-by-position with featureClassifiers
    self.flag=0 # 1 when every feature passed the cutoff, 0 otherwise
    self.bias=[] if bias is None else bias # per-label biases (a fresh list per instance, never a shared default)
    self.control=control #overfitting control forwarded to the chosen estimator
    self.classifier=classifier #the classifier which is preferred by the user
    self.dic={'DecisionTree':0,'LinearRegression':1,'SVM':2,'LogisticRegression':3} #a dictionary which maps the classifier to its index
    self.n_jobs=n_jobs
    self.random_state=random_state
    self.num_lables = None # number of distinct labels, set by fit
    self.classes_ = None # sorted distinct labels seen by fit

  def _make_classifier(self):
    """Build a fresh, unfitted estimator of the kind named by ``self.classifier``.

    Raises
    ------
    KeyError
        If ``self.classifier`` is not a known classifier name.
    """
    name=self._CLASSIFIER_ALIASES.get(self.classifier,self.classifier)
    if name not in self.dic:
      raise KeyError('Unknown classifier %r; expected one of %s'%(self.classifier,sorted(self.dic)))
    kind=self.dic[name]
    if kind==0:
      return DecisionTreeClassifier(max_depth=self.control,random_state=self.random_state)
    if kind==1:
      return LinearRegression(n_jobs=self.n_jobs)
    if kind==2:
      # gamma=None is not a valid SVC setting, so fall back to the library
      # default when no control value was supplied.
      svc_kwargs={} if self.control is None else {'gamma':self.control}
      return SVC(random_state=self.random_state,**svc_kwargs)
    return LogisticRegression(penalty=self.control,random_state=self.random_state)

  def finIndex(self):
    """Find the first entry whose score is below ``reqAcc`` and rebuild ``indexLs``.

    ``featureClassifiers`` is expected to be sorted by descending score.
    ``indexLs`` is rebuilt from scratch with the feature index of every entry
    before the cut-off, so repeated calls do not accumulate duplicates.

    Returns
    -------
    int
        Position of the first failing entry (``flag`` is set to 0). When no
        entry fails, ``flag`` is set to 1 and the last valid position is
        returned (0 for an empty list).
    """
    self.indexLs=[]
    self.flag=0
    for i,entry in enumerate(self.featureClassifiers):
      if entry[1] < self.reqAcc:
        return i
      self.indexLs.append(entry[2])
    self.flag=1
    return max(len(self.featureClassifiers)-1,0)

  def fit(self,x,y):
    """Train one estimator per feature column and keep the best-scoring ones.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)
        Non-negative feature matrix (required by the chi-squared scorer).
    y : array-like of shape (n_samples,) or (n_samples, 1)
        Labels. They are encoded internally to 0..n_labels-1 so that any
        sortable label values can be used.

    Returns
    -------
    None
    """
    x=np.asarray(x)
    y=np.asarray(y).ravel()
    # Encode the labels so that estimator outputs can be used as vote indices.
    self.classes_,y_encoded=np.unique(y,return_inverse=True)
    self.num_lables=len(self.classes_)
    # Start from a clean slate so that refitting does not duplicate entries.
    self.featureClassifiers=[]
    self.indexLs=[]
    self.flag=0

    bestfeatures = SelectKBest(score_func=chi2,k=1)
    scores = bestfeatures.fit(x,y_encoded).scores_

    # The split does not depend on the feature, so it is done once; every
    # per-feature estimator is trained on the same 67% of the rows.
    X_train, _, y_train, _ = train_test_split(x, y_encoded, test_size=0.33,random_state=self.random_state)
    for i in range(x.shape[1]):
      clf=self._make_classifier()
      clf.fit(X_train[:,i:i+1],y_train)
      self.featureClassifiers.append((clf,scores[i],i))
    self.featureClassifiers.sort(key=lambda item:item[1],reverse=True)
    index=self.finIndex()
    if self.flag==0:
      self.featureClassifiers=self.featureClassifiers[:index]
    return

  def predict(self,x):
    """Predict a label for every row of ``x`` by weighted voting.

    Parameters
    ----------
    x : array-like of shape (n_samples, n_features)

    Returns
    -------
    list
        One predicted label per row (an empty list for empty input).

    Raises
    ------
    AttributeError
        If the model has not been fitted, or if ``bias`` has fewer entries
        than there are labels.
    """
    if self.num_lables is None:
      raise AttributeError('FeatureClassifier must be fitted before calling predict')
    n_labels=self.num_lables
    # Resolve the bias locally: self.bias is left untouched so that predict
    # can be called repeatedly (truth-testing an ndarray would raise).
    if self.bias is None or len(self.bias)==0:
      bias=np.zeros(n_labels)
    else:
      bias=np.asarray(self.bias,dtype=float)
    if len(bias)<n_labels:
      raise AttributeError('Please check the lenth of bias list')
    if len(x)==0:
      return []

    x=np.asarray(x)
    rows=np.arange(x.shape[0])
    votes=np.zeros((x.shape[0],n_labels))
    for (model,score,_),col in zip(self.featureClassifiers,self.indexLs):
      # One batched call per feature instead of one call per sample.
      raw=np.asarray(model.predict(x[:,col:col+1]),dtype=float).ravel()
      # Regressors return floats that may fall outside the label range:
      # round to the nearest label index and clip into 0..n_labels-1.
      labels=np.clip(np.round(raw).astype(int),0,n_labels-1)
      votes[rows,labels]+=score+bias[labels]
    winners=np.argmax(votes,axis=1)
    classes=getattr(self,'classes_',None)
    if classes is None:
      return list(winners)
    # Map the winning indices back to the original label values.
    return list(np.asarray(classes)[winners])


In [39]:
# Fabi's addition: read the numeric table from disk
# (np.loadtxt splits on whitespace by default)
data = np.loadtxt("uniform_large_d_1.tex")

# Copy the loaded values into a separate NumPy array
array = np.array(data)

# Wrap the array in a DataFrame; rows and columns get default integer labels
df_table = pd.DataFrame(data=array)

# Print the (truncated) text rendering of the table
print(df_table)

          0         1         2         3         4         5         6    \
0    2.014037  2.842330  2.093059  2.314322  2.550290  2.556514  2.063987   
1    2.655125  2.439494  2.387897  2.414520  2.677007  2.066587  2.221681   
2    2.397686  2.129261  2.228847  2.574741  2.672454  2.330393  2.379493   
3    0.021023  0.884131  0.570157  0.950007  0.570792  0.741419  0.251829   
4    0.087550  0.596086  0.355909  0.447322  0.680048  0.198563  0.192330   
..        ...       ...       ...       ...       ...       ...       ...   
495  0.611156  0.236036  0.896368  0.773777  0.538057  0.402998  0.090796   
496  2.761173  2.080949  2.939479  2.325925  2.977614  2.109083  2.517269   
497  0.401104  0.340544  0.555580  0.230778  0.600226  0.992868  0.274078   
498  0.248207  0.096274  0.516660  0.946114  0.271408  0.845261  0.546188   
499  2.647101  2.363681  2.077603  2.632778  2.676110  2.920187  2.866320   

          7         8         9    ...       141       142       143  \
0  

In [41]:
#import scikit-learn metrics module for accuracy calculation
from sklearn import metrics

#import and read data
#from sklearn.datasets import load_iris
#iris = datasets.load_iris()

# Split the table into training and test partitions.
#   test_size=0.2   -> 80% of the rows for training, 20% for testing
#   random_state=52 -> fixed seed so the shuffled split is reproducible
# NOTE(review): the features are columns 1..149 and the target is the last
# column. If the table has exactly 150 columns, the target is also part of the
# feature slice (label leakage), and column 0 is skipped -- confirm the layout.
X_train, X_test, y_train, y_test = train_test_split(
    df_table.iloc[:, 1:150],
    df_table.iloc[:, -1],
    test_size=0.2,
    random_state=52,
)


# Save the data (example: saving as CSV files)
#X_test.to_csv('X_test.csv', index=False)
#y_test.to_csv('y_test.csv', index=False)


In [43]:
# Fit the per-feature ensemble on the training partition: depth-3 decision
# trees, score cut-off of 0. Labels are cast to int and passed as a column vector.
clf1 = FeatureClassifier(reqAcc=0, classifier='DecisionTree', control=3)
clf1.fit(np.array(X_train), np.array(y_train).astype(int).reshape(-1, 1))

# Score the held-out partition: fraction of test rows predicted correctly
print(
    "Accuracy:",
    metrics.accuracy_score(np.array(y_test).astype(int), clf1.predict(np.array(X_test))),
)

# Feature indices that were kept, followed by the (estimator, score, index) triples
print(clf1.indexLs)
clf1.featureClassifiers

Accuracy: 1.0
[84, 80, 10, 14, 2, 42, 54, 146, 136, 82, 28, 58, 123, 87, 92, 107, 140, 41, 72, 132, 51, 111, 148, 135, 27, 26, 47, 94, 108, 90, 109, 115, 55, 9, 0, 116, 137, 142, 19, 12, 128, 43, 104, 89, 110, 22, 112, 118, 6, 131, 74, 60, 88, 102, 15, 37, 130, 79, 124, 48, 114, 120, 117, 70, 17, 129, 18, 101, 103, 31, 119, 78, 44, 3, 93, 53, 138, 76, 67, 33, 125, 35, 61, 65, 106, 5, 97, 100, 20, 64, 36, 8, 143, 40, 49, 66, 71, 99, 121, 29, 105, 39, 59, 45, 77, 23, 95, 7, 50, 144, 134, 63, 122, 52, 145, 91, 147, 68, 75, 86, 85, 133, 141, 56, 32, 1, 38, 96, 4, 24, 57, 34, 62, 30, 25, 21, 16, 11, 69, 139, 83, 126, 127, 98, 73, 113, 81, 13, 46]


[(DecisionTreeClassifier(max_depth=3), 291.0553717510584, 84),
 (DecisionTreeClassifier(max_depth=3), 282.4690133841282, 80),
 (DecisionTreeClassifier(max_depth=3), 282.28267184781237, 10),
 (DecisionTreeClassifier(max_depth=3), 280.3898606667616, 14),
 (DecisionTreeClassifier(max_depth=3), 278.773539630527, 2),
 (DecisionTreeClassifier(max_depth=3), 276.9569014542024, 42),
 (DecisionTreeClassifier(max_depth=3), 276.7148441422138, 54),
 (DecisionTreeClassifier(max_depth=3), 275.88500606823686, 146),
 (DecisionTreeClassifier(max_depth=3), 275.84830358956737, 136),
 (DecisionTreeClassifier(max_depth=3), 275.2023106378025, 82),
 (DecisionTreeClassifier(max_depth=3), 275.2018896623747, 28),
 (DecisionTreeClassifier(max_depth=3), 274.99522313121605, 58),
 (DecisionTreeClassifier(max_depth=3), 274.9165068096032, 123),
 (DecisionTreeClassifier(max_depth=3), 274.28604610699693, 87),
 (DecisionTreeClassifier(max_depth=3), 274.1001039958229, 92),
 (DecisionTreeClassifier(max_depth=3), 273.796715