# FIC


In [19]:
'''
Authors: Balachander S, Prahalad Srinivas C G, Yogesh Chandra Singh Samant, B Varshin Hariharan
'''
import numpy as np
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from sklearn import datasets
import pandas as pd

#import scikit learn packages
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

class FeatureClassifier:
  '''
  Ensemble of single-feature classifiers combined by weighted voting.

  fit() trains one estimator per input column and scores every column with
  the chi-squared statistic (SelectKBest + chi2). Columns whose score is
  below reqAcc are discarded. predict() lets every retained estimator vote
  for a label; each vote is weighted by the column's chi-squared score plus
  the optional per-label bias, and the label with the highest total wins.

  Labels are used directly as array positions, so they are expected to be
  the integers 0 .. num_lables-1.

  Parameters
  ----------
  reqAcc : minimum chi-squared score a feature needs to be kept.
  classifier : one of 'DecisionTree', 'LinearRegression', 'SVM',
      'LogisticRegression' (the historic misspelling 'DesicionTree' is
      accepted as an alias of 'DecisionTree').
  bias : optional sequence with one additive bias per label.
  control : forwarded as max_depth (DecisionTree), gamma (SVM) or
      penalty (LogisticRegression).
  n_jobs : forwarded to LinearRegression.
  random_state : forwarded to the estimators and to train_test_split.
  '''
  def __init__(self,reqAcc=0.01,classifier='DecisionTree',bias=None,control=None,n_jobs=None,random_state=None):
    #the previous default 'DesicionTree' was not a key of self.dic and made fit() raise KeyError
    if classifier=='DesicionTree':
      classifier='DecisionTree'
    self.featureClassifiers=[] #list of (classifier, chi2 score, column index) for the selected features
    self.reqAcc=reqAcc #user specified cutoff value
    self.indexLs=[] # list of mapped index values to featureClassifiers
    self.flag=0 #set to 1 by finIndex() when every feature passed the cutoff
    self.bias=[] if bias is None else bias # list of biases for each and every label (fresh list per instance)
    self.control=control #overfitting control for decision trees
    self.classifier=classifier #the classifier which is preferred by the user
    self.dic={'DecisionTree':0,'LinearRegression':1,'SVM':2,'LogisticRegression':3} #a dictionary which maps the classifier to its index
    self.n_jobs=n_jobs
    self.random_state=random_state
    self.num_lables = None #number of distinct labels, set by fit()

  def _make_classifier(self):
    #builds one fresh, unfitted estimator of the kind selected by self.classifier
    kind=self.dic[self.classifier] #raises KeyError for an unknown classifier name
    if kind==0:
      return DecisionTreeClassifier(max_depth=self.control,random_state=self.random_state)
    if kind==1:
      return LinearRegression(n_jobs=self.n_jobs)
    if kind==2:
      #SVC does not accept gamma=None, so fall back to its documented default 'scale'
      gamma='scale' if self.control is None else self.control
      return SVC(gamma=gamma,random_state=self.random_state)
    return LogisticRegression(penalty=self.control,random_state=self.random_state)

  def finIndex(self):
    '''
    Rebuilds indexLs from the (already sorted) featureClassifiers and returns
    the position of the first entry whose score is below reqAcc.

    When every entry passes, flag is set to 1 and the last valid position is
    returned (0 for an empty list); callers only use the return value when
    flag is 0.
    '''
    self.indexLs=[] #rebuilt from scratch so repeated calls do not duplicate entries
    self.flag=0
    for i,entry in enumerate(self.featureClassifiers):
      if entry[1] < self.reqAcc:
        return i
      self.indexLs.append(entry[2])
    self.flag=1
    return max(len(self.featureClassifiers)-1,0)

  def fit(self,x,y):
    '''
    Trains one estimator per column of x and keeps those whose chi-squared
    score against y reaches reqAcc.

    x : 2-D array-like of non-negative features (chi2 requirement).
    y : array-like of integer labels.
    Returns None; the fitted state is stored on the instance.
    '''
    x=np.asarray(x)
    y=np.asarray(y)
    #drop the state of any previous fit so refitting does not accumulate stale estimators
    self.featureClassifiers=[]
    self.indexLs=[]
    self.flag=0
    self.num_lables=len(set(y.flatten()))
    bestfeatures = SelectKBest(score_func=chi2,k=1)
    scorer = bestfeatures.fit(x,y)

    #the split does not depend on the column, so it is computed once; with a fixed
    #random_state this is the same split the per-column call produced
    X_train, _, y_train, _ = train_test_split(x, y, test_size=0.33,random_state=self.random_state)
    for i in range(x.shape[1]):
      clf=self._make_classifier()
      clf.fit(X_train[:,i:i+1],y_train)
      self.featureClassifiers.append((clf,scorer.scores_[i],i))
    #best scoring features first, so finIndex() can cut the tail
    self.featureClassifiers.sort(key=lambda entry:entry[1],reverse=True)
    index=self.finIndex()
    if self.flag==0:
      self.featureClassifiers=self.featureClassifiers[:index]
    return

  def predict(self,x):
    '''
    Predicts one label per row of x by weighted voting of the retained
    single-feature estimators.

    Returns a list with one label index per row.
    Raises AttributeError when called before fit() or when bias is shorter
    than the number of labels.
    '''
    if self.num_lables is None:
      raise AttributeError('fit must be called before predict')
    #work on a local copy: storing an ndarray in self.bias made the next
    #truthiness check raise ValueError on a second predict() call
    if self.bias is None or len(self.bias)==0:
      bias=np.zeros(self.num_lables)
    else:
      bias=np.asarray(self.bias,dtype=float)
    if len(bias)<self.num_lables:
      raise AttributeError('Please check the lenth of bias list')
    x=np.asarray(x)
    n_rows=len(x)
    if n_rows==0:
      return []
    votes=np.zeros((n_rows,self.num_lables))
    rows=np.arange(n_rows)
    for entry,col in zip(self.featureClassifiers,self.indexLs):
      clf,weight=entry[0],entry[1]
      #one call per feature for all rows; rounded and cast to int because
      #regressors return floats, which cannot be used as array indices
      labels=np.round(np.ravel(clf.predict(x[:,col:col+1]))).astype(int)
      votes[rows,labels]+=weight+bias[labels]
    return list(np.argmax(votes,axis=1))


In [21]:
#This is what I implemented --Fabi
# Parse the numeric text file with np.loadtxt's defaults.
data = np.loadtxt("uniform_large_d_1.tex")

# Keep an independent copy of the parsed values as a NumPy array.
array = np.array(data, copy=True)

# Wrap the array in a DataFrame; rows and columns get default integer labels.
df_table = pd.DataFrame(data=array)

# The full table is intentionally not printed here.

In [23]:
# Round the leading columns of the dataset and mark them as 'categorical'.
# The loop visits column positions 0..25 (range(26)), i.e. 26 columns.
# NOTE(review): the earlier wording said 25 columns, but range(26) covers 26 - confirm which is intended.
# For each column: round the floats, cast to int, then cast to the "category" dtype.
# NOTE(review): every step is written back through .iloc, and the head() output of this
# cell still prints these columns as 1.0 / 0.0 - presumably the int/category dtype is
# not retained by the assignment; verify with df_table.dtypes.
for i in range(26):
    df_table.iloc[:,i] = df_table.iloc[:,i].round()
    df_table.iloc[:,i] = df_table.iloc[:,i].astype(int)
    df_table.iloc[:,i] = df_table.iloc[:,i].astype("category")

# Column position 150 is used as the label column by the train/test split cell.
df_table.iloc[:, 150] = df_table.iloc[:, 150].astype("category")

# Show the first rows as this cell's output.
df_table.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,141,142,143,144,145,146,147,148,149,150
0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,...,0.660629,1.325968,1.282151,0.60055,0.592177,0.776711,1.085891,1.153748,1.352572,1.0
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.892705,0.848612,1.298801,1.250497,0.547771,1.215082,0.940952,1.109552,1.181372,1.0
2,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1.339399,0.417466,0.496915,0.661756,0.875185,1.293924,0.750581,0.742218,0.993983,1.0
3,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.086188,0.394613,0.252668,0.808593,0.587922,0.827502,0.862651,0.684517,0.149873,0.0
4,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,...,0.816635,0.31988,0.770176,0.919029,0.265299,0.983398,0.956898,0.175083,0.170124,0.0


In [25]:
# Hold out a test set from the table.
# Features: the first 150 columns; label: the last column.
# test_size=0.2 -> 80% training and 20% testing.
# random_state=52 seeds the random number generator that shuffles the rows.
X_train, X_test, y_train, y_test = train_test_split(
    df_table.iloc[:, :150],
    df_table.iloc[:, -1],
    test_size=0.2,
    random_state=52,
)

In [27]:
# Fit the per-feature ensemble on the training split.
# reqAcc=0 keeps every feature whose chi2 score is not below 0; control=3 caps the tree depth.
clf1 = FeatureClassifier(reqAcc=0, classifier='DecisionTree', control=3)
# The labels are passed as an integer column vector of shape (n, 1).
clf1.fit(np.array(X_train), np.array(y_train).astype(int).reshape(-1, 1))

# Score the held-out split: fraction of test rows whose predicted label is correct.
print("Accuracy:", accuracy_score(np.array(y_test).astype(int), clf1.predict(np.array(X_test))))

# Column indices of the retained features, ordered by descending chi2 score.
print(clf1.indexLs)
# (estimator, chi2 score, column index) for every retained feature.
clf1.featureClassifiers

Accuracy: 1.0
[0, 2, 120, 25, 137, 85, 100, 13, 15, 34, 73, 39, 106, 8, 65, 90, 91, 55, 97, 61, 149, 126, 99, 49, 47, 76, 42, 102, 67, 14, 124, 127, 60, 138, 28, 3, 114, 29, 1, 112, 125, 16, 88, 98, 146, 33, 117, 105, 72, 140, 143, 122, 24, 83, 115, 95, 107, 59, 52, 80, 36, 63, 64, 111, 37, 44, 101, 27, 103, 121, 130, 20, 129, 135, 69, 5, 62, 139, 147, 31, 113, 68, 142, 17, 48, 104, 92, 79, 94, 109, 9, 18, 89, 11, 82, 118, 54, 35, 141, 6, 75, 116, 119, 74, 40, 53, 41, 70, 93, 77, 32, 148, 26, 71, 132, 133, 51, 23, 43, 86, 19, 87, 45, 21, 56, 84, 145, 22, 57, 78, 46, 7, 66, 50, 30, 110, 81, 12, 96, 134, 136, 128, 4, 58, 38, 144, 108, 131, 123, 10]


[(DecisionTreeClassifier(max_depth=3), 35.3826714801444, 0),
 (DecisionTreeClassifier(max_depth=3), 31.582089552238806, 2),
 (DecisionTreeClassifier(max_depth=3), 30.64785141254948, 120),
 (DecisionTreeClassifier(max_depth=3), 30.488188976377952, 25),
 (DecisionTreeClassifier(max_depth=3), 29.7060595004451, 137),
 (DecisionTreeClassifier(max_depth=3), 29.361849176904997, 85),
 (DecisionTreeClassifier(max_depth=3), 29.176838266946696, 100),
 (DecisionTreeClassifier(max_depth=3), 29.136690647482013, 13),
 (DecisionTreeClassifier(max_depth=3), 29.136690647482013, 15),
 (DecisionTreeClassifier(max_depth=3), 28.4108943265348, 34),
 (DecisionTreeClassifier(max_depth=3), 28.345919382078073, 73),
 (DecisionTreeClassifier(max_depth=3), 28.257501104288405, 39),
 (DecisionTreeClassifier(max_depth=3), 27.70165781193473, 106),
 (DecisionTreeClassifier(max_depth=3), 27.65714285714286, 8),
 (DecisionTreeClassifier(max_depth=3), 27.351735160862674, 65),
 (DecisionTreeClassifier(max_depth=3), 27.117955