### Importing Packages

In [176]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
import pandas as pd
from sklearn import svm
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [178]:
# Folder that holds the data files. This is the only line to edit when the
# notebook is run from a different machine or checkout.
DATA_DIR = '/Users/elleemortensen/Documents/GitHub/BP24/Ellee'

# Data set to analyse (file name without the .tex extension). The files this
# notebook has been pointed at so far:
#   gaussian_large_d_1, gaussian_small_d_1, uniform_large_d_1, uniform_small_d_1
DATASET = 'uniform_small_d_1'

# np.loadtxt parses the numeric text file into a NumPy array
data = np.loadtxt(f'{DATA_DIR}/{DATASET}.tex')

# np.array makes a copy, so the DataFrame is built from its own array rather
# than from `data` itself
array = np.array(data)
df = pd.DataFrame(array)

### Splitting Data

In [187]:
# Number of leading columns that hold categorical features, and the position
# of the label column.
N_CATEGORICAL = 25
LABEL_POS = 150

# Columns are replaced with `df[col] = ...` instead of `df.iloc[:, i] = ...`.
# On pandas >= 2.0 a whole-column `.iloc` assignment first tries to write the
# new values into the existing float column, so the integer cast is silently
# lost and the category cast only goes through a deprecated fallback.
# `df[col] = ...` always swaps the column for the new Series, dtype included.
for col in df.columns[:N_CATEGORICAL]:
    # astype(float) is a no-op on the first run and keeps the cell re-runnable
    # once the column is already categorical
    df[col] = df[col].astype(float).round().astype(int).astype("category")

# The label column keeps its original values and is only marked as categorical
label_col = df.columns[LABEL_POS]
df[label_col] = df[label_col].astype("category")

In [189]:
# Hold out part of the rows for evaluation.
#   test_size    - fraction of rows placed in the test split (0.2 -> 20% test, 80% train)
#   random_state - fixed seed so that every run produces the same split
# Columns 0..149 are the features; the last column is the label.
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, :150],
    df.iloc[:, -1],
    test_size=0.2,
    random_state=52,
)

### XGBoost

In [191]:
# Configure the gradient-boosted tree classifier.
#   n_estimators       - number of boosted trees in the ensemble
#   max_depth          - depth limit per tree; deeper trees are more complex and overfit more easily
#   learning_rate      - weight of each tree's contribution to the combined prediction
#   objective          - 'binary:logistic' is the logistic objective for two-class problems
#   enable_categorical - let XGBoost use pandas "category" columns as categorical features
bst = XGBClassifier(
    n_estimators=2,
    max_depth=2,
    learning_rate=1,
    objective='binary:logistic',
    enable_categorical=True,
)

# learn from the training split
bst.fit(X_train, y_train)

# predicted labels for the held-out rows
preds = bst.predict(X_test)

# show the raw predictions
print(preds)

# share of held-out rows that were classified correctly
print("Accuracy:", accuracy_score(y_test, preds))

[0 1 1 0 1 1 1 0 1 0 1 0 0 0 1 1 1 0 0 1 1 1 1 1 1 0 0 1 0 0 1 1 0 0 1 1 1
 0 0 0 1 1 1 1 0 1 1 0 0 0 1 0 1 1 1 1 1 0 0 0 1 1 1 1 1 1 1 1 0 1 1 0 1 0
 0 1 0 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 0 0 0 1 1 0]
Accuracy: 0.81


### FIC

In [12]:
class FeatureClassifier:
  """Ensemble of single-feature classifiers with score-weighted voting.

  ``fit`` scores every feature column with ``chi2`` (through ``SelectKBest``),
  trains one model per column on that column alone, sorts the models by score
  and keeps the leading run whose score is not below ``reqAcc``. ``predict``
  lets each kept model vote for a label; a vote is worth the model's score
  plus the bias configured for the label it voted for, and the label with the
  largest total wins.

  Predicted labels are used directly as indices into the vote table, so the
  class labels are expected to be the integers ``0 .. num_lables - 1``.

  Parameters
  ----------
  reqAcc : float
      Score cutoff; models are dropped from the first one (in descending
      score order) whose score falls below it.
  classifier : str
      Key of ``self.dic``: 'DecisionTree', 'LinearRegression', 'SVM' or
      'LogisticRegression'. Any other name raises ``KeyError`` in ``fit``.
  bias : sequence of float, optional
      Extra vote weight per label, indexed by label. Empty/omitted means no
      bias.
  control :
      Forwarded as ``max_depth`` (DecisionTree), ``gamma`` (SVM) or
      ``penalty`` (LogisticRegression); unused for LinearRegression.
  n_jobs :
      Forwarded to ``LinearRegression``.
  random_state :
      Forwarded to ``train_test_split`` and to the estimators that take it.
  """

  def __init__(self,reqAcc = 0.01, classifier = 'DecisionTree', bias = None, control = None, n_jobs = None, random_state = None):
    self.featureClassifiers=[] #list of (fitted classifier, score, feature index) for the selected features
    self.reqAcc=reqAcc #user specified cutoff value
    self.indexLs=[] # feature index of featureClassifiers[j], in the same order
    self.flag=0 # 1 when every feature passed the cutoff, 0 otherwise
    # a None default avoids sharing one list object between instances;
    # the attribute is still an empty list when no bias is supplied
    self.bias=[] if bias is None else bias # list of biases for each and every label
    self.control=control #overfitting control for decision trees
    self.classifier=classifier #the classifier which is preferred by the user
    self.dic={'DecisionTree':0,'LinearRegression':1,'SVM':2,'LogisticRegression':3} #a dictionary which maps the classifier to its index
    self.n_jobs=n_jobs
    self.random_state=random_state
    self.num_lables = None # number of distinct labels seen by fit

  def _makeClassifier(self):
    """Return a fresh, unfitted estimator of the kind named by ``self.classifier``."""
    kind=self.dic[self.classifier] # unknown names raise KeyError
    if kind==0:
      return DecisionTreeClassifier(max_depth=self.control,random_state=self.random_state)
    if kind==1:
      return LinearRegression(n_jobs=self.n_jobs)
    if kind==2:
      return SVC(gamma=self.control,random_state=self.random_state)
    return LogisticRegression(penalty=self.control,random_state=self.random_state)

  def finIndex(self):
    """Rebuild ``indexLs`` and locate the first classifier below ``reqAcc``.

    Expects ``featureClassifiers`` to be sorted by descending score.
    ``indexLs`` and ``flag`` are reset on every call, so calling this more
    than once (for example when refitting) does not accumulate entries.

    Returns the position of the first entry whose score is below ``reqAcc``.
    When no entry fails, ``flag`` is set to 1 and the last valid position is
    returned (0 for an empty list); callers must check ``flag`` before using
    the return value as a cut point.
    """
    self.indexLs=[]
    self.flag=0
    for pos,entry in enumerate(self.featureClassifiers):
      if entry[1] < self.reqAcc:
        return pos
      self.indexLs.append(entry[2])
    self.flag=1
    return max(len(self.featureClassifiers)-1,0)

  def fit(self,x,y):
    """Train one classifier per feature column and keep those passing ``reqAcc``.

    x : 2-D array-like of shape (samples, features).
    y : labels, either 1-D or a single column; a single column is flattened
        before training.

    Any state left by a previous ``fit`` is discarded. Returns None.
    """
    x=np.asarray(x)
    y=np.asarray(y)
    self.num_lables=len(set(y.flatten()))
    if y.ndim==2 and y.shape[1]==1:
      # estimators expect a 1-D target; a column vector only triggers warnings
      y=y.ravel()
    self.featureClassifiers=[]

    # chi-squared score of every feature against the labels
    scores=SelectKBest(score_func=chi2,k=1).fit(x,y).scores_

    # split once: the split does not depend on the feature being trained, and
    # only the training part is used
    X_train, _, y_train, _ = train_test_split(x, y, test_size=0.33,random_state=self.random_state)

    for i in range(x.shape[1]):
      clf=self._makeClassifier()
      clf.fit(X_train[:,i:i+1],y_train)
      self.featureClassifiers.append((clf,scores[i],i))
    self.featureClassifiers.sort(key=lambda entry:entry[1],reverse=True)
    index=self.finIndex()
    if self.flag==0:
      self.featureClassifiers=self.featureClassifiers[:index]
    return

  def predict(self,x):
    """Predict a label for every row of ``x`` by score-weighted voting.

    x : 2-D array-like of shape (samples, features), with the same column
        layout that was passed to ``fit``.

    Returns a list with one integer label per row (empty for empty input).
    Raises AttributeError when a bias list is configured but has fewer
    entries than there are labels. ``self.bias`` is not modified, so the
    method can be called repeatedly.
    """
    bias=np.asarray([] if self.bias is None else self.bias,dtype=float).ravel()
    if bias.size==0:
      bias=np.zeros(self.num_lables)
    if len(bias)<self.num_lables:
      raise AttributeError('Please check the lenth of bias list')
    x=np.asarray(x)
    if len(x)==0:
      return []
    votes=np.zeros((len(x),self.num_lables)) # votes[row, label] = accumulated weight
    rows=np.arange(len(x))
    for j in range(len(self.indexLs)):
      clf=self.featureClassifiers[j][0]
      weight=self.featureClassifiers[j][1]
      col=self.indexLs[j]
      # one batched call per classifier; regressors return floats, so round
      # and cast before the predictions are used as label indices
      labels=np.round(np.asarray(clf.predict(x[:,col:col+1]),dtype=float)).astype(int).ravel()
      votes[rows,labels]+=weight+bias[labels]
    return np.argmax(votes,axis=1).tolist()

In [14]:
# Fit the per-feature ensemble on the training split. A cutoff of 0 drops only
# features whose score is below 0; control=3 caps each decision tree at depth 3.
# The labels are passed as an integer column vector.
clf1 = FeatureClassifier(reqAcc=0, classifier='DecisionTree', control=3)
clf1.fit(
    np.array(X_train),
    np.array(y_train).reshape(-1, 1).astype(int),
)

# Share of held-out rows that the ensemble classifies correctly
print("Accuracy:", accuracy_score(np.array(y_test).astype(int), clf1.predict(np.array(X_test))))

# Selected feature indices, best score first, followed by the
# (classifier, score, feature index) entries themselves
print(clf1.indexLs)
clf1.featureClassifiers

Accuracy: 1.0
[136, 102, 99, 12, 13, 139, 103, 109, 129, 112, 94, 58, 4, 93, 137, 47, 134, 14, 29, 120, 0, 42, 54, 122, 65, 121, 64, 124, 127, 131, 67, 123, 148, 130, 145, 59, 111, 143, 57, 144, 1, 133, 52, 126, 43, 20, 38, 24, 117, 39, 48, 25, 7, 128, 45, 113, 75, 106, 110, 60, 90, 104, 35, 119, 31, 77, 62, 83, 5, 89, 66, 142, 107, 114, 21, 56, 41, 73, 88, 76, 138, 86, 37, 96, 23, 2, 6, 51, 28, 44, 69, 19, 108, 80, 22, 33, 100, 18, 30, 97, 71, 68, 50, 61, 118, 141, 36, 63, 16, 98, 105, 27, 11, 26, 79, 92, 74, 125, 115, 53, 10, 85, 78, 34, 84, 32, 17, 8, 81, 82, 140, 46, 40, 95, 101, 132, 9, 49, 70, 135, 3, 147, 15, 91, 116, 87, 55, 146, 72]


[(DecisionTreeClassifier(max_depth=3), 30.90274402045231, 136),
 (DecisionTreeClassifier(max_depth=3), 29.082777549964945, 102),
 (DecisionTreeClassifier(max_depth=3), 27.226682233472232, 99),
 (DecisionTreeClassifier(max_depth=3), 26.69072362931, 12),
 (DecisionTreeClassifier(max_depth=3), 26.369190478602633, 13),
 (DecisionTreeClassifier(max_depth=3), 25.036212102731447, 139),
 (DecisionTreeClassifier(max_depth=3), 24.923520090798064, 103),
 (DecisionTreeClassifier(max_depth=3), 24.897947036285835, 109),
 (DecisionTreeClassifier(max_depth=3), 24.79824628055551, 129),
 (DecisionTreeClassifier(max_depth=3), 24.348075129239582, 112),
 (DecisionTreeClassifier(max_depth=3), 24.34662049797147, 94),
 (DecisionTreeClassifier(max_depth=3), 23.548718797867494, 58),
 (DecisionTreeClassifier(max_depth=3), 23.46422495740002, 4),
 (DecisionTreeClassifier(max_depth=3), 23.317780150976425, 93),
 (DecisionTreeClassifier(max_depth=3), 22.94923129205304, 137),
 (DecisionTreeClassifier(max_depth=3), 22.

### KNN

In [16]:
# k-nearest-neighbours classifier
#   n_neighbors - a point is labelled by a vote among its 5 closest training points
#   algorithm   - left at the default; revisit if the neighbour search misbehaves
neigh = KNeighborsClassifier(n_neighbors=5)

# learn from the training split
neigh.fit(X_train, y_train)

# predicted labels for the held-out rows
y_pred = neigh.predict(X_test)

# show the raw predictions
print(y_pred)

# share of held-out rows that were classified correctly
print("Accuracy:", accuracy_score(y_test, y_pred))

[0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0.
 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1.
 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1.
 0. 1. 1. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0.
 1. 0. 0. 1.]
Accuracy: 1.0


### SVM

In [18]:
# Support vector classifier. Kernel choices:
#   linear  - classes can be separated by a linear function
#   poly    - classes can be separated by a polynomial function
#   rbf     - radial basis function; clusters of one class sit inside another
#   sigmoid - the boundary between classes is curved and irregular
clf = SVC(kernel='linear')

# learn from the training split
clf.fit(X_train, y_train)

# predicted labels for the held-out rows
y_pred = clf.predict(X_test)

# show the raw predictions
print(y_pred)

# share of held-out rows that were classified correctly
print("Accuracy:", accuracy_score(y_test, y_pred))

[0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 0. 0. 1. 0.
 1. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1.
 0. 0. 1. 0. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1.
 0. 1. 1. 0. 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 1. 0.
 1. 0. 0. 1.]
Accuracy: 1.0
