# Boosting

In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import SGDClassifier as LR
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.mixture import GMM
from sklearn.lda import LDA
from sklearn.tree import DecisionTreeClassifier
from sklearn.cross_validation import cross_val_score
from numpy.random import multivariate_normal
from mpl_toolkits.mplot3d import Axes3D


%matplotlib inline

In [31]:
# %load loadData.py

def load(synthetic=False):
  """Return (train, trainLabels, test, testLabels) for the experiments below.

  synthetic=False: read the four whitespace-delimited, header-less text files
  train/X_train.txt, train/y_train.txt, test/X_test.txt and test/y_test.txt
  (paths relative to the working directory) and return them as DataFrames.

  synthetic=True: draw 500 points per class from four fixed 2-D Gaussians,
  once for the training set and once more for the test set, scatter-plot the
  training sample (one colour per class) and return numpy arrays. Labels are
  the integers 1..4, in blocks of 500.
  """
  if not synthetic:
    def readTable(path):
      # sep=r'\s+' is the documented equivalent of the deprecated
      # delim_whitespace=True.
      return pd.read_csv(path, header=None, sep=r'\s+')

    train = readTable('train/X_train.txt')
    test = readTable('test/X_test.txt')

    label = readTable('train/y_train.txt')
    test_label = readTable('test/y_test.txt')

    return train, label, test, test_label

  nPerClass = 500
  means = [
    np.array([0, 1]),
    np.array([-0.5, -1.0]),
    np.array([3.2, .6]),
    np.array([3, -1]),
  ]
  covariances = [
    np.array([[1, 0.1], [0.1, 0.1]]),
    np.array([[0.3, 0.2], [0.2, 0.4]]),
    np.array([[0.5, 0.01], [0.01, 0.1]]),
    np.array([[0.5, -0.2], [-0.2, 0.2]]),
  ]
  classLabels = np.repeat([1, 2, 3, 4], nPerClass)

  # All training draws happen before any test draw, class by class, so the
  # random stream is consumed in the same order as the hand-unrolled version.
  trainParts = [multivariate_normal(mu, cov, nPerClass)
                for mu, cov in zip(means, covariances)]
  testParts = [multivariate_normal(mu, cov, nPerClass)
               for mu, cov in zip(means, covariances)]

  syntheticTrain = np.vstack(trainParts)
  syntheticTrainLabel = classLabels
  syntheticTest = np.vstack(testParts)
  syntheticTestLabel = classLabels.copy()

  fig, ax1 = plt.subplots(1, 1)
  for points, color in zip(trainParts, ['red', 'blue', 'green', 'black']):
    ax1.scatter(points[:, 0], points[:, 1], color=color)

  return syntheticTrain, syntheticTrainLabel, syntheticTest, syntheticTestLabel


In [32]:
def adaBoostTrain(trainData, trainLabels, nClassifiers = 20, makeModel = None):
  """Fit a multi-class AdaBoost ensemble of weak learners.

  Each round fits a fresh weak learner on the re-weighted training set, scores
  it by its weighted error, and multiplies the weight of every misclassified
  sample by exp(alpha) before renormalising.

  Parameters
  ----------
  trainData : object with .shape[0] rows, accepted by the learner's fit/predict.
  trainLabels : labels for each row; flattened with np.array(...).ravel().
  nClassifiers : number of boosting rounds (and of returned models).
  makeModel : optional zero-argument callable returning a new unfitted learner
      that supports fit(X, y, sample_weight=...) and predict(X). Defaults to a
      depth-2 DecisionTreeClassifier.

  Returns
  -------
  (models, alphas) : list of fitted learners and a numpy array with the vote
      weight of each learner, in training order.
  """
  if makeModel is None:
    def makeModel():
      return DecisionTreeClassifier(max_depth=2)

  labels = np.array(trainLabels).ravel()
  nTrain = trainData.shape[0]

  # The multi-class correction is log(K - 1) for K classes; taking K from the
  # labels keeps it right for any class count (K = 6 gives log(5)).
  nClasses = np.unique(labels).size
  classBonus = np.log(max(nClasses - 1, 1))

  # Keeps the error rate strictly inside (0, 1) so alpha stays finite.
  tiny = np.finfo(float).eps

  alphas = np.zeros(nClassifiers)
  weights = np.ones(nTrain) / float(nTrain)
  models = []

  for iteration in range(nClassifiers):
    model = makeModel()
    # Weights are rescaled so that they average 1 when handed to the learner.
    model.fit(trainData, labels, sample_weight=weights * nTrain)
    models.append(model)

    incorrect = model.predict(trainData) != labels
    # Without the clip a perfect learner (error 0) yields alpha = inf and then
    # NaN weights; an always-wrong learner (error 1) yields alpha = -inf.
    errorRate = np.clip(np.sum(incorrect * weights), tiny, 1.0 - tiny)
    alphas[iteration] = np.log((1 - errorRate) / errorRate) + classBonus

    weights = weights * np.exp(alphas[iteration] * incorrect)
    weights /= np.sum(weights)

  return models, alphas

def adaBoostEvaluate(models, alphas, data, labels, nClasses = 6):
  """Return the ensemble's misclassification rate on data, in percent, as a string.

  Every model votes for its predicted class with weight alphas[i]; the class
  with the largest total wins for each sample.

  Parameters
  ----------
  models : sequence of fitted learners exposing predict(data).
  alphas : vote weight of each learner, aligned with models.
  data : samples to classify; must expose .shape[0].
  labels : true labels; flattened with np.array(...).ravel().
  nClasses : number of classes. Labels are expected to be the integers
      1..nClasses (default 6).

  Raises
  ------
  ValueError : if a model predicts a label outside 1..nClasses. Such a label
      would otherwise index past the tally table or wrap around to the last
      class.
  """
  trueLabels = np.array(labels).ravel()
  nSamples = data.shape[0]
  rows = np.arange(nSamples)

  # One row per sample, one column per class; labels are 1-based, columns 0-based.
  tallies = np.zeros((nSamples, nClasses))

  for alpha, model in zip(alphas, models):
    votes = np.asarray(model.predict(data)).ravel().astype(int)
    if votes.size and (votes.min() < 1 or votes.max() > nClasses):
      raise ValueError(
          "predicted labels must lie in 1..%d, got %d..%d"
          % (nClasses, votes.min(), votes.max()))
    # Each row index occurs exactly once, so the in-place add is safe.
    tallies[rows, votes - 1] += alpha

  predictions = np.argmax(tallies, axis=1) + 1
  errors = predictions != trueLabels
  return str(100 * np.mean(errors))



In [33]:
# Boosting experiment: project the features with LDA, then boost shallow trees.
trainData, trainLabels, testData, testLabels = load()

lda = LDA(solver="svd", store_covariance=True)
# The projection is fitted on the training split only and then applied to the
# test split. Labels are flattened to 1-D, as in every other fit call in this
# notebook, instead of handing over the single-column label frame.
trainData = lda.fit_transform(trainData, np.array(trainLabels).ravel())
print(trainData.shape)
testData = lda.transform(testData)

models, alphas = adaBoostTrain(trainData, trainLabels)

# adaBoostEvaluate returns the error percentage already formatted as a string.
trainError = adaBoostEvaluate(models, alphas, trainData, trainLabels)
print("Training Error: " + trainError)

testError = adaBoostEvaluate(models, alphas, testData, testLabels)
print("Test Error: " + testError)

(7352, 5)
Training Error: 1.61860718172
Test Error: 3.80047505938


# Bagging

In [34]:
# Reload the splits: the boosting cell above rebound trainData and testData to
# their LDA projections, and bagging is run on the features as loaded.
trainData, trainLabels, testData, testLabels = load()

In [35]:
# LR is this notebook's alias for SGDClassifier. BaggingClassifier fits its own
# clones of the base estimator on resampled data, so the template is passed in
# unfitted -- fitting it on the full training set first would be discarded work.
model = LR()
bagging = BaggingClassifier(model)

In [36]:
# fit() hands back the fitted ensemble, so training and test-set prediction
# can be written as one chained expression.
predict_test = (
    bagging
    .fit(trainData, np.array(trainLabels).ravel())
    .predict(testData)
)

In [37]:
# Element-wise mismatch mask between the predictions and the flattened true labels;
# its mean is the misclassification rate, reported as a percentage.
errors_test = np.not_equal(predict_test, np.array(testLabels).ravel())
print("Test Error: " + str(100 * np.mean(errors_test)))

Test Error: 5.25958601968
