In [185]:
import xgboost
import pandas as pd
import numpy as np
import xgboost as xgb




## Reading Data

In [186]:
TRAINING_DATA_URL = "https://s3.amazonaws.com/dna-mlapp/training/training_data.csv"
TESTING_DATA_URL = "https://s3.amazonaws.com/dna-mlapp/training/test_data.csv"

In [187]:
train_data=pd.read_csv(TRAINING_DATA_URL)

In [188]:
#sneak peak of training data
train_data.head()

Unnamed: 0.1,Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,0,148,6.5,3.0,5.2,2.0,Iris-virginica
1,1,72,6.1,2.8,4.0,1.3,Iris-versicolor
2,2,119,7.7,2.6,6.9,2.3,Iris-virginica
3,3,115,5.8,2.8,5.1,2.4,Iris-virginica
4,4,87,6.7,3.1,4.7,1.5,Iris-versicolor


In [189]:
test_data = pd.read_csv(TESTING_DATA_URL)


In [190]:
#sneak peak of test data
test_data.head()

Unnamed: 0.1,Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,120,135,6.1,2.6,5.6,1.4,Iris-virginica
1,121,31,4.8,3.1,1.6,0.2,Iris-setosa
2,122,143,5.8,2.7,5.1,1.9,Iris-virginica
3,123,137,6.3,3.4,5.6,2.4,Iris-virginica
4,124,29,5.2,3.4,1.4,0.2,Iris-setosa


In [191]:
"There are {} examples in train dataset and {} examples in test dataset.The number of prediction catagories is {}".format((train_data.shape[0]),(test_data.shape[0]),(train_data["Species"].nunique()))

'There are 120 examples in train dataset and 30 examples in test dataset.The number of prediction catagories is 3'

## PreProcessing Data

In [192]:
#training and test labels
combined_labels = pd.factorize(train_data["Species"].append(test_data["Species"]))[0]
combined_labels
y_train = combined_labels[:train_data.shape[0]]
y_test = combined_labels[train_data.shape[0]:]

In [193]:
#train labels
y_train

array([0, 1, 0, 0, 1, 0, 2, 0, 0, 2, 2, 1, 2, 1, 1, 2, 0, 0, 1, 2, 2, 2,
       0, 1, 2, 0, 2, 2, 0, 0, 0, 2, 2, 1, 2, 2, 1, 1, 0, 1, 2, 2, 0, 1,
       0, 2, 1, 0, 1, 2, 0, 0, 2, 0, 1, 1, 0, 0, 0, 2, 1, 1, 0, 0, 0, 2,
       1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 2, 0, 1, 2, 2, 2, 2, 0, 0,
       1, 1, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 1, 1, 1, 2, 0, 0, 1, 1, 1,
       2, 1, 2, 1, 1, 1, 2, 0, 2, 1])

In [194]:
#test labels
y_test

array([0, 2, 0, 0, 2, 0, 1, 1, 2, 2, 1, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 2,
       1, 1, 1, 2, 2, 2, 1, 0])

In [195]:
#dropping labels from train and test data
train_data.drop(["Species"],axis=1,inplace=True)
test_data.drop(["Species"],axis=1,inplace=True)

In [196]:
#converting dataframe into numpy matrix/ndarray
X_train = train_data.values

In [197]:
#converting dataframe into numpy matrix/ndarray
X_test = test_data.values

In [198]:
#Converting train/test data into DMatrix,a native form of xgboost that facilitates faster computation.

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# Defining training parameters

In [199]:
#parameters for xgboost training.These are chosen with regard to dataset size and have to be tuned for optimal performance.
param = {
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'silent': 1,  # logging mode - quiet
    'objective': 'multi:softprob',  # error evaluation for multiclass training
    'num_class': 3}  # the number of classes that exist in this datset
num_round = 20  # the number of training iterations

## Training

In [200]:
bst = xgb.train(param, dtrain, num_round)

## Prediction

In [201]:
preds = bst.predict(dtest)
#Predictions are in form of probabilities
preds

array([[0.98642343, 0.00873516, 0.00484139],
       [0.0052495 , 0.00772425, 0.9870263 ],
       [0.98642343, 0.00873516, 0.00484139],
       [0.9881281 , 0.00702207, 0.00484976],
       [0.0052495 , 0.00772425, 0.9870263 ],
       [0.9881281 , 0.00702207, 0.00484976],
       [0.00712454, 0.9859491 , 0.00692637],
       [0.00573335, 0.98869276, 0.00557387],
       [0.0052495 , 0.00772425, 0.9870263 ],
       [0.0052495 , 0.00772425, 0.9870263 ],
       [0.00820885, 0.98381066, 0.00798052],
       [0.00523954, 0.00960699, 0.98515344],
       [0.9881281 , 0.00702207, 0.00484976],
       [0.0052495 , 0.00772425, 0.9870263 ],
       [0.9881281 , 0.00702207, 0.00484976],
       [0.0052495 , 0.00772425, 0.9870263 ],
       [0.0052495 , 0.00772425, 0.9870263 ],
       [0.0052495 , 0.00772425, 0.9870263 ],
       [0.0052495 , 0.00772425, 0.9870263 ],
       [0.9881281 , 0.00702207, 0.00484976],
       [0.98642343, 0.00873516, 0.00484139],
       [0.0052495 , 0.00772425, 0.9870263 ],
       [0.

In [202]:
#picking the best/highest probability
best_preds = np.asarray([np.argmax(line) for line in preds])
best_preds

array([0, 2, 0, 0, 2, 0, 1, 1, 2, 2, 1, 2, 0, 2, 0, 2, 2, 2, 2, 0, 0, 2,
       1, 1, 1, 2, 2, 2, 1, 0])

## Accuracy

In [203]:
#Accuracy
accuracy = np.sum(y_test == best_preds)/len(best_preds)
accuracy

1.0