In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy
import sklearn as sk
from __future__ import division
from sklearn.cross_validation import train_test_split



In [2]:
df = pd.read_excel('./BioCode for Machine Learning.xlsx')

In [3]:
#Average similarity. One of the features used in Jeremy's original classifier
avsim = df['avg_Similarity']
#Classifications of species
cls = df['Classification']
#branch distance; also used by jeremy, apparently one of the better predictors
bdist = df['Distance_of_Branch']

## Simple Regression (one label at a time)

In [4]:
#How many of the classifications are known
print((cls.fillna(0) != 0).sum()/cls.size)

0.345294238261


In [5]:
#Samples where there are no NaNs
valid_samples = np.array((cls.fillna(0) != 0) * (1 - bdist.isnull()) * (1 - avsim.isnull())).astype(np.bool)

In [6]:
#How many samples are usable
valid_samples.sum(), avsim.size, valid_samples.sum()/avsim.size

(1684, 4877, 0.34529423826122618)

In [7]:
# Turning 'Indigenous' to 1, and others to 0
cls_binary = (cls == 'Indigenous')*1

In [8]:
#filter
avsim_valid, cls_valid, bdist_valid = avsim[valid_samples], cls_binary[valid_samples], bdist[valid_samples]

In [9]:
#Split into test/train
test_train_ratio = 0.9
avs_train, avs_test, cls_train, cls_test, bdist_train, bdist_test = train_test_split(avsim_valid, cls_valid, bdist_valid, test_size=test_train_ratio)

In [10]:
#Just to get an idea of test/train sizes:
avs_train.size, avs_test.size

(168, 1516)

In [11]:
from sklearn.linear_model import LogisticRegression,ElasticNet,Lasso,LinearRegression
from sklearn.metrics import zero_one_loss, mean_squared_error, mean_squared_log_error

In [12]:
clf = LinearRegression()

In [13]:
X = np.array([np.array(avs_train), np.array(cls_train)]).T
y = cls_train
X_test = np.array([np.array(avs_test), np.array(cls_test)]).T

In [14]:
# Making sure that class sizes are similar in test/train sets
cls_train.sum()/cls_train.size, cls_test.sum()/cls_test.size

(0.14880952380952381, 0.16820580474934038)

In [15]:
#LinReg
clf.fit(X, y)
prediction = (clf.predict(X_test) > 0.5)*1 #Threshold
#Prediction accuracy
print('Prediciton accuracy:', np.mean((prediction == np.array(cls_test))*1))
#Coefficients used by the classifier
print("Weights:", clf.coef_)

Prediciton accuracy: 1.0
Weights: [  1.51618023e-18   1.00000000e+00]


In [16]:
#Seems like branch distance is a much better predictor than avg sim. this matches Jeremy's observation

In [17]:
#LogReg
clf = LogisticRegression()
clf.fit(X, y)
prediction = (clf.predict(X_test) > 0.5)*1
#Prediction accuracy
print('Prediciton accuracy:', np.mean((prediction == np.array(cls_test))*1))
#Coefficients used by the classifier
print("Weights:", clf.coef_)

Prediciton accuracy: 1.0
Weights: [[-0.02180076  4.76811675]]


In [18]:
#These predictions look impressive until you realize that it is possible to get 83% By just predicting 0's.

In [19]:
print('Prediciton accuracy:', np.mean((cls_test == 0)))

Prediciton accuracy: 0.831794195251


In [20]:
#What about other labels?

In [19]:
clf = Lasso()
clf.fit(X, y)
prediction = (clf.predict(X_test) > 0.5)*1
#Prediction accuracy
print('Prediciton accuracy:', np.mean((prediction == np.array(cls_test))*1))
#Coefficients used by the classifier
print("Weights:", clf.coef_)

Prediciton accuracy: 0.839920948617
Weights: [ 0.  0.]


In [20]:
clf = ElasticNet()
clf.fit(X, y)
prediction = (clf.predict(X_test) > 0.5)*1
#Prediction accuracy
print('Prediciton accuracy:', np.mean((prediction == np.array(cls_test))*1))
#Coefficients used by the classifier
print("Weights:", clf.coef_)

Prediciton accuracy: 0.839920948617
Weights: [  8.37147108e-05   0.00000000e+00]


In [21]:
#invasive

In [22]:
# Turning 'Invasive' to 1, and others to 0
cls_binary = (cls == 'Invasive')*1
avsim_valid, cls_valid, bdist_valid = avsim[valid_samples], cls_binary[valid_samples], bdist[valid_samples]
#Split into test/train
test_train_ratio = 0.9
avs_train, avs_test, cls_train, cls_test, bdist_train, bdist_test = train_test_split(avsim_valid, cls_valid, bdist_valid, test_size=test_train_ratio)
X = np.array([np.array(avs_train), np.array(cls_train)]).T
y = cls_train
X_test = np.array([np.array(avs_test), np.array(cls_test)]).T
cls_train.sum()/cls_train.size, cls_test.sum()/cls_test.size

(0.25595238095238093, 0.29155672823218998)

In [23]:
#LinReg
clf = LinearRegression()
clf.fit(X, y)
prediction = (clf.predict(X_test) > 0.5)*1 #Threshold
#Prediction accuracy
print('Prediction accuracy:', np.mean((prediction == np.array(cls_test))*1))
#Coefficients used by the classifier
print("Weights:", clf.coef_)

Prediction accuracy: 1.0
Weights: [ -6.04994756e-19   1.00000000e+00]


In [24]:
#LogReg
clf = LogisticRegression()
clf.fit(X, y)
prediction = (clf.predict(X_test) > 0.5)*1
#Prediction accuracy
print('Prediction accuracy:', np.mean((prediction == np.array(cls_test))*1))
#Coefficients used by the classifier
print("Weights:", clf.coef_)

Prediction accuracy: 1.0
Weights: [[-0.0199216   5.07084628]]


In [25]:
print('Prediction accuracy:', np.mean((cls_test == 0)))

Prediction accuracy: 0.708443271768


In [26]:
#Introduced

In [27]:
# Turning 'Introduced' to 1, and others to 0
cls_binary = (cls == 'Introduced')*1
avsim_valid, cls_valid, bdist_valid = avsim[valid_samples], cls_binary[valid_samples], bdist[valid_samples]
#Split into test/train
test_train_ratio = 0.9
avs_train, avs_test, cls_train, cls_test, bdist_train, bdist_test = train_test_split(avsim_valid, cls_valid, bdist_valid, test_size=test_train_ratio)
X = np.array([np.array(avs_train), np.array(cls_train)]).T
y = cls_train
X_test = np.array([np.array(avs_test), np.array(cls_test)]).T
cls_train.sum()/cls_train.size, cls_test.sum()/cls_test.size

(0.52380952380952384, 0.54815303430079154)

In [28]:
#LinReg
clf = LinearRegression()
clf.fit(X, y)
prediction = (clf.predict(X_test) > 0.5)*1 #Threshold
#Prediction accuracy
print('Prediciton accuracy:', np.mean((prediction == np.array(cls_test))*1))
#Coefficients used by the classifier
print("Weights:", clf.coef_)

Prediciton accuracy: 1.0
Weights: [  2.14965583e-19   1.00000000e+00]


In [29]:
#LogReg
clf = LogisticRegression()
clf.fit(X, y)
prediction = (clf.predict(X_test) > 0.5)*1
#Prediction accuracy
print('Prediciton accuracy:', np.mean((prediction == np.array(cls_test))*1))
#Coefficients used by the classifier
print("Weights:", clf.coef_)

Prediciton accuracy: 1.0
Weights: [[-0.01576545  5.31332309]]


In [30]:
print('Prediciton accuracy:', np.mean((cls_test == 1)))

Prediciton accuracy: 0.548153034301


### In principle, one can train simple linear regression on 10% of data and get 100% accuracy. But note that we are only looking at results for ~1000 points. This is probably overfitting and we probably want a more principled approach that is robust to errors in sequencing and matching etc. Since the dataset already has information about those things, we should try to integrate it. Also, branch distance already includes a lot of preprocessing. Can we replicate its effectiveness, but with more robustness?

Spoiler: Performance decreases