#### Import Libraries and PyTorch

In [0]:
# http://pytorch.org/
from os import path
from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
platform = '{}{}-{}'.format(get_abbr_impl(), get_impl_ver(), get_abi_tag())

accelerator = 'cu80' if path.exists('/opt/bin/nvidia-smi') else 'cpu'

!pip install -q http://download.pytorch.org/whl/{accelerator}/torch-0.3.0.post4-{platform}-linux_x86_64.whl torchvision
import torch

In [0]:
import os, sys, time, datetime
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import sklearn
import pickle

#### PCA from download

In [6]:
# Download post-PCA data
!wget https://www.dropbox.com/s/pivxgxphb5jpqa0/data_pca_300comps.pkl
!wget https://www.dropbox.com/s/g3j68q988kj75x8/super_pop_int_numpy.pkl
!wget https://www.dropbox.com/s/ox1f9nslch6hiag/pops_with_ints_pandas.pkl
!wget https://www.dropbox.com/s/2er4yt97r8ys3ps/data_pca_1000comps.pkl

--2018-04-25 01:23:23--  https://www.dropbox.com/s/pivxgxphb5jpqa0/data_pca_300comps.pkl
Resolving www.dropbox.com (www.dropbox.com)... 162.125.7.1, 2620:100:6016:1::a27d:101
Connecting to www.dropbox.com (www.dropbox.com)|162.125.7.1|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://dl.dropboxusercontent.com/content_link/GejUQIv6CR6OFgTgx6PNqo3u8Q3UbYOc2yLiKBLoACJDjJdF1I8vCkJbwCwwiHR3/file [following]
--2018-04-25 01:23:23--  https://dl.dropboxusercontent.com/content_link/GejUQIv6CR6OFgTgx6PNqo3u8Q3UbYOc2yLiKBLoACJDjJdF1I8vCkJbwCwwiHR3/file
Resolving dl.dropboxusercontent.com (dl.dropboxusercontent.com)... 162.125.1.6, 2620:100:601a:6::a27d:706
Connecting to dl.dropboxusercontent.com (dl.dropboxusercontent.com)|162.125.1.6|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8280162 (7.9M) [application/octet-stream]
Saving to: ‘data_pca_300comps.pkl’


2018-04-25 01:23:26 (4.54 MB/s) - ‘data_pca_300comps.pkl’ saved [8280162/828


2018-04-25 01:23:36 (8.76 MB/s) - ‘data_pca_1000comps.pkl’ saved [27600162/27600162]



In [12]:
import pickle

# Load PCA with n=300
data_new_300 = pickle.load(open('data_pca_300comps.pkl','rb'))
print('Data with n=300 shape: ', data_new_300.shape)

# Load PCA with n=1000
data_new = pickle.load(open('data_pca_1000comps.pkl','rb'))
print('Data with n=1000 shape: ', data_new.shape)

# Load labels (super-populations)
spop_ints = pickle.load(open('super_pop_int_numpy.pkl','rb'))
print('Super pop labels shape: ', spop_ints.shape, spop_ints) # y

# Load labels (populations)
pop_ints = pickle.load(open('pops_with_ints_pandas.pkl','rb'))
pop_ints = pop_ints['pop int'].values
print('Pop labels shape: ', pop_ints.shape, pop_ints) # y

# Naming
X = data_new
y = pop_ints

Data with n=300 shape:  (3450, 300)
Data with n=1000 shape:  (3450, 1000)
Super pop labels shape:  (3450,) [3 3 3 ... 4 4 4]
Pop labels shape:  (3450,) [ 8  8  8 ... 21 21 21]


In [0]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

lr = LogisticRegression()
rf = RandomForestClassifier(n_estimators=100)
gb = GradientBoostingClassifier(n_estimators=100)

In [34]:
cross_val = cross_val_score(lr, X, y, scoring='accuracy', cv = 3)
accuracy = cross_val.mean() * 100
print('LR Accuracy: {:.2f}'.format(accuracy))

cross_val = cross_val_score(rf, X, y, scoring='accuracy', cv = 3)
accuracy = cross_val.mean() * 100
print('RF Accuracy: {:.2f}'.format(accuracy))

cross_val = cross_val_score(gb, X, y, scoring='accuracy', cv = 3)
accuracy = cross_val.mean() * 100
print('GB Accuracy: {:.2f}'.format(accuracy))

LR Accuracy: 76.84
RF Accuracy: 94.26
GB Accuracy: 92.78


In [49]:
for n in range(10,200,20):
  rf_n = RandomForestClassifier(n_estimators=n)
  cross_val = cross_val_score(rf_n, X, y, scoring='accuracy', cv = 3)
  accuracy = cross_val.mean() * 100
  print('RF {} Accuracy: {:.2f}'.format(n, accuracy))

RF 10 Accuracy: 81.45
RF 30 Accuracy: 90.23
RF 50 Accuracy: 92.47
RF 70 Accuracy: 92.82
RF 90 Accuracy: 93.02
RF 110 Accuracy: 93.37
RF 130 Accuracy: 93.57
RF 150 Accuracy: 93.31
RF 170 Accuracy: 94.33
RF 190 Accuracy: 93.86


In [51]:
for n in range(170,300,10):
  rf_n = RandomForestClassifier(n_estimators=n)
  cross_val = cross_val_score(rf_n, X, y, scoring='accuracy', cv = 3)
  accuracy = cross_val.mean() * 100
  print('RF {} Accuracy: {:.2f}'.format(n, accuracy))

RF 170 Accuracy: 94.35
RF 180 Accuracy: 94.26
RF 190 Accuracy: 93.60
RF 200 Accuracy: 93.92
RF 210 Accuracy: 94.27
RF 220 Accuracy: 94.01
RF 230 Accuracy: 93.86
RF 240 Accuracy: 93.74
RF 250 Accuracy: 94.26
RF 260 Accuracy: 94.12
RF 270 Accuracy: 93.95
RF 280 Accuracy: 94.12
RF 290 Accuracy: 94.41


In [53]:
n=1000
rf_n = RandomForestClassifier(n_estimators=n)
cross_val = cross_val_score(rf_n, X, y, scoring='accuracy', cv = 3)
accuracy = cross_val.mean() * 100
print('RF {} Accuracy: {:.2f}'.format(n, accuracy))

RF 1000 Accuracy: 94.58
