In [1]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import pandas as pd
import numpy as np
# Load the responses table from disk and report its (rows, columns) shape.
df = pd.read_csv("responses.csv")
print(df.shape)

(1010, 150)


In [3]:
# Build the feature matrix from columns 133..139 (7 columns) as float64.
# DataFrame.as_matrix() was removed in pandas 1.0; select the columns first
# and convert with to_numpy so no mixed-dtype object array is materialised.
X = df.iloc[:, 133:140].to_numpy(dtype=float)
print(X.shape, X.dtype)

(1010, 7) float64


In [4]:
# Raw education level per row, kept as a NumPy object array.
# Series.as_matrix() was removed in pandas 1.0; to_numpy is the replacement.
labels = df["Education"].to_numpy(dtype=object)
print(labels.dtype)

object


In [5]:
# Delete rows that have empty cells.
# A row is flagged when any of its feature columns is NaN; computed in one
# vectorised pass so the row/column counts are no longer hard-coded.
empty = np.isnan(X).any(axis=1)
# Row 831 does not have an education level, so drop it as well.
# NOTE(review): this index refers to the unfiltered data; this cell must run
# exactly once after X and labels are built.
empty[831] = True
X = X[~empty]
labels = labels[~empty]
print(X.shape, labels.shape)

(994, 7) (994,)


In [6]:
# Encode each education level string as a numeric class id (stored as float64).
EDUCATION_CODES = {
    "currently a primary school pupil": 0,
    "primary school": 1,
    "secondary school": 2,
    "college/bachelor degree": 3,
    "masters degree": 4,
}
# Sized from `labels` instead of a hard-coded row count.
# NOTE(review): any level not listed above falls back to 0, exactly as the
# original if-chain did; confirm no other education values occur in the data.
label = np.array([EDUCATION_CODES.get(level, 0) for level in labels], dtype=float)

In [7]:
# split the data into training, validation, and testing set. 
# Rows are partitioned in file order: the first 800 for training, the next
# 100 for validation, and everything from row 900 onwards for testing.
train_spending, validation_spending, test_spending = X[:800], X[800:900], X[900:]
train_labels, validation_labels, test_labels = label[:800], label[800:900], label[900:]

In [8]:
import torch
from torch import autograd
import torch.nn.functional as F
from torch.autograd import Variable

import random

In [9]:
def train(batch_size):
    """Run one optimisation step on a random mini-batch.

    Uses the notebook-level ``model``, ``optimizer``, ``train_spending`` and
    ``train_labels``.

    Args:
        batch_size: Number of training rows to sample (without replacement).

    Returns:
        The cross-entropy loss of the mini-batch as a Python float.
    """
    # model.train() puts our model in train mode, which can require different
    # behavior than eval mode (for example in the case of dropout).
    model.train()
    # batch_idx is a 1-D array with shape [batch_size]
    batch_idx = np.random.choice(train_spending.shape[0], size=batch_size, replace=False)
    x = torch.from_numpy(train_spending[batch_idx].astype(np.float32))
    # cross_entropy needs int64 class indices; the removed np.int alias was
    # the platform C long and therefore not guaranteed to be 64-bit.
    y = torch.from_numpy(train_labels[batch_idx].astype(np.int64))
    optimizer.zero_grad()
    y_hat_ = model(x)
    loss = F.cross_entropy(y_hat_, y)
    loss.backward()
    optimizer.step()
    # The loss is a 0-dim tensor; .item() extracts the scalar (indexing it
    # with [0] raises on current PyTorch).
    return loss.item()

In [10]:
# Define an accuracy function
def accuracy(y, y_hat):
    """Compute accuracy.

    Args:
        y: A 1-D array (or sequence) of true labels.
        y_hat: A 1-D array (or sequence) of predicted labels, same length as y.

    Returns:
        A float, the fraction of positions where y[i] == y_hat[i].
    """
    # np.asarray lets plain lists be compared element-wise as well.
    matches = np.asarray(y) == np.asarray(y_hat)
    # The np.float alias was removed from NumPy; the builtin float is equivalent.
    return matches.astype(float).mean()

In [11]:
def approx_train_accuracy(n_samples=100):
    """Estimate training accuracy on a random subset of the training rows.

    Uses the notebook-level ``model``, ``train_spending``, ``train_labels``
    and ``accuracy``.

    Args:
        n_samples: How many training rows to sample (without replacement).
            Capped at the number of available training rows.

    Returns:
        A float, the fraction of sampled rows whose predicted class equals
        the stored label.
    """
    n_train = train_spending.shape[0]
    subset = random.sample(range(n_train), min(n_samples, n_train))
    features = torch.from_numpy(train_spending[subset].astype(np.float32))

    # Score in eval mode (consistent with val_accuracy), then put the model
    # back into whatever mode it was in before the call.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # One batched forward pass instead of one pass per row.
            _, y_pred = torch.max(model(features), 1)
    finally:
        model.train(was_training)
    return accuracy(train_labels[subset], y_pred.numpy())

In [12]:
def val_accuracy():
    """Compute accuracy on the whole validation split.

    Uses the notebook-level ``model``, ``validation_spending``,
    ``validation_labels`` and ``accuracy``. Leaves the model in eval mode.

    Returns:
        A float, the fraction of validation rows whose predicted class equals
        the stored label.
    """
    model.eval()
    features = torch.from_numpy(validation_spending.astype(np.float32))
    with torch.no_grad():
        # One batched forward pass over every validation row; the row count
        # and number of classes are taken from the data and the model output.
        _, y_pred = torch.max(model(features), 1)
    return accuracy(validation_labels, y_pred.numpy())

In [13]:
class TwoLayerNN(torch.nn.Module):
    """Feed-forward classifier: 7 inputs -> 20 -> 10 -> 5 output scores.

    The two hidden layers use a sigmoid activation; the final layer returns
    raw scores with no activation applied.
    """

    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(7, 20)
        self.linear2 = torch.nn.Linear(20, 10)
        self.linear3 = torch.nn.Linear(10, 5)

    def forward(self, x):
        """Map a tensor whose last dimension is 7 to 5 raw class scores."""
        # torch.sigmoid replaces the deprecated torch.nn.functional.sigmoid.
        hidden1 = torch.sigmoid(self.linear1(x))
        hidden2 = torch.sigmoid(self.linear2(hidden1))
        return self.linear3(hidden2)

In [14]:
# Seed every random number generator the training and evaluation code draws
# from (stdlib random, NumPy, PyTorch) so a fresh run repeats the same numbers.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

N_STEPS = 1000     # number of optimisation steps
BATCH_SIZE = 50    # rows per mini-batch
EVAL_EVERY = 50    # report accuracies every this many steps

# Create model
model = TwoLayerNN()
# Create optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Accuracy histories and the step number at which each entry was recorded.
train_accs, val_accs = [], []
index = []
for i in range(N_STEPS):
    train(BATCH_SIZE)
    if i % EVAL_EVERY == 0:
        train_accs.append(approx_train_accuracy())
        val_accs.append(val_accuracy())
        index.append(i)
        print("%6d %5.2f %5.2f" % (i, train_accs[-1], val_accs[-1]))

     0  0.04  0.03
    50  0.56  0.65
   100  0.61  0.65
   150  0.62  0.65
   200  0.62  0.65
   250  0.57  0.65
   300  0.55  0.65
   350  0.60  0.65
   400  0.60  0.65
   450  0.54  0.65
   500  0.64  0.65
   550  0.55  0.65
   600  0.63  0.65
   650  0.52  0.65
   700  0.66  0.65
   750  0.66  0.65
   800  0.54  0.65
   850  0.66  0.65
   900  0.62  0.65
   950  0.62  0.65


In [15]:
from sklearn.decomposition import PCA
# Fit the whitened 5-component PCA on the training rows only, then project
# the validation rows with that same fitted transform. Calling fit_transform
# on the validation rows would refit the PCA and place them in a different
# projected space from the training data.
pca = PCA(n_components=5, whiten=True)
b = pca.fit_transform(train_spending)
v = pca.transform(validation_spending)

In [16]:
# K nearest neighbor
from sklearn import neighbors

# Fit a 25-nearest-neighbour classifier on the PCA-projected training rows,
# then report its accuracy on the projected validation rows.
clf = neighbors.KNeighborsClassifier(n_neighbors=25)
clf.fit(b, train_labels)
y_pred = clf.predict(v)

print(accuracy(validation_labels, y_pred))

0.65


In [17]:
#Decision Tree
from sklearn import tree
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of decision trees with max_depth 1 through 4.
# NOTE(review): this scores on all of X/label, whereas the other models in
# this notebook are scored on the validation split, so the numbers may not
# be directly comparable ("apples to apples?").
for depth in (1, 2, 3, 4):
    clf = tree.DecisionTreeClassifier(max_depth=depth)
    s = cross_val_score(clf, X, label, cv=10)
    # Columns: depth, mean fold score, std of fold scores, worst fold score.
    print (depth, s.mean(), s.std(), s.min())


1 0.61676713293 0.00573236791368 0.61
2 0.61676713293 0.00573236791368 0.61
3 0.611695388769 0.00612547770898 0.6
4 0.601653978505 0.0136160189526 0.565656565657


In [18]:
from sklearn.ensemble import RandomForestClassifier

# 10-fold cross-validation of a 10-tree random forest with depth-3 trees,
# run on all of X/label; the per-fold scores are the cell's displayed value.
clf = RandomForestClassifier(max_depth=3, n_estimators=10)
cross_val_score(clf, X, label, cv=10)

array([ 0.61386139,  0.61386139,  0.61386139,  0.61      ,  0.61      ,
        0.61616162,  0.61616162,  0.62244898,  0.62244898,  0.62886598])

In [19]:
#Naive Bayes
from sklearn.naive_bayes import GaussianNB
# Fit Gaussian naive Bayes on the training split and report its accuracy on
# the validation split.
gnb = GaussianNB()
gnb.fit(train_spending, train_labels)
y_pred = gnb.predict(validation_spending)

print(accuracy(validation_labels, y_pred))

0.64


In [20]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
# Fit logistic regression (default settings) on the training split and
# report its accuracy on the validation split.
clf = LogisticRegression().fit(train_spending, train_labels)
y_pred = clf.predict(validation_spending)

print(accuracy(validation_labels, y_pred))

0.65


In [21]:
# SVM
from sklearn.svm import SVC

# Fit a linear-kernel SVM on the training split and report its accuracy on
# the validation split. (The sigmoid kernel also gives the same accuracy.)
clf = SVC(kernel='linear').fit(train_spending, train_labels)
y_pred = clf.predict(validation_spending)

print(accuracy(validation_labels, y_pred))

0.65
