In [2]:
import sys
import numpy as np
import pandas as pd
import csv
import nltk
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_svmlight_file

# Everything has been split roughly into 80:20 training:testing from the datasets
####################################################################################################
# mammogram vectorization 
# 961 samples
# contains '?' values for undefined/missing
# label 0 for benign, 1 for malignant
df = pd.read_csv('datasets/mammograms/mammographic_masses.data')
df = df.apply(pd.to_numeric, errors='coerce')
df = df.fillna(0)

mammograms = df.values

mammogram_features = mammograms[:,:-1]
mammogram_labels = mammograms[:,-1]

m_train, m_test, m_train_labels, m_test_labels = train_test_split(mammogram_features, mammogram_labels, test_size=.2, random_state=23)
####################################################################################################
# wisconsin breast cancer vectorization
# 669 samples
# contains '?' values for undefined/missing
# label 0 for benign, 1 for malignant

df1 = pd.read_csv('datasets/wisconsin_breast_cancer/breast-cancer-wisconsin.data')
df1 = df1.apply(pd.to_numeric, errors='coerce')
df1 = df1.fillna(0)
wbc = df1.values

wbc_features = wbc[:,1:-1]
wbc_labels = wbc[:,-1]

wbc_labels /= 2
wbc_labels -= 1

wbc_train, wbc_test, wbc_train_labels, wbc_test_labels = train_test_split(wbc_features, wbc_labels, test_size=.2, random_state=1)
####################################################################################################
# banknote authentication set
# 1372 samples
# label 0 for real, 1 for forgery

df2 = pd.read_csv('datasets/banknote/data_banknote_authentication.txt')
df2 = df2.apply(pd.to_numeric, errors='coerce')
bn = df2.values

bn_features = bn[:,:-1]
bn_labels = bn[:,-1]
bn_train, bn_test, bn_train_labels, bn_test_labels = train_test_split(bn_features, bn_labels, test_size=.2, random_state=10)

####################################################################################################
# covtype.binary
# 581,012 samples

cov_features, cov_labels = load_svmlight_file('datasets/covtype_binary/covtype.libsvm.binary')

cov_labels = cov_labels - 1

cov_train, cov_test, cov_train_labels, cov_test_labels = train_test_split(cov_features, cov_labels, test_size=.2, random_state=4)
####################################################################################################
# spam_sms vectorization
# 

#df2 = pd.read_csv('datasets/spam_sms/SMSSpamCollection')

#sms_raw = pd.read_csv("datasets/spam_sms/SMSSpamCollection",)

####################################################################################################
# AUS rain prediction vectorization
# ACHTUNG: might need more preprocessing because of text values in some

#df1 = pd.read_csv("datasets/AUS_rain/weatherAUS.csv")
#df_features = df1.drop(['RISK_MM','RainTomorrow'], axis=1)
#df_labels = df1[['RainTomorrow']]

#aus_features = df_features.values
#aus_labels = df_labels.values

[1. 0. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0.
 0. 1. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0.
 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1.
 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1.
 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1.
 1. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1.
 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0.
 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1.
 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0.

In [None]:
from scipy.special import expit as sigmoid

num_samples, num_features = m_train.shape        

gd_w = np.random.uniform(size=num_features)

gd_loss_history = []

def gradient_descent_square_error(X,y,epochs=50000,lr=0.001):
    num_samples, num_features = X.shape
    
    for i in xrange(epochs):
        global gd_w
        predictions = sigmoid(np.dot(X,gd_w))
        loss=np.sum((predictions-y)**2)
        gd_loss_history.append(loss)
    
        print("[INFO] epoch #{}, loss={:.7f}".format(i + 1, loss))
        
        gradient=np.dot(X.T,predictions-y) / num_samples
        gd_w+= -gradient * lr

def log_loss(X,y):
    global gd_w
    s = np.dot(X,gd_w)
    predictions=sigmoid(s)
    
    log_l = (-y*np.log(predictions)-(1-y)*np.log(1-predictions)).mean()
    return log_l
    
    
def gradient_descent_logistic_reg(X,y,epochs=50000,lr=0.01):
    num_samples, num_features = X.shape
    
    for i in xrange(epochs):
        global gd_w
        s = np.dot(X,gd_w)
        predictions = sigmoid(s)
        error = predictions-y
        gradient=np.dot(X.T,error)/num_samples
        
        gd_w += -gradient * lr
        if i % 1000 == 0:
            print "Loss on epoch %d: %f"%(i,log_loss(X,y))
#             print gd_w
    
    
def check_accuracy(X,y):
    num_samples, num_features = X.shape
    accurate = 0
    global gd_w
    
    for i in xrange(num_samples):
        prediction = sigmoid(np.dot(X[i],gd_w))
        label = 0
        
        if prediction >= .5:
            label = 1
        
       # print label
        #print y[i]
        if label == y[i]:
            accurate += 1
    print accurate
    print num_samples
    return accurate / float(num_samples)

# wipe gd_w
gradient_descent_logistic_reg(m_train, m_train_labels)
print (check_accuracy(m_test, m_test_labels))

num_samples, num_features = wbc_train.shape        

gd_w = np.random.uniform(size=num_features)
gradient_descent_logistic_reg(wbc_train, wbc_train_labels)
print (check_accuracy(wbc_test, wbc_test_labels))

# num_samples, num_features = cov_train.shape        

# gd_w = np.random.uniform(size=num_features)
# gradient_descent_logistic_reg(cov_train.toarray(), cov_train_labels,1000,0.1)
# print (check_accuracy(cov_test.toarray(), cov_test_labels))



Loss on epoch 0: nan
Loss on epoch 1000: 1.818712
Loss on epoch 2000: 1.610095
Loss on epoch 3000: 1.316671
Loss on epoch 4000: 1.908193
Loss on epoch 5000: 1.118550
Loss on epoch 6000: 1.116445
Loss on epoch 7000: 1.168656
Loss on epoch 8000: 1.832944
