In [1]:
import sys
import numpy as np
import pandas as pd
import csv
import nltk
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_svmlight_file

# Everything has been split roughly into 80:20 training:testing from the datasets
####################################################################################################
# mammogram vectorization 
# 961 samples
# contains '?' values for undefined/missing
# label 0 for benign, 1 for malignant
df = pd.read_csv('datasets/mammograms/mammographic_masses.data')
df = df.apply(pd.to_numeric, errors='coerce')
df = df.fillna(0)

mammograms = df.values

mammogram_features = mammograms[:,:-1]
mammogram_labels = mammograms[:,-1]

m_train, m_test, m_train_labels, m_test_labels = train_test_split(mammogram_features, mammogram_labels, test_size=.2, random_state=23)
####################################################################################################
# wisconsin breast cancer vectorization
# 669 samples
# contains '?' values for undefined/missing
# label 0 for benign, 1 for malignant

df1 = pd.read_csv('datasets/wisconsin_breast_cancer/breast-cancer-wisconsin.data')
df1 = df1.apply(pd.to_numeric, errors='coerce')
df1 = df1.fillna(0)
wbc = df1.values

wbc_features = wbc[:,1:-1]
wbc_labels = wbc[:,-1]

wbc_labels /= 2
wbc_labels -= 1

wbc_train, wbc_test, wbc_train_labels, wbc_test_labels = train_test_split(wbc_features, wbc_labels, test_size=.2, random_state=1)
print wbc_train_labels
####################################################################################################
# banknote authentication set
# 1372 samples
# label 0 for real, 1 for forgery

df2 = pd.read_csv('datasets/banknote/data_banknote_authentication.txt')
df2 = df2.apply(pd.to_numeric, errors='coerce')
bn = df2.values

bn_features = bn[:,:-1]
bn_labels = bn[:,-1]
bn_train, bn_test, bn_train_labels, bn_test_labels = train_test_split(bn_features, bn_labels, test_size=.2, random_state=10)

####################################################################################################
# covtype.binary
# 581,012 samples

cov_features, cov_labels = load_svmlight_file('datasets/covtype_binary/covtype.libsvm.binary')

cov_labels = cov_labels - 1

cov_train, cov_test, cov_train_labels, cov_test_labels = train_test_split(cov_features, cov_labels, test_size=.2, random_state=4)
####################################################################################################
# spam_sms vectorization
# 

#df2 = pd.read_csv('datasets/spam_sms/SMSSpamCollection')

#sms_raw = pd.read_csv("datasets/spam_sms/SMSSpamCollection",)

####################################################################################################
# AUS rain prediction vectorization
# ACHTUNG: might need more preprocessing because of text values in some

#df1 = pd.read_csv("datasets/AUS_rain/weatherAUS.csv")
#df_features = df1.drop(['RISK_MM','RainTomorrow'], axis=1)
#df_labels = df1[['RainTomorrow']]

#aus_features = df_features.values
#aus_labels = df_labels.values

[1. 0. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 1. 0.
 0. 1. 0. 0. 0. 0. 1. 1. 1. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0.
 1. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 0. 0. 1.
 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 1.
 0. 1. 0. 1. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1. 1. 1. 1.
 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 1. 0. 0. 0. 1. 1. 0. 0. 0. 1. 0. 0. 0. 1.
 0. 1. 0. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 0. 0. 0. 1. 0. 1.
 1. 0. 0. 0. 1. 0. 1. 0. 1. 1. 0. 0. 1. 0. 0. 1. 0. 0. 1. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 1. 1. 0. 0. 0. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 1.
 1. 0. 0. 1. 1. 0. 1. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 0.
 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 1. 0. 1. 1. 0. 0. 0. 0. 0. 0. 1. 0.
 1. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 1. 0. 0. 1. 1. 1. 1. 0. 1. 0. 0. 1. 1.
 0. 0. 0. 0. 0. 1. 0. 0. 1. 0. 0. 1. 1. 0. 1. 0. 0.

In [3]:
from scipy.special import expit as sigmoid

num_samples, num_features = m_train.shape        

gd_w = np.random.uniform(size=num_features)

gd_loss_history = []

def gradient_descent(X,y,epochs=50000,lr=0.001):
    num_samples, num_features = X.shape
    
    for i in xrange(epochs):
        global gd_w
        predictions = sigmoid(np.dot(X,gd_w))
        loss=np.sum((predictions-y)**2)
        gd_loss_history.append(loss)
    
        print("[INFO] epoch #{}, loss={:.7f}".format(i + 1, loss))
        
        gradient=np.dot(X.T,predictions-y) / num_samples
        gd_w+= -gradient * lr
        
def check_accuracy(X,y):
    num_samples, num_features = X.shape
    accurate = 0
    global gd_w
    
    for i in xrange(num_samples):
        prediction = sigmoid(np.dot(X[i],gd_w))
        label = 0
        
        if prediction >= .5:
            label = 1
        
       # print label
        #print y[i]
        if label == y[i]:
            accurate += 1
    print accurate
    print num_samples
    return accurate / float(num_samples)

# wipe gd_w
#gradient_descent(m_train, m_train_labels)
#print (check_accuracy(m_test, m_test_labels))

# num_samples, num_features = wbc_train.shape        

# gd_w = np.random.uniform(size=num_features)
# gradient_descent(wbc_train, wbc_train_labels)
# print (check_accuracy(wbc_test, wbc_test_labels))

num_samples, num_features = cov_train.shape        

gd_w = np.random.uniform(size=num_features)
gradient_descent(cov_train.toarray(), cov_train_labels,1000,0.1)
print (check_accuracy(cov_test.toarray(), cov_test_labels))

[INFO] epoch #1, loss=238400.0000000
[INFO] epoch #2, loss=226409.0000000
[INFO] epoch #3, loss=204985.0497057
[INFO] epoch #4, loss=226409.0000000
[INFO] epoch #5, loss=238400.0000000
[INFO] epoch #6, loss=226409.0000000
[INFO] epoch #7, loss=238400.0000000
[INFO] epoch #8, loss=226409.0000000
[INFO] epoch #9, loss=238400.0000000
[INFO] epoch #10, loss=218275.4214128
[INFO] epoch #11, loss=238400.0000000
[INFO] epoch #12, loss=226409.0000000
[INFO] epoch #13, loss=238400.0000000
[INFO] epoch #14, loss=226191.9749180
[INFO] epoch #15, loss=238400.0000000
[INFO] epoch #16, loss=223063.6401384
[INFO] epoch #17, loss=238400.0000000
[INFO] epoch #18, loss=224128.9971166
[INFO] epoch #19, loss=238400.0000000
[INFO] epoch #20, loss=223654.0356143
[INFO] epoch #21, loss=238400.0000000
[INFO] epoch #22, loss=223910.9999876
[INFO] epoch #23, loss=238400.0000000
[INFO] epoch #24, loss=223813.0305822
[INFO] epoch #25, loss=238400.0000000
[INFO] epoch #26, loss=223851.9981950
[INFO] epoch #27, los

[INFO] epoch #217, loss=229773.9836906
[INFO] epoch #218, loss=219736.9427500
[INFO] epoch #219, loss=229768.0962631
[INFO] epoch #220, loss=219732.5660499
[INFO] epoch #221, loss=229765.8546310
[INFO] epoch #222, loss=219734.9849075
[INFO] epoch #223, loss=229771.8655227
[INFO] epoch #224, loss=219741.0607451
[INFO] epoch #225, loss=229761.1501471
[INFO] epoch #226, loss=219736.9617531
[INFO] epoch #227, loss=229757.6303806
[INFO] epoch #228, loss=219728.0028735
[INFO] epoch #229, loss=229751.2305694
[INFO] epoch #230, loss=219725.0074863
[INFO] epoch #231, loss=229745.8407936
[INFO] epoch #232, loss=219724.9998480
[INFO] epoch #233, loss=229747.9291768
[INFO] epoch #234, loss=219717.0040066
[INFO] epoch #235, loss=229740.9538826
[INFO] epoch #236, loss=219714.0056513
[INFO] epoch #237, loss=229745.0967158
[INFO] epoch #238, loss=219717.9659911
[INFO] epoch #239, loss=229736.3184283
[INFO] epoch #240, loss=219719.0000065
[INFO] epoch #241, loss=229740.4197129
[INFO] epoch #242, loss=2

[INFO] epoch #431, loss=229705.7735392
[INFO] epoch #432, loss=219557.3879872
[INFO] epoch #433, loss=229710.9501994
[INFO] epoch #434, loss=219555.2343127
[INFO] epoch #435, loss=229721.2931913
[INFO] epoch #436, loss=219562.0640315
[INFO] epoch #437, loss=229722.4363096
[INFO] epoch #438, loss=219549.9945788
[INFO] epoch #439, loss=229716.2226713
[INFO] epoch #440, loss=219555.0091148
[INFO] epoch #441, loss=229716.9072664
[INFO] epoch #442, loss=219560.9913050
[INFO] epoch #443, loss=229715.1496278
[INFO] epoch #444, loss=219553.0021735
[INFO] epoch #445, loss=229716.4677421
[INFO] epoch #446, loss=219548.4136617
[INFO] epoch #447, loss=229711.8138077
[INFO] epoch #448, loss=219544.9998958
[INFO] epoch #449, loss=229710.8351122
[INFO] epoch #450, loss=219543.0001407
[INFO] epoch #451, loss=229711.3492832
[INFO] epoch #452, loss=219544.9979300
[INFO] epoch #453, loss=229723.7807752
[INFO] epoch #454, loss=219551.0000000
[INFO] epoch #455, loss=229728.0306870
[INFO] epoch #456, loss=2

[INFO] epoch #643, loss=229871.9571012
[INFO] epoch #644, loss=219432.0000000
[INFO] epoch #645, loss=229876.9524211
[INFO] epoch #646, loss=219435.7281720
[INFO] epoch #647, loss=229877.2637913
[INFO] epoch #648, loss=219424.9986883
[INFO] epoch #649, loss=229884.6690991
[INFO] epoch #650, loss=219424.5828011
[INFO] epoch #651, loss=229877.5195516
[INFO] epoch #652, loss=219422.1347860
[INFO] epoch #653, loss=229867.7904683
[INFO] epoch #654, loss=219426.1079931
[INFO] epoch #655, loss=229876.9858389
[INFO] epoch #656, loss=219420.9914367
[INFO] epoch #657, loss=229866.9194481
[INFO] epoch #658, loss=219426.0577469
[INFO] epoch #659, loss=229867.4360736
[INFO] epoch #660, loss=219418.9999971
[INFO] epoch #661, loss=229869.8878659
[INFO] epoch #662, loss=219417.9261701
[INFO] epoch #663, loss=229870.5649696
[INFO] epoch #664, loss=219412.8416016
[INFO] epoch #665, loss=229871.7095266
[INFO] epoch #666, loss=219410.9925318
[INFO] epoch #667, loss=229866.0776388
[INFO] epoch #668, loss=2

[INFO] epoch #858, loss=219366.8255552
[INFO] epoch #859, loss=230039.9623220
[INFO] epoch #860, loss=219369.0400933
[INFO] epoch #861, loss=230036.3137478
[INFO] epoch #862, loss=219372.9949717
[INFO] epoch #863, loss=230037.3882589
[INFO] epoch #864, loss=219367.2186683
[INFO] epoch #865, loss=230040.5205638
[INFO] epoch #866, loss=219365.9999477
[INFO] epoch #867, loss=230040.7076195
[INFO] epoch #868, loss=219364.6448619
[INFO] epoch #869, loss=230042.5622233
[INFO] epoch #870, loss=219363.8163193
[INFO] epoch #871, loss=230043.3762480
[INFO] epoch #872, loss=219360.0018049
[INFO] epoch #873, loss=230044.9329835
[INFO] epoch #874, loss=219363.0000053
[INFO] epoch #875, loss=230042.9870937
[INFO] epoch #876, loss=219369.1074358
[INFO] epoch #877, loss=230044.1730879
[INFO] epoch #878, loss=219368.9999619
[INFO] epoch #879, loss=230045.6393682
[INFO] epoch #880, loss=219369.9999130
[INFO] epoch #881, loss=230044.4260363
[INFO] epoch #882, loss=219369.0000000
[INFO] epoch #883, loss=2