In [1]:
import sys
import numpy as np
import pandas as pd
import csv
import nltk
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_svmlight_file

# Everything has been split roughly into 80:20 training:testing from the datasets
####################################################################################################
# mammogram vectorization 
# 961 samples
# contains '?' values for undefined/missing
# label 0 for benign, 1 for malignant
df = pd.read_csv('datasets/mammograms/mammographic_masses.data')
df = df.apply(pd.to_numeric, errors='coerce')
df = df.fillna(0)

mammograms = df.values

mammogram_features = mammograms[:,:-1]
mammogram_labels = mammograms[:,-1]

m_train, m_test, m_train_labels, m_test_labels = train_test_split(mammogram_features, mammogram_labels, test_size=.2, random_state=23)
####################################################################################################
# wisconsin breast cancer vectorization
# 669 samples
# contains '?' values for undefined/missing
# label 0 for benign, 1 for malignant

df1 = pd.read_csv('datasets/wisconsin_breast_cancer/breast-cancer-wisconsin.data')
df1 = df1.apply(pd.to_numeric, errors='coerce')
df1 = df1.fillna(0)
wbc = df1.values

wbc_features = wbc[:,1:-1]
wbc_labels = wbc[:,-1]

wbc_labels /= 2
wbc_labels -= 1

wbc_train, wbc_test, wbc_train_labels, wbc_test_labels = train_test_split(wbc_features, wbc_labels, test_size=.2, random_state=1)

####################################################################################################
# banknote authentication set
# 1372 samples
# label 0 for real, 1 for forgery

df2 = pd.read_csv('datasets/banknote/data_banknote_authentication.txt')
df2 = df2.apply(pd.to_numeric, errors='coerce')
bn = df2.values

bn_features = bn[:,:-1]
bn_labels = bn[:,-1]
bn_train, bn_test, bn_train_labels, bn_test_labels = train_test_split(bn_features, bn_labels, test_size=.2, random_state=10)

####################################################################################################
# covtype.binary
# 581,012 samples

cov_features, cov_labels = load_svmlight_file('datasets/covtype_binary/covtype.libsvm.binary')

cov_labels = cov_labels - 1

cov_train, cov_test, cov_train_labels, cov_test_labels = train_test_split(cov_features, cov_labels, test_size=.2, random_state=4)
print cov_train_labels

####################################################################################################
# spam_sms vectorization
# 

#df2 = pd.read_csv('datasets/spam_sms/SMSSpamCollection')

#sms_raw = pd.read_csv("datasets/spam_sms/SMSSpamCollection",)

####################################################################################################
# AUS rain prediction vectorization
# ACHTUNG: might need more preprocessing because of text values in some

#df1 = pd.read_csv("datasets/AUS_rain/weatherAUS.csv")
#df_features = df1.drop(['RISK_MM','RainTomorrow'], axis=1)
#df_labels = df1[['RainTomorrow']]

#aus_features = df_features.values
#aus_labels = df_labels.values

[0. 0. 1. ... 1. 1. 1.]


In [5]:
from scipy.special import expit as sigmoid

num_samples, num_features = m_train.shape        

gd_w = np.random.uniform(size=num_features)

gd_loss_history = []

def gradient_descent(X,y,epochs=1000,lr=0.01):
    num_samples, num_features = X.shape
    
    for i in xrange(epochs):
        predictions = sigmoid(np.dot(X,gd_w))

        loss=np.sum((predictions-y)**2)
        gd_loss_history.append(loss)
    
        print("[INFO] epoch #{}, loss={:.7f}".format(i + 1, loss))
        
        gradient=np.dot(X.T,predictions-y) / num_samples
        
        global gd_w 
        gd_w+= -gradient * lr
        
gradient_descent(m_train, m_train_labels)

  global gd_w


[INFO] epoch #1, loss=417.0059397
[INFO] epoch #2, loss=416.6688562
[INFO] epoch #3, loss=302.8428358
[INFO] epoch #4, loss=416.2938248
[INFO] epoch #5, loss=333.7378662
[INFO] epoch #6, loss=416.3101724
[INFO] epoch #7, loss=332.9007358
[INFO] epoch #8, loss=416.3138203
[INFO] epoch #9, loss=332.4525376
[INFO] epoch #10, loss=416.3117029
[INFO] epoch #11, loss=332.1956988
[INFO] epoch #12, loss=416.3070544
[INFO] epoch #13, loss=332.0243977
[INFO] epoch #14, loss=416.3012845
[INFO] epoch #15, loss=331.8899858
[INFO] epoch #16, loss=416.2949877
[INFO] epoch #17, loss=331.7715293
[INFO] epoch #18, loss=416.2884103
[INFO] epoch #19, loss=331.6602719
[INFO] epoch #20, loss=416.2816533
[INFO] epoch #21, loss=331.5525971
[INFO] epoch #22, loss=416.2747570
[INFO] epoch #23, loss=331.4470222
[INFO] epoch #24, loss=416.2677369
[INFO] epoch #25, loss=331.3429438
[INFO] epoch #26, loss=416.2605981
[INFO] epoch #27, loss=331.2401198
[INFO] epoch #28, loss=416.2533412
[INFO] epoch #29, loss=331.13

[INFO] epoch #408, loss=404.6996479
[INFO] epoch #409, loss=323.5594087
[INFO] epoch #410, loss=404.4990853
[INFO] epoch #411, loss=323.4992957
[INFO] epoch #412, loss=404.2957759
[INFO] epoch #413, loss=323.4380967
[INFO] epoch #414, loss=404.0896948
[INFO] epoch #415, loss=323.3758038
[INFO] epoch #416, loss=403.8808174
[INFO] epoch #417, loss=323.3124095
[INFO] epoch #418, loss=403.6691198
[INFO] epoch #419, loss=323.2479059
[INFO] epoch #420, loss=403.4545788
[INFO] epoch #421, loss=323.1822852
[INFO] epoch #422, loss=403.2371722
[INFO] epoch #423, loss=323.1155394
[INFO] epoch #424, loss=403.0168784
[INFO] epoch #425, loss=323.0476605
[INFO] epoch #426, loss=402.7936771
[INFO] epoch #427, loss=322.9786407
[INFO] epoch #428, loss=402.5675488
[INFO] epoch #429, loss=322.9084718
[INFO] epoch #430, loss=402.3384753
[INFO] epoch #431, loss=322.8371459
[INFO] epoch #432, loss=402.1064395
[INFO] epoch #433, loss=322.7646549
[INFO] epoch #434, loss=401.8714255
[INFO] epoch #435, loss=322.

[INFO] epoch #824, loss=300.5183021
[INFO] epoch #825, loss=280.4106959
[INFO] epoch #826, loss=299.7066674
[INFO] epoch #827, loss=280.0114283
[INFO] epoch #828, loss=298.8930545
[INFO] epoch #829, loss=279.6101263
[INFO] epoch #830, loss=298.0775350
[INFO] epoch #831, loss=279.2068148
[INFO] epoch #832, loss=297.2601836
[INFO] epoch #833, loss=278.8015203
[INFO] epoch #834, loss=296.4410783
[INFO] epoch #835, loss=278.3942715
[INFO] epoch #836, loss=295.6203006
[INFO] epoch #837, loss=277.9850993
[INFO] epoch #838, loss=294.7979357
[INFO] epoch #839, loss=277.5740367
[INFO] epoch #840, loss=293.9740728
[INFO] epoch #841, loss=277.1611191
[INFO] epoch #842, loss=293.1488052
[INFO] epoch #843, loss=276.7463845
[INFO] epoch #844, loss=292.3222306
[INFO] epoch #845, loss=276.3298736
[INFO] epoch #846, loss=291.4944514
[INFO] epoch #847, loss=275.9116298
[INFO] epoch #848, loss=290.6655751
[INFO] epoch #849, loss=275.4916997
[INFO] epoch #850, loss=289.8357144
[INFO] epoch #851, loss=275.