In [25]:
import sys
import numpy as np
import pandas as pd
import csv
import nltk
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_svmlight_file

# Everything has been split roughly into 80:20 training:testing from the datasets
####################################################################################################
# mammogram vectorization 
# 961 samples
# contains '?' values for undefined/missing
# label 0 for benign, 1 for malignant
df = pd.read_csv('datasets/mammograms/mammographic_masses.data')
df = df.apply(pd.to_numeric, errors='coerce')
df = df.fillna(0)

mammograms = df.values

mammogram_features = mammograms[:,:-1]
mammogram_labels = mammograms[:,-1]

m_train, m_test, m_train_labels, m_test_labels = train_test_split(mammogram_features, mammogram_labels, test_size=.2, random_state=23)
####################################################################################################
# wisconsin breast cancer vectorization
# 669 samples
# contains '?' values for undefined/missing
# label 0 for benign, 1 for malignant

df1 = pd.read_csv('datasets/wisconsin_breast_cancer/breast-cancer-wisconsin.data')
df1 = df1.apply(pd.to_numeric, errors='coerce')
df1 = df1.fillna(0)
wbc = df1.values

wbc_features = wbc[:,1:-1]
wbc_labels = wbc[:,-1]

wbc_labels /= 2
wbc_labels -= 1

wbc_train, wbc_test, wbc_train_labels, wbc_test_labels = train_test_split(wbc_features, wbc_labels, test_size=.2, random_state=1)
####################################################################################################
# banknote authentication set
# 1372 samples
# label 0 for real, 1 for forgery

df2 = pd.read_csv('datasets/banknote/data_banknote_authentication.txt')
df2 = df2.apply(pd.to_numeric, errors='coerce')
bn = df2.values

bn_features = bn[:,:-1]
bn_labels = bn[:,-1]
bn_train, bn_test, bn_train_labels, bn_test_labels = train_test_split(bn_features, bn_labels, test_size=.2, random_state=10)

####################################################################################################
# covtype.binary
# 581,012 samples

cov_features, cov_labels = load_svmlight_file('datasets/covtype_binary/covtype.libsvm.binary')

cov_labels = cov_labels - 1

cov_train, cov_test, cov_train_labels, cov_test_labels = train_test_split(cov_features, cov_labels, test_size=.2, random_state=4)
####################################################################################################
# spam_sms vectorization
# 

#df2 = pd.read_csv('datasets/spam_sms/SMSSpamCollection')

#sms_raw = pd.read_csv("datasets/spam_sms/SMSSpamCollection",)

####################################################################################################
# AUS rain prediction vectorization
# ACHTUNG: might need more preprocessing because of text values in some

#df1 = pd.read_csv("datasets/AUS_rain/weatherAUS.csv")
#df_features = df1.drop(['RISK_MM','RainTomorrow'], axis=1)
#df_labels = df1[['RainTomorrow']]

#aus_features = df_features.values
#aus_labels = df_labels.values

In [44]:
from scipy.special import expit as sigmoid

num_samples, num_features = m_train.shape        

gd_w = np.random.uniform(size=num_features)

gd_loss_history = []

def gradient_descent_square_error(X,y,epochs=50000,lr=0.001):
    num_samples, num_features = X.shape
    
    for i in xrange(epochs):
        global gd_w
        predictions = sigmoid(np.dot(X,gd_w))
        loss=np.sum((predictions-y)**2)
        gd_loss_history.append(loss)
    
        print("[INFO] epoch #{}, loss={:.7f}".format(i + 1, loss))
        
        gradient=np.dot(X.T,predictions-y) / num_samples
        gd_w+= -gradient * lr

def log_loss(X,y):
    global gd_w
    s = np.dot(X,gd_w)
    predictions=sigmoid(s)
    
    log_l = (-y*np.log(predictions)-(1-y)*np.log(1-predictions)).mean()
    return log_l
    
    
def gradient_descent_logistic_reg(X,y,epochs=5000,lr=0.01):
    num_samples, num_features = X.shape
    
    for i in xrange(epochs):
        global gd_w
        s = np.dot(X,gd_w)
        predictions = sigmoid(s)
        error = predictions-y
        gradient=np.dot(X.T,error)/num_samples
        
        gd_w += -gradient * lr
        if i % 100 == 0:
            print "Loss on epoch %d: %f"%(i,log_loss(X,y))
#             print gd_w
    
    
def check_accuracy(X,y):
    num_samples, num_features = X.shape
    accurate = 0
    global gd_w
    
    for i in xrange(num_samples):
        prediction = sigmoid(np.dot(X[i],gd_w))
        label = 0
        
        if prediction >= .5:
            label = 1
        
       # print label
        #print y[i]
        if label == y[i]:
            accurate += 1
    print accurate
    print num_samples
    return accurate / float(num_samples)

# wipe gd_w
gradient_descent_logistic_reg(m_train, m_train_labels,40000,0.0001)
print (check_accuracy(m_test, m_test_labels))

num_samples, num_features = wbc_train.shape        

gd_w = np.random.uniform(size=num_features)
gradient_descent_logistic_reg(wbc_train, wbc_train_labels)
print (check_accuracy(wbc_test, wbc_test_labels))

# num_samples, num_features = cov_train.shape        

# gd_w = np.random.uniform(size=num_features)
# gradient_descent_logistic_reg(cov_train.toarray(), cov_train_labels,1000,0.1)
# print (check_accuracy(cov_test.toarray(), cov_test_labels))



Loss on epoch 0: nan
Loss on epoch 100: nan
Loss on epoch 200: 8.622407
Loss on epoch 300: 1.489364
Loss on epoch 400: 0.773710
Loss on epoch 500: 0.772283
Loss on epoch 600: 0.770879
Loss on epoch 700: 0.769480
Loss on epoch 800: 0.768088
Loss on epoch 900: 0.766702
Loss on epoch 1000: 0.765322
Loss on epoch 1100: 0.763949
Loss on epoch 1200: 0.762582
Loss on epoch 1300: 0.761221
Loss on epoch 1400: 0.759866
Loss on epoch 1500: 0.758518
Loss on epoch 1600: 0.757176
Loss on epoch 1700: 0.755841
Loss on epoch 1800: 0.754511
Loss on epoch 1900: 0.753188
Loss on epoch 2000: 0.751871
Loss on epoch 2100: 0.750560
Loss on epoch 2200: 0.749256
Loss on epoch 2300: 0.747958
Loss on epoch 2400: 0.746666
Loss on epoch 2500: 0.745380
Loss on epoch 2600: 0.744101
Loss on epoch 2700: 0.742828
Loss on epoch 2800: 0.741561
Loss on epoch 2900: 0.740300
Loss on epoch 3000: 0.739045
Loss on epoch 3100: 0.737797
Loss on epoch 3200: 0.736555
Loss on epoch 3300: 0.735318
Loss on epoch 3400: 0.734089
Loss on

Loss on epoch 30300: 0.582804
Loss on epoch 30400: 0.582652
Loss on epoch 30500: 0.582502
Loss on epoch 30600: 0.582353
Loss on epoch 30700: 0.582204
Loss on epoch 30800: 0.582057
Loss on epoch 30900: 0.581910
Loss on epoch 31000: 0.581764
Loss on epoch 31100: 0.581620
Loss on epoch 31200: 0.581476
Loss on epoch 31300: 0.581333
Loss on epoch 31400: 0.581191
Loss on epoch 31500: 0.581050
Loss on epoch 31600: 0.580910
Loss on epoch 31700: 0.580770
Loss on epoch 31800: 0.580632
Loss on epoch 31900: 0.580494
Loss on epoch 32000: 0.580358
Loss on epoch 32100: 0.580222
Loss on epoch 32200: 0.580087
Loss on epoch 32300: 0.579953
Loss on epoch 32400: 0.579819
Loss on epoch 32500: 0.579687
Loss on epoch 32600: 0.579555
Loss on epoch 32700: 0.579424
Loss on epoch 32800: 0.579294
Loss on epoch 32900: 0.579165
Loss on epoch 33000: 0.579036
Loss on epoch 33100: 0.578909
Loss on epoch 33200: 0.578782
Loss on epoch 33300: 0.578656
Loss on epoch 33400: 0.578530
Loss on epoch 33500: 0.578406
Loss on ep