In [5]:
import sys
import numpy as np
import pandas as pd
import csv
import nltk
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_svmlight_file

# Everything has been split roughly into 80:20 training:testing from the datasets
####################################################################################################
# mammogram vectorization 
# 961 samples
# contains '?' values for undefined/missing
# label 0 for benign, 1 for malignant
df = pd.read_csv('datasets/mammograms/mammographic_masses.data')
df = df.apply(pd.to_numeric, errors='coerce')
df = df.fillna(0)

mammograms = df.values

mammogram_features = mammograms[:,:-1]
mammogram_labels = mammograms[:,-1]

m_train, m_test, m_train_labels, m_test_labels = train_test_split(mammogram_features, mammogram_labels, test_size=.2, random_state=23)
####################################################################################################
# wisconsin breast cancer vectorization
# 669 samples
# contains '?' values for undefined/missing
# label 0 for benign, 1 for malignant

df1 = pd.read_csv('datasets/wisconsin_breast_cancer/breast-cancer-wisconsin.data')
df1 = df1.apply(pd.to_numeric, errors='coerce')
df1 = df1.fillna(0)
wbc = df1.values

wbc_features = wbc[:,1:-1]
wbc_labels = wbc[:,-1]

wbc_labels /= 2
wbc_labels -= 1

wbc_train, wbc_test, wbc_train_labels, wbc_test_labels = train_test_split(wbc_features, wbc_labels, test_size=.2, random_state=1)
####################################################################################################
# banknote authentication set
# 1372 samples
# label 0 for real, 1 for forgery

df2 = pd.read_csv('datasets/banknote/data_banknote_authentication.txt')
df2 = df2.apply(pd.to_numeric, errors='coerce')
bn = df2.values

bn_features = bn[:,:-1]
bn_labels = bn[:,-1]
bn_train, bn_test, bn_train_labels, bn_test_labels = train_test_split(bn_features, bn_labels, test_size=.2, random_state=10)

####################################################################################################
# news20 test

news_features, news_labels = load_svmlight_file('datasets/news20/news20.binary')

news_labels += 1
news_labels /= 2

news_train, news_test, news_train_labels, news_test_labels = train_test_split(news_features, news_labels, test_size = .2, random_state=10)

####################################################################################################
# cod-rna
cod_features, cod_labels = load_svmlight_file('datasets/cod_rna/cod-rna')

cod_labels += 1
cod_labels /= 2

cod_train,cod_test, cod_train_labels, cod_test_labels = train_test_split(cod_features, cod_labels, test_size = .2, random_state=10)


In [12]:
from scipy.special import expit as sigmoid

num_samples, num_features = m_train.shape        

gd_w = np.random.uniform(size=num_features)

gd_loss_history = []

def gradient_descent_square_error(X,y,epochs=50000,lr=0.001):
    num_samples, num_features = X.shape
    
    for i in xrange(epochs):
        global gd_w
        predictions = sigmoid(np.dot(X,gd_w))
        loss=np.sum((predictions-y)**2)
        gd_loss_history.append(loss)
    
        print("[INFO] epoch #{}, loss={:.7f}".format(i + 1, loss))
        
        gradient=np.dot(X.T,predictions-y) / num_samples
        gd_w+= -gradient * lr

def log_loss(X,y):
    global gd_w
    s = np.dot(X,gd_w)
    predictions=sigmoid(s)
    
    log_l = (-y*np.log(predictions)-(1-y)*np.log(1-predictions)).mean()
    return log_l
    
    
def gradient_descent_logistic_reg(X,y,epochs=5000,lr=0.01):
    num_samples, num_features = X.shape
    
    for i in xrange(epochs):
        global gd_w
        s = np.dot(X,gd_w)
        predictions = sigmoid(s)
        error = predictions-y
        gradient=np.dot(X.T,error)/num_samples
        
        gd_w += -gradient * lr
        if i % 100 == 0:
            print "Loss on epoch %d: %f"%(i,log_loss(X,y))
#             print gd_w
    
    
def check_accuracy(X,y):
    num_samples, num_features = X.shape
    accurate = 0
    global gd_w
    
    for i in xrange(num_samples):
        prediction = sigmoid(np.dot(X[i],gd_w))
        label = 0
        
        if prediction >= .5:
            label = 1
        
       # print label
        #print y[i]
        if label == y[i]:
            accurate += 1
    print accurate
    print num_samples
    return accurate / float(num_samples)

# wipe gd_w
gradient_descent_logistic_reg(m_train, m_train_labels,40000,0.0001)
print (check_accuracy(m_test, m_test_labels))

# num_samples, num_features = wbc_train.shape        

# gd_w = np.random.uniform(size=num_features)
# gradient_descent_logistic_reg(wbc_train, wbc_train_labels)
# print (check_accuracy(wbc_test, wbc_test_labels))


# num_samples, num_features = news_train.shape        

# print "hit"
# gd_w = np.random.uniform(size=num_features)
# gradient_descent_logistic_reg(news_train.toarray(), news_train_labels,1,0.1)
# print (check_accuracy(news_test.toarray(), news_test_labels))

num_samples, num_features = cod_train.shape        

print "hit"
gd_w = np.random.uniform(size=num_features)
gradient_descent_logistic_reg(cod_train.toarray(), cod_train_labels,1000,0.001)
print (check_accuracy(cod_test.toarray(), cod_test_labels))



Loss on epoch 0: nan
Loss on epoch 100: 8.868913
Loss on epoch 200: 1.666953
Loss on epoch 300: 0.709468
Loss on epoch 400: 0.708417
Loss on epoch 500: 0.707406
Loss on epoch 600: 0.706401
Loss on epoch 700: 0.705403
Loss on epoch 800: 0.704411
Loss on epoch 900: 0.703425
Loss on epoch 1000: 0.702447
Loss on epoch 1100: 0.701475
Loss on epoch 1200: 0.700510
Loss on epoch 1300: 0.699552
Loss on epoch 1400: 0.698601
Loss on epoch 1500: 0.697657
Loss on epoch 1600: 0.696720
Loss on epoch 1700: 0.695791
Loss on epoch 1800: 0.694869
Loss on epoch 1900: 0.693954
Loss on epoch 2000: 0.693047
Loss on epoch 2100: 0.692148
Loss on epoch 2200: 0.691257
Loss on epoch 2300: 0.690374
Loss on epoch 2400: 0.689499
Loss on epoch 2500: 0.688631
Loss on epoch 2600: 0.687772
Loss on epoch 2700: 0.686921
Loss on epoch 2800: 0.686078
Loss on epoch 2900: 0.685243
Loss on epoch 3000: 0.684417
Loss on epoch 3100: 0.683598
Loss on epoch 3200: 0.682788
Loss on epoch 3300: 0.681986
Loss on epoch 3400: 0.681191
Lo

Loss on epoch 30800: 0.584893
Loss on epoch 30900: 0.584757
Loss on epoch 31000: 0.584621
Loss on epoch 31100: 0.584486
Loss on epoch 31200: 0.584351
Loss on epoch 31300: 0.584218
Loss on epoch 31400: 0.584085
Loss on epoch 31500: 0.583953
Loss on epoch 31600: 0.583821
Loss on epoch 31700: 0.583691
Loss on epoch 31800: 0.583561
Loss on epoch 31900: 0.583431
Loss on epoch 32000: 0.583303
Loss on epoch 32100: 0.583175
Loss on epoch 32200: 0.583048
Loss on epoch 32300: 0.582922
Loss on epoch 32400: 0.582796
Loss on epoch 32500: 0.582671
Loss on epoch 32600: 0.582546
Loss on epoch 32700: 0.582423
Loss on epoch 32800: 0.582300
Loss on epoch 32900: 0.582177
Loss on epoch 33000: 0.582055
Loss on epoch 33100: 0.581934
Loss on epoch 33200: 0.581814
Loss on epoch 33300: 0.581694
Loss on epoch 33400: 0.581575
Loss on epoch 33500: 0.581457
Loss on epoch 33600: 0.581339
Loss on epoch 33700: 0.581221
Loss on epoch 33800: 0.581105
Loss on epoch 33900: 0.580989
Loss on epoch 34000: 0.580873
Loss on ep