# Homework 3 Code

In [None]:
import sys
from datetime import datetime

import numpy as np
import pandas as pd

 ## Find test error

The `find_test_error` function computes the test error of a linear classifier $w$. 

The hypothesis is assumed to be of the form $sign([1, x(N,:)] \cdot w)$.

Inputs:
* `w` is the weight vector
* `X` is the data matrix (without an initial column of 1's)
* `y` are the data labels (plus or minus 1)

Outputs:
* `test_error` is the binary error of $w$ on the data set $(X, y)$; this should be between 0 and 1. 

In [8]:
def find_test_error(w, X, y):
    """Return the binary classification error of the linear classifier ``w``.

    Every sample ``x_n`` is classified as ``sign([1, x_n] . w)`` and compared
    with its label.

    Parameters
    ----------
    w : sequence of numbers
        Weight vector, bias weight first (one entry more than a row of ``X``).
    X : sequence of rows
        Data matrix without the leading column of 1's. Rows may be lists,
        tuples or 1-D arrays.
    y : sequence of numbers
        Labels (plus or minus 1), one per row of ``X``.

    Returns
    -------
    Fraction of samples whose predicted sign differs from the label, between
    0 and 1. A score of exactly 0 gives ``sign == 0``, which never equals a
    +/-1 label and is therefore counted as an error.

    Raises
    ------
    ValueError
        If ``y`` is empty or ``X`` and ``y`` have different lengths.
    """
    # list(x_n) makes the bias prepend a real concatenation for any row type;
    # "[1] + ndarray" would instead add 1 to every feature.
    X = [[1] + list(x_n) for x_n in X]
    if len(y) == 0:
        raise ValueError("find_test_error needs at least one labelled sample")
    if len(X) != len(y):
        raise ValueError("X and y must contain the same number of samples")

    classifications = [np.sign(np.dot(x_n, w)) for x_n in X]
    # Timing log emitted on every evaluation.
    print("Datetime end", datetime.now())
    # Compare against the y argument, not a notebook-level variable.
    misclassified = sum(
        classification != y_n for classification, y_n in zip(classifications, y)
    )
    return misclassified / len(y)

In [9]:
def sigmoid(s):
    """Logistic function ``1 / (1 + e^(-s))``, evaluated with ``np.exp``."""
    exp_neg_s = np.exp(-s)
    return 1 / (1 + exp_neg_s)

 ## Logistic Regression

The `logistic_reg` function learns a logistic regression model using gradient descent.

Inputs:
* `X` is the data matrix (without an initial column of 1's)
* `y` are the data labels (plus or minus 1)
* `w_init` is the initial value of the w vector ($d+1$ dimensional)
* `max_its` is the maximum number of iterations to run for
* `eta` is the learning rate

Outputs:
* t is the number of iterations gradient descent ran for
* w is the learned weight vector
* e_in is the in-sample (cross-entropy) error 

In [59]:
def logistic_reg(X, y, w_init, max_its, eta):
    """Fit a logistic regression model with batch gradient descent.

    Relies on three helpers defined elsewhere in the notebook:
    ``weight_break(w_grad)`` (stopping test on the current gradient),
    ``calc_grad_cross(y_n, x_n, w)`` (per-sample denominator of the gradient
    term) and ``calc_cross(y_n, x_n, w)`` (per-sample quantity whose log is
    averaged into ``e_in``).
    NOTE(review): presumably these are 1 + exp(y_n * w.x_n) and
    1 + exp(-y_n * w.x_n) respectively -- confirm against their definitions.

    Parameters
    ----------
    X : sequence of rows
        Data matrix without the leading column of 1's.
    y : sequence of numbers
        Labels (plus or minus 1), one per row of ``X``.
    w_init : sequence of numbers
        Initial weight vector, d + 1 entries (bias first).
    max_its : number
        Upper bound on the number of gradient steps.
    eta : number
        Learning rate.

    Returns
    -------
    t : int
        Number of gradient steps taken.
    w : list
        Learned weight vector.
    e_in : number
        Mean over the samples of ``log(calc_cross(y_n, x_n, w))``.
    """
    # Prepend the constant bias feature. list(x_n) keeps this a concatenation
    # for any row type ("[1] + ndarray" would add 1 to every feature instead).
    X = [[1] + list(x_n) for x_n in X]
    y = np.array(y)
    w = np.array(w_init)
    n_samples = len(y)

    # y_n * x_n does not depend on w, so it is computed once, not every step.
    yn_xns = [y_n * np.array(x_n) for y_n, x_n in zip(y, X)]

    # Placeholder gradient handed to the first weight_break() check, before
    # any real gradient has been computed.
    w_grad = [1 for _ in range(len(w))]
    t = 0
    while not weight_break(w_grad) and t < max_its:
        t += 1

        denominators = [calc_grad_cross(y_n, x_n, w) for y_n, x_n in zip(y, X)]
        summation_terms = [
            yn_xn / denominator for yn_xn, denominator in zip(yn_xns, denominators)
        ]

        # Sum the per-sample terms coordinate by coordinate, then scale by -1/N.
        w_grad = (-1 / n_samples) * np.array(
            [sum(position) for position in zip(*summation_terms)]
        )
        w = w - (eta * w_grad)

    e_in = sum(np.log(calc_cross(y_n, x_n, w)) for y_n, x_n in zip(y, X)) / n_samples
    return t, list(w), e_in

## Run and Plot

Run your code and plot figures below

In [87]:
# Sweep the learning rate and report in-sample / out-of-sample errors.
# The CSV files do not change between learning rates, so they are read and
# converted once here instead of on every loop iteration.
train_mat = pd.read_csv("CSV_files/cleveland_train.csv").to_numpy()
test_mat = pd.read_csv("CSV_files/cleveland_test.csv").to_numpy()

# The last column is the label (0 -> -1, anything else -> +1); the remaining
# columns are the features.
X_train = [list(row[:-1]) for row in train_mat]
Y_train = [-1 if row[-1] == 0 else 1 for row in train_mat]
X_test = [list(row[:-1]) for row in test_mat]
Y_test = [-1 if row[-1] == 0 else 1 for row in test_mat]

for eta in [0.01, 0.1, 1, 4, 5, 6, 7, 7.5, 7.6, 7.65]:
    print("ETA:", eta)

    # X / Y are rebound to the split being scored, so the notebook-level names
    # always refer to the active data set.
    X, Y = X_train, Y_train
    # sys.maxsize effectively removes the iteration cap, so termination relies
    # on logistic_reg's own stopping test.
    trials, weights, e_in = logistic_reg(
        X, Y, w_init=np.zeros(len(X[0]) + 1), max_its=sys.maxsize, eta=eta
    )
    print("in sample CE error", e_in, "trials", trials)
    print("in sample binary error", find_test_error(weights, X, Y))

    X, Y = X_test, Y_test
    print("out of sample error", find_test_error(weights, X, Y))

ETA: 0.01
Datetime start 2020-11-07 10:56:33.444911
in sample CE error 0.40738144657177583 trials 17603
Datetime end 2020-11-07 10:57:27.480690
in sample binary error 0.5657894736842105
Datetime end 2020-11-07 10:57:27.488761
out of sample error 0.5103448275862069
ETA: 0.1
Datetime start 2020-11-07 10:57:27.495494
in sample CE error 0.4073814465182788 trials 1758
Datetime end 2020-11-07 10:57:33.079354
in sample binary error 0.5657894736842105
Datetime end 2020-11-07 10:57:33.086359
out of sample error 0.5103448275862069
ETA: 1
Datetime start 2020-11-07 10:57:33.091831
in sample CE error 0.4073814457865927 trials 174
Datetime end 2020-11-07 10:57:33.569050
in sample binary error 0.5657894736842105
Datetime end 2020-11-07 10:57:33.576409
out of sample error 0.5103448275862069
ETA: 4
Datetime start 2020-11-07 10:57:33.582131
in sample CE error 0.40738144394994685 trials 41
Datetime end 2020-11-07 10:57:33.700714
in sample binary error 0.5657894736842105
Datetime end 2020-11-07 10:57:33.7

In [40]:
# Same experiment as the sweep, for one small learning rate and at most 1e4
# gradient steps.
for eta in [1e-5]:
    print("ETA:", eta)

    # Training split: every column but the last is a feature; the last column
    # is the label, mapped 0 -> -1 and anything else -> +1.
    data = pd.read_csv("CSV_files/cleveland_train.csv")
    mat = data.to_numpy()
    X = [list(sample[:-1]) for sample in mat]
    Y = [-1 if sample[-1] == 0 else 1 for sample in mat]

    initial_weights = np.zeros(len(X[0]) + 1)
    trials, weights, e_in = logistic_reg(
        X, Y, w_init=initial_weights, max_its=1e4, eta=eta
    )
    print("in sample CE error", e_in, "trials", trials)
    print("in sample binary error", find_test_error(weights, X, Y))

    # Test split, converted the same way.
    test_data = pd.read_csv("CSV_files/cleveland_test.csv")
    mat = test_data.to_numpy()
    X = [list(sample[:-1]) for sample in mat]
    Y = [-1 if sample[-1] == 0 else 1 for sample in mat]

    print("out of sample error", find_test_error(weights, X, Y))

ETA: 1e-05
Datetime start 2020-11-11 17:24:22.965955
eta check 1e-05
ce shape (1, 152)
shape y (152,)


TypeError: list indices must be integers or slices, not tuple

In [50]:
# Train with eta = 1e-5 for at most 1e5 gradient steps, then score both splits.
data = pd.read_csv("CSV_files/cleveland_train.csv")
mat = data.to_numpy()
# Feature columns are everything but the last; the last column is the label
# (0 becomes -1, any other value becomes +1).
feature_rows, label_column = mat[:, :-1], mat[:, -1]
X = [list(features) for features in feature_rows]
Y = [1 if label != 0 else -1 for label in label_column]

start_weights = np.zeros(len(X[0]) + 1)
trials, weights, e_in = logistic_reg(X, Y, w_init=start_weights, max_its=1e5, eta=1e-5)

in_sample_error = find_test_error(weights, X, Y)
print("in sample error", in_sample_error)

# Held-out split, converted the same way.
test_data = pd.read_csv("CSV_files/cleveland_test.csv")
mat = test_data.to_numpy()
feature_rows, label_column = mat[:, :-1], mat[:, -1]
X = [list(features) for features in feature_rows]
Y = [1 if label != 0 else -1 for label in label_column]

out_of_sample_error = find_test_error(weights, X, Y)
print("out of sample error", out_of_sample_error)

iteration 1000 error 0.6168397851087525 max grad 0.8728881375876335 2020-11-05 19:27:51.112496
iteration 2000 error 0.6052190196821822 max grad 0.41599235891723774 2020-11-05 19:27:54.141373
iteration 3000 error 0.6008909148162558 max grad 0.4000556186401782 2020-11-05 19:27:57.078986
iteration 4000 error 0.5980104119461269 max grad 0.3905300853136982 2020-11-05 19:28:00.059563
iteration 5000 error 0.5955244232279215 max grad 0.3837877284228102 2020-11-05 19:28:03.379042
iteration 6000 error 0.5932012875049291 max grad 0.37836684864615305 2020-11-05 19:28:06.873210
iteration 7000 error 0.5909777939010693 max grad 0.373623043367613 2020-11-05 19:28:10.034599
iteration 8000 error 0.588830188450616 max grad 0.36925609802623904 2020-11-05 19:28:14.662561
iteration 9000 error 0.5867458785448857 max grad 0.3651203865082963 2020-11-05 19:28:19.171370
iteration 10000 error 0.5847165561996042 max grad 0.36114228687022576 2020-11-05 19:28:22.293884
iteration 11000 error 0.5827361933526978 max gr

iteration 87000 error 0.5002989921386515 max grad 0.16963628458117166 2020-11-05 19:32:26.435480
iteration 88000 error 0.4997464542054716 max grad 0.16802590901951503 2020-11-05 19:32:29.267282
iteration 89000 error 0.49920193507954397 max grad 0.1664318731356505 2020-11-05 19:32:32.261224
iteration 90000 error 0.4986652907443623 max grad 0.1648539976361181 2020-11-05 19:32:35.531772
iteration 91000 error 0.49813638000120947 max grad 0.16329210490942636 2020-11-05 19:32:38.648838
iteration 92000 error 0.4976150644128563 max grad 0.1617460190324594 2020-11-05 19:32:41.708193
iteration 93000 error 0.49710120824827975 max grad 0.16021556577564075 2020-11-05 19:32:44.633190
iteration 94000 error 0.4965946784283976 max grad 0.1587005726068928 2020-11-05 19:32:47.580428
iteration 95000 error 0.49609534447279807 max grad 0.15720086869444302 2020-11-05 19:32:50.522514
iteration 96000 error 0.495603078447468 max grad 0.15571628490853082 2020-11-05 19:32:53.466010
iteration 97000 error 0.4951177

In [51]:
# Train with eta = 1e-5 for at most 1e6 gradient steps, then score both splits.
data = pd.read_csv("CSV_files/cleveland_train.csv")
mat = data.to_numpy()
# Feature columns are everything but the last; the last column is the label
# (0 becomes -1, any other value becomes +1).
feature_rows, label_column = mat[:, :-1], mat[:, -1]
X = [list(features) for features in feature_rows]
Y = [1 if label != 0 else -1 for label in label_column]

start_weights = np.zeros(len(X[0]) + 1)
trials, weights, e_in = logistic_reg(X, Y, w_init=start_weights, max_its=1e6, eta=1e-5)

in_sample_error = find_test_error(weights, X, Y)
print("in sample error", in_sample_error)

# Held-out split, converted the same way.
test_data = pd.read_csv("CSV_files/cleveland_test.csv")
mat = test_data.to_numpy()
feature_rows, label_column = mat[:, :-1], mat[:, -1]
X = [list(features) for features in feature_rows]
Y = [1 if label != 0 else -1 for label in label_column]

out_of_sample_error = find_test_error(weights, X, Y)
print("out of sample error", out_of_sample_error)

iteration 1000 error 0.6168397851087525 max grad 0.8728881375876335 2020-11-05 19:33:42.471124
iteration 2000 error 0.6052190196821822 max grad 0.41599235891723774 2020-11-05 19:33:45.478345
iteration 3000 error 0.6008909148162558 max grad 0.4000556186401782 2020-11-05 19:33:49.734109
iteration 4000 error 0.5980104119461269 max grad 0.3905300853136982 2020-11-05 19:33:54.384690
iteration 5000 error 0.5955244232279215 max grad 0.3837877284228102 2020-11-05 19:33:58.301217
iteration 6000 error 0.5932012875049291 max grad 0.37836684864615305 2020-11-05 19:34:02.559114
iteration 7000 error 0.5909777939010693 max grad 0.373623043367613 2020-11-05 19:34:05.865368
iteration 8000 error 0.588830188450616 max grad 0.36925609802623904 2020-11-05 19:34:09.041041
iteration 9000 error 0.5867458785448857 max grad 0.3651203865082963 2020-11-05 19:34:12.080166
iteration 10000 error 0.5847165561996042 max grad 0.36114228687022576 2020-11-05 19:34:15.398133
iteration 11000 error 0.5827361933526978 max gr

iteration 87000 error 0.5002989921386515 max grad 0.16963628458117166 2020-11-05 19:38:35.055216
iteration 88000 error 0.4997464542054716 max grad 0.16802590901951503 2020-11-05 19:38:38.456432
iteration 89000 error 0.49920193507954397 max grad 0.1664318731356505 2020-11-05 19:38:41.379310
iteration 90000 error 0.4986652907443623 max grad 0.1648539976361181 2020-11-05 19:38:44.333601
iteration 91000 error 0.49813638000120947 max grad 0.16329210490942636 2020-11-05 19:38:47.157739
iteration 92000 error 0.4976150644128563 max grad 0.1617460190324594 2020-11-05 19:38:50.155844
iteration 93000 error 0.49710120824827975 max grad 0.16021556577564075 2020-11-05 19:38:53.399159
iteration 94000 error 0.4965946784283976 max grad 0.1587005726068928 2020-11-05 19:38:56.567292
iteration 95000 error 0.49609534447279807 max grad 0.15720086869444302 2020-11-05 19:38:59.486128
iteration 96000 error 0.495603078447468 max grad 0.15571628490853082 2020-11-05 19:39:02.453199
iteration 97000 error 0.4951177

iteration 171000 error 0.4721763144993628 max grad 0.07837032695558104 2020-11-05 19:43:04.339340
iteration 172000 error 0.47198177955163795 max grad 0.07818706435521583 2020-11-05 19:43:07.336153
iteration 173000 error 0.47178922000106455 max grad 0.07800496181585695 2020-11-05 19:43:10.331541
iteration 174000 error 0.47159860623461086 max grad 0.077824007640993 2020-11-05 19:43:13.255459
iteration 175000 error 0.47140990915562636 max grad 0.07764419027525805 2020-11-05 19:43:16.095907
iteration 176000 error 0.47122310017394203 max grad 0.07746549830265284 2020-11-05 19:43:18.919666
iteration 177000 error 0.4710381511961776 max grad 0.07728792044478307 2020-11-05 19:43:21.786967
iteration 178000 error 0.47085503461624434 max grad 0.07711144555911721 2020-11-05 19:43:24.656690
iteration 179000 error 0.47067372330604895 max grad 0.07693606263726234 2020-11-05 19:43:27.514410
iteration 180000 error 0.47049419060638337 max grad 0.07676176080325911 2020-11-05 19:43:30.490087
iteration 1810

iteration 255000 error 0.4606572702154806 max grad 0.06613048364102916 2020-11-05 19:47:38.107026
iteration 256000 error 0.46056040969992074 max grad 0.06601445785994947 2020-11-05 19:47:41.395622
iteration 257000 error 0.4604641830839992 max grad 0.06589896590053872 2020-11-05 19:47:44.605315
iteration 258000 error 0.46036858285976356 max grad 0.0657840033386971 2020-11-05 19:47:48.381858
iteration 259000 error 0.4602736016317996 max grad 0.06566956579798902 2020-11-05 19:47:51.796511
iteration 260000 error 0.4601792321153378 max grad 0.06555564894904765 2020-11-05 19:47:54.783927
iteration 261000 error 0.4600854671344029 max grad 0.06544224850898665 2020-11-05 19:47:58.103619
iteration 262000 error 0.4599922996199871 max grad 0.06532936024082009 2020-11-05 19:48:01.364815
iteration 263000 error 0.4598997226082653 max grad 0.06521697995289057 2020-11-05 19:48:04.556493
iteration 264000 error 0.45980772923883756 max grad 0.0651051034983056 2020-11-05 19:48:07.778886
iteration 265000 er

iteration 339000 error 0.45418565652478876 max grad 0.05790009962374857 2020-11-05 19:52:15.872132
iteration 340000 error 0.45412379067752306 max grad 0.05781709552472472 2020-11-05 19:52:19.050944
iteration 341000 error 0.454062190594683 max grad 0.05773437573783036 2020-11-05 19:52:22.374795
iteration 342000 error 0.45400085395201056 max grad 0.057651938361742205 2020-11-05 19:52:25.810078
iteration 343000 error 0.4539397784547797 max grad 0.0575697815127078 2020-11-05 19:52:30.038375
iteration 344000 error 0.45387896183735554 max grad 0.057487903324351684 2020-11-05 19:52:33.630744
iteration 345000 error 0.4538184018627647 max grad 0.057406301947482787 2020-11-05 19:52:36.739217
iteration 346000 error 0.45375809632227065 max grad 0.05732497554990544 2020-11-05 19:52:39.863650
iteration 347000 error 0.4536980430349536 max grad 0.057243922316231356 2020-11-05 19:52:42.996541
iteration 348000 error 0.45363823984730417 max grad 0.057163140447695146 2020-11-05 19:52:46.322420
iteration 3

iteration 422000 error 0.44978541437341935 max grad 0.05183102142482721 2020-11-05 19:56:59.790685
iteration 423000 error 0.44973977276861465 max grad 0.05176646220927924 2020-11-05 19:57:03.435978
iteration 424000 error 0.4496942744002633 max grad 0.05170207509818224 2020-11-05 19:57:06.436517
iteration 425000 error 0.4496489183816121 max grad 0.051637859161556776 2020-11-05 19:57:09.438925
iteration 426000 error 0.4496037038350577 max grad 0.05157381347680575 2020-11-05 19:57:13.300380
iteration 427000 error 0.44955862989202844 max grad 0.051509937128642824 2020-11-05 19:57:16.589595
iteration 428000 error 0.44951369569286376 max grad 0.05144622920902062 2020-11-05 19:57:19.999668
iteration 429000 error 0.4494689003866974 max grad 0.051382688817060984 2020-11-05 19:57:23.115749
iteration 430000 error 0.44942424313134255 max grad 0.051319315058985016 2020-11-05 19:57:26.359966
iteration 431000 error 0.4493797230931793 max grad 0.05125610704804533 2020-11-05 19:57:29.842130
iteration 4

iteration 506000 error 0.44638089595370856 max grad 0.04693170393718296 2020-11-05 20:01:38.230637
iteration 507000 error 0.4463448795353743 max grad 0.04687894565857553 2020-11-05 20:01:41.103872
iteration 508000 error 0.4463089549971864 max grad 0.04682630154461572 2020-11-05 20:01:43.976484
iteration 509000 error 0.44627312192749674 max grad 0.04677377109578392 2020-11-05 20:01:46.997606
iteration 510000 error 0.4462373799178952 max grad 0.0467213538159685 2020-11-05 20:01:50.241574
iteration 511000 error 0.4462017285631741 max grad 0.04666904921243554 2020-11-05 20:01:53.499189
iteration 512000 error 0.4461661674612913 max grad 0.04661685679580023 2020-11-05 20:01:56.751254
iteration 513000 error 0.4461306962133347 max grad 0.04656477607999855 2020-11-05 20:01:59.911738
iteration 514000 error 0.44609531442348765 max grad 0.04651280658225807 2020-11-05 20:02:03.602163
iteration 515000 error 0.4460600216989928 max grad 0.046460947823069774 2020-11-05 20:02:06.655581
iteration 516000 

iteration 590000 error 0.4436418742264174 max grad 0.04285616539285737 2020-11-05 20:06:21.374558
iteration 591000 error 0.4436123911552544 max grad 0.042811531582351056 2020-11-05 20:06:25.280832
iteration 592000 error 0.4435829739414567 max grad 0.04276697953914493 2020-11-05 20:06:29.142796
iteration 593000 error 0.4435536223532173 max grad 0.04272250897035543 2020-11-05 20:06:32.922480
iteration 594000 error 0.44352433616008197 max grad 0.04267811958481588 2020-11-05 20:06:36.917746
iteration 595000 error 0.4434951151329384 max grad 0.04263381109306256 2020-11-05 20:06:40.549417
iteration 596000 error 0.44346595904400204 max grad 0.04258958320732185 2020-11-05 20:06:44.329922
iteration 597000 error 0.44343686766680557 max grad 0.04254543564149821 2020-11-05 20:06:47.949624
iteration 598000 error 0.44340784077618656 max grad 0.042501368111160384 2020-11-05 20:06:51.513587
iteration 599000 error 0.4433788781482737 max grad 0.04245738033352909 2020-11-05 20:06:55.188608
iteration 6000

iteration 674000 error 0.4413750654683714 max grad 0.039367008160004324 2020-11-05 20:11:07.902065
iteration 675000 error 0.44135041506545436 max grad 0.03932836320787418 2020-11-05 20:11:10.967046
iteration 676000 error 0.44132581481031663 max grad 0.03928978034736863 2020-11-05 20:11:13.898800
iteration 677000 error 0.44130126455233115 max grad 0.03925125939325481 2020-11-05 20:11:16.762476
iteration 678000 error 0.4412767641415463 max grad 0.03921280016123033 2020-11-05 20:11:19.802938
iteration 679000 error 0.44125231342868376 max grad 0.0391744024679186 2020-11-05 20:11:22.850868
iteration 680000 error 0.4412279122651333 max grad 0.039136066130861465 2020-11-05 20:11:25.773563
iteration 681000 error 0.44120356050294535 max grad 0.039097790968513145 2020-11-05 20:11:28.639559
iteration 682000 error 0.4411792579948297 max grad 0.03905957680023392 2020-11-05 20:11:31.737439
iteration 683000 error 0.4411550045941483 max grad 0.039021423446284495 2020-11-05 20:11:34.996890
iteration 68

iteration 758000 error 0.4394660683835988 max grad 0.03632125323691112 2020-11-05 20:16:11.976769
iteration 759000 error 0.43944516175815657 max grad 0.03628725613186483 2020-11-05 20:16:16.445642
iteration 760000 error 0.4394242946049394 max grad 0.03625330828543667 2020-11-05 20:16:20.690940
iteration 761000 error 0.4394034668170347 max grad 0.03621940957256614 2020-11-05 20:16:24.808647
iteration 762000 error 0.4393826782879262 max grad 0.03618555986873174 2020-11-05 20:16:29.222781
iteration 763000 error 0.4393619289114932 max grad 0.036151759049948204 2020-11-05 20:16:32.903986
iteration 764000 error 0.4393412185820084 max grad 0.03611800699276327 2020-11-05 20:16:36.343087
iteration 765000 error 0.43932054719413405 max grad 0.036084303574254156 2020-11-05 20:16:39.609166
iteration 766000 error 0.43929991464292273 max grad 0.03605064867202483 2020-11-05 20:16:42.857275
iteration 767000 error 0.43927932082381266 max grad 0.03601704216420224 2020-11-05 20:16:46.132193
iteration 7680

iteration 842000 error 0.43783803037649444 max grad 0.033626243848198616 2020-11-05 20:20:43.436962
iteration 843000 error 0.4378201008334238 max grad 0.03359599422499147 2020-11-05 20:20:46.313904
iteration 844000 error 0.4378022029877121 max grad 0.03356578496328343 2020-11-05 20:20:49.210805
iteration 845000 error 0.43778433675959016 max grad 0.0335356159738235 2020-11-05 20:20:52.082221
iteration 846000 error 0.4377665020695507 max grad 0.03350548716769384 2020-11-05 20:20:54.969972
iteration 847000 error 0.4377486988383468 max grad 0.03347539845630403 2020-11-05 20:20:57.839235
iteration 848000 error 0.4377309269869902 max grad 0.033445349751392586 2020-11-05 20:21:00.741983
iteration 849000 error 0.4377131864367519 max grad 0.03341534096502484 2020-11-05 20:21:03.600101
iteration 850000 error 0.437695477109159 max grad 0.03338537200958968 2020-11-05 20:21:06.435983
iteration 851000 error 0.43767779892599495 max grad 0.033355442797798535 2020-11-05 20:21:09.341337
iteration 852000

iteration 926000 error 0.4364353166206112 max grad 0.031218060034341646 2020-11-05 20:24:52.630415
iteration 927000 error 0.43641979446772944 max grad 0.031190918097539575 2020-11-05 20:24:56.180437
iteration 928000 error 0.436404298125283 max grad 0.0311638100298897 2020-11-05 20:24:59.978592
iteration 929000 error 0.43638882753202546 max grad 0.03113673576470607 2020-11-05 20:25:04.109094
iteration 930000 error 0.4363733826268952 max grad 0.03110969523551684 2020-11-05 20:25:07.036129
iteration 931000 error 0.43635796334901383 max grad 0.03108268837606465 2020-11-05 20:25:10.106446
iteration 932000 error 0.4363425696376872 max grad 0.031055715120305097 2020-11-05 20:25:13.453623
iteration 933000 error 0.4363272014324037 max grad 0.031028775402405326 2020-11-05 20:25:16.447853
iteration 934000 error 0.43631185867283395 max grad 0.031001869156743894 2020-11-05 20:25:19.363040
iteration 935000 error 0.4362965412988293 max grad 0.0309749963179091 2020-11-05 20:25:22.308674
iteration 9360