In [417]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [418]:
df = pd.read_csv('winequalityN.csv')
df = df.dropna()

In [419]:
df.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [420]:
trainRed,testRed = train_test_split(df[df['type'] == 'red'],shuffle = True,test_size=.2)
trainWhite,testWhite = train_test_split(df[df['type'] == 'white'],shuffle = True,test_size=.2)

In [421]:
len(trainRed)
len(trainWhite)

train = pd.concat([trainWhite,trainRed])
test = pd.concat([testWhite,testRed])


#Map conversion where white is 1 and red is 0 
mapping = {'white' : 1, 'red' : 0}
train = train.replace({'type' : mapping})
test = test.replace({'type': mapping})

#Get True Prediction data
ytrain = train['type']
ytest = test['type']

#Drop subjective features
train = train.drop(['type','quality'],axis = 1)
test = test.drop(['type','quality'],axis = 1)

#Normalize the data
train=(train-train.mean())/train.std()
test = (test - test.mean())/test.std()

#Add ones column
train = np.append(np.ones(len(train)).reshape((-1,1)),train,axis=1)
test = np.append(np.ones(len(test)).reshape((-1,1)),test,axis = 1)

#Convert to numpy matrix
ytrain = ytrain.values
ytest = ytest.values

display(test)


array([[ 1.        , -0.49344855, -0.78321125, ...,  0.42129853,
        -0.94046945, -1.5052766 ],
       [ 1.        , -1.028804  , -0.36052655, ...,  0.61776608,
        -0.07387856,  0.91191151],
       [ 1.        ,  0.42430365,  0.30369225, ..., -1.1504418 ,
        -0.74048694,  0.32845231],
       ...,
       [ 1.        ,  0.0419069 ,  0.66599342, ...,  0.74874444,
         1.99260741, -0.00495295],
       [ 1.        ,  0.57726234,  1.57174633, ...,  0.5522769 ,
        -0.27386107, -0.50506083],
       [ 1.        ,  0.57726234,  0.96791105, ...,  1.40363625,
         0.59272981,  0.16174968]])

In [422]:
def Sigmoid(z):
    return 1/(1+np.exp(-z))

def cost(X,y,theta):
    m = len(y)
    return 1/m * (-y.dot(np.log(Sigmoid(X.dot(theta)))) - (1-y).dot(np.log(1-Sigmoid(X.dot(theta)))))

def gradient_descent(X,y,theta):

    m = len(y)
    num_iters = 4000
    alpha = .03

    for i in range(num_iters):
        print('Cost Function at Iteration %s : %s'%(str(i),str( cost(X,y,theta))))
        theta = theta - alpha/m * (Sigmoid(X.dot(theta)) - y).dot(X)
    return theta


In [423]:
theta = [0,0,0,0,0,0,0,0,0,0,0,0]

In [424]:
new_thetas = gradient_descent(train,ytrain,theta)

Cost Function at Iteration 0 : 0.6931471805599464
Cost Function at Iteration 1 : 0.6787629743294333
Cost Function at Iteration 2 : 0.6649465592355702
Cost Function at Iteration 3 : 0.6516729900608289
Cost Function at Iteration 4 : 0.6389178433512975
Cost Function at Iteration 5 : 0.6266573348219804
Cost Function at Iteration 6 : 0.614868413494699
Cost Function at Iteration 7 : 0.6035288334536549
Cost Function at Iteration 8 : 0.5926172049160083
Cost Function at Iteration 9 : 0.5821130268495237
Cost Function at Iteration 10 : 0.5719967036436643
Cost Function at Iteration 11 : 0.5622495483997862
Cost Function at Iteration 12 : 0.5528537753067604
Cost Function at Iteration 13 : 0.5437924833657558
Cost Function at Iteration 14 : 0.5350496334688719
Cost Function at Iteration 15 : 0.5266100205565221
Cost Function at Iteration 16 : 0.5184592423025257
Cost Function at Iteration 17 : 0.5105836655190817
Cost Function at Iteration 18 : 0.5029703912442627
Cost Function at Iteration 19 : 0.49560721

Cost Function at Iteration 276 : 0.13973480962123672
Cost Function at Iteration 277 : 0.1394566000220598
Cost Function at Iteration 278 : 0.13918003786241928
Cost Function at Iteration 279 : 0.13890510808823603
Cost Function at Iteration 280 : 0.13863179583080676
Cost Function at Iteration 281 : 0.1383600864039386
Cost Function at Iteration 282 : 0.13808996530113668
Cost Function at Iteration 283 : 0.1378214181928439
Cost Function at Iteration 284 : 0.13755443092373165
Cost Function at Iteration 285 : 0.13728898951004037
Cost Function at Iteration 286 : 0.13702508013696874
Cost Function at Iteration 287 : 0.136762689156111
Cost Function at Iteration 288 : 0.13650180308294055
Cost Function at Iteration 289 : 0.13624240859433936
Cost Function at Iteration 290 : 0.13598449252617267
Cost Function at Iteration 291 : 0.13572804187090662
Cost Function at Iteration 292 : 0.13547304377526945
Cost Function at Iteration 293 : 0.1352194855379544
Cost Function at Iteration 294 : 0.13496735460736395

Cost Function at Iteration 499 : 0.10210377607487248
Cost Function at Iteration 500 : 0.1020010475224837
Cost Function at Iteration 501 : 0.10189867118637376
Cost Function at Iteration 502 : 0.10179664519543838
Cost Function at Iteration 503 : 0.10169496769201285
Cost Function at Iteration 504 : 0.10159363683175048
Cost Function at Iteration 505 : 0.10149265078350238
Cost Function at Iteration 506 : 0.10139200772919908
Cost Function at Iteration 507 : 0.10129170586373223
Cost Function at Iteration 508 : 0.10119174339483895
Cost Function at Iteration 509 : 0.10109211854298636
Cost Function at Iteration 510 : 0.10099282954125799
Cost Function at Iteration 511 : 0.1008938746352412
Cost Function at Iteration 512 : 0.10079525208291576
Cost Function at Iteration 513 : 0.1006969601545437
Cost Function at Iteration 514 : 0.10059899713256036
Cost Function at Iteration 515 : 0.10050136131146643
Cost Function at Iteration 516 : 0.1004040509977215
Cost Function at Iteration 517 : 0.100307064509638

Cost Function at Iteration 766 : 0.08319934203101279
Cost Function at Iteration 767 : 0.08315044436226948
Cost Function at Iteration 768 : 0.08310165799261042
Cost Function at Iteration 769 : 0.08305298252902714
Cost Function at Iteration 770 : 0.0830044175803914
Cost Function at Iteration 771 : 0.08295596275744346
Cost Function at Iteration 772 : 0.08290761767278144
Cost Function at Iteration 773 : 0.08285938194084963
Cost Function at Iteration 774 : 0.08281125517792769
Cost Function at Iteration 775 : 0.08276323700211953
Cost Function at Iteration 776 : 0.08271532703334226
Cost Function at Iteration 777 : 0.08266752489331561
Cost Function at Iteration 778 : 0.08261983020555097
Cost Function at Iteration 779 : 0.0825722425953408
Cost Function at Iteration 780 : 0.08252476168974779
Cost Function at Iteration 781 : 0.08247738711759467
Cost Function at Iteration 782 : 0.08243011850945324
Cost Function at Iteration 783 : 0.08238295549763462
Cost Function at Iteration 784 : 0.0823358977161

Cost Function at Iteration 1021 : 0.0735098423153329
Cost Function at Iteration 1022 : 0.07348030730338011
Cost Function at Iteration 1023 : 0.07345082318717888
Cost Function at Iteration 1024 : 0.07342138983073003
Cost Function at Iteration 1025 : 0.07339200709852745
Cost Function at Iteration 1026 : 0.07336267485555523
Cost Function at Iteration 1027 : 0.07333339296728578
Cost Function at Iteration 1028 : 0.07330416129967789
Cost Function at Iteration 1029 : 0.07327497971917385
Cost Function at Iteration 1030 : 0.07324584809269805
Cost Function at Iteration 1031 : 0.07321676628765382
Cost Function at Iteration 1032 : 0.07318773417192223
Cost Function at Iteration 1033 : 0.0731587516138594
Cost Function at Iteration 1034 : 0.0731298184822942
Cost Function at Iteration 1035 : 0.07310093464652644
Cost Function at Iteration 1036 : 0.07307209997632483
Cost Function at Iteration 1037 : 0.07304331434192432
Cost Function at Iteration 1038 : 0.0730145776140246
Cost Function at Iteration 1039 

Cost Function at Iteration 1289 : 0.06706741274146605
Cost Function at Iteration 1290 : 0.06704784920116848
Cost Function at Iteration 1291 : 0.06702831253414711
Cost Function at Iteration 1292 : 0.06700880268323024
Cost Function at Iteration 1293 : 0.0669893195914106
Cost Function at Iteration 1294 : 0.06696986320184595
Cost Function at Iteration 1295 : 0.06695043345785681
Cost Function at Iteration 1296 : 0.06693103030292806
Cost Function at Iteration 1297 : 0.06691165368070652
Cost Function at Iteration 1298 : 0.06689230353500099
Cost Function at Iteration 1299 : 0.06687297980978184
Cost Function at Iteration 1300 : 0.0668536824491805
Cost Function at Iteration 1301 : 0.06683441139748819
Cost Function at Iteration 1302 : 0.06681516659915614
Cost Function at Iteration 1303 : 0.06679594799879449
Cost Function at Iteration 1304 : 0.06677675554117181
Cost Function at Iteration 1305 : 0.06675758917121524
Cost Function at Iteration 1306 : 0.06673844883400855
Cost Function at Iteration 130

Cost Function at Iteration 1557 : 0.06263357010107784
Cost Function at Iteration 1558 : 0.0626195840626566
Cost Function at Iteration 1559 : 0.0626056140070777
Cost Function at Iteration 1560 : 0.0625916599060914
Cost Function at Iteration 1561 : 0.0625777217315168
Cost Function at Iteration 1562 : 0.06256379945523924
Cost Function at Iteration 1563 : 0.06254989304921216
Cost Function at Iteration 1564 : 0.06253600248545618
Cost Function at Iteration 1565 : 0.06252212773605804
Cost Function at Iteration 1566 : 0.06250826877317184
Cost Function at Iteration 1567 : 0.062494425569018124
Cost Function at Iteration 1568 : 0.06248059809588372
Cost Function at Iteration 1569 : 0.06246678632612098
Cost Function at Iteration 1570 : 0.06245299023214904
Cost Function at Iteration 1571 : 0.06243920978645166
Cost Function at Iteration 1572 : 0.06242544496157937
Cost Function at Iteration 1573 : 0.06241169573014713
Cost Function at Iteration 1574 : 0.06239796206483469
Cost Function at Iteration 1575

Cost Function at Iteration 1840 : 0.05922285578044388
Cost Function at Iteration 1841 : 0.05921247421027675
Cost Function at Iteration 1842 : 0.059202102721938064
Cost Function at Iteration 1843 : 0.0591917413003047
Cost Function at Iteration 1844 : 0.059181389930283435
Cost Function at Iteration 1845 : 0.05917104859681319
Cost Function at Iteration 1846 : 0.05916071728486189
Cost Function at Iteration 1847 : 0.0591503959794284
Cost Function at Iteration 1848 : 0.05914008466554176
Cost Function at Iteration 1849 : 0.05912978332826311
Cost Function at Iteration 1850 : 0.059119491952680826
Cost Function at Iteration 1851 : 0.05910921052391699
Cost Function at Iteration 1852 : 0.05909893902712041
Cost Function at Iteration 1853 : 0.05908867744747258
Cost Function at Iteration 1854 : 0.05907842577018313
Cost Function at Iteration 1855 : 0.0590681839804922
Cost Function at Iteration 1856 : 0.05905795206367088
Cost Function at Iteration 1857 : 0.05904773000501846
Cost Function at Iteration 1

Cost Function at Iteration 2112 : 0.056726507564580635
Cost Function at Iteration 2113 : 0.056718397741862714
Cost Function at Iteration 2114 : 0.056710294804208
Cost Function at Iteration 2115 : 0.05670219874259491
Cost Function at Iteration 2116 : 0.056694109548019025
Cost Function at Iteration 2117 : 0.056686027211491456
Cost Function at Iteration 2118 : 0.05667795172403926
Cost Function at Iteration 2119 : 0.056669883076705105
Cost Function at Iteration 2120 : 0.0566618212605481
Cost Function at Iteration 2121 : 0.05665376626664304
Cost Function at Iteration 2122 : 0.05664571808608044
Cost Function at Iteration 2123 : 0.05663767670996561
Cost Function at Iteration 2124 : 0.05662964212942125
Cost Function at Iteration 2125 : 0.05662161433558454
Cost Function at Iteration 2126 : 0.05661359331960947
Cost Function at Iteration 2127 : 0.05660557907266445
Cost Function at Iteration 2128 : 0.05659757158593254
Cost Function at Iteration 2129 : 0.056589570850616396
Cost Function at Iteratio

Cost Function at Iteration 2365 : 0.05487254447098481
Cost Function at Iteration 2366 : 0.05486592641703551
Cost Function at Iteration 2367 : 0.0548593133943143
Cost Function at Iteration 2368 : 0.05485270539691947
Cost Function at Iteration 2369 : 0.05484610241896112
Cost Function at Iteration 2370 : 0.05483950445455862
Cost Function at Iteration 2371 : 0.05483291149783947
Cost Function at Iteration 2372 : 0.054826323542940855
Cost Function at Iteration 2373 : 0.05481974058401083
Cost Function at Iteration 2374 : 0.054813162615204455
Cost Function at Iteration 2375 : 0.05480658963068738
Cost Function at Iteration 2376 : 0.05480002162463326
Cost Function at Iteration 2377 : 0.05479345859122777
Cost Function at Iteration 2378 : 0.054786900524661335
Cost Function at Iteration 2379 : 0.05478034741913714
Cost Function at Iteration 2380 : 0.054773799268867586
Cost Function at Iteration 2381 : 0.05476725606807124
Cost Function at Iteration 2382 : 0.054760717810978035
Cost Function at Iterati

Cost Function at Iteration 2642 : 0.05321301875397444
Cost Function at Iteration 2643 : 0.053207597642147136
Cost Function at Iteration 2644 : 0.053202180228763045
Cost Function at Iteration 2645 : 0.05319676650992961
Cost Function at Iteration 2646 : 0.05319135648176348
Cost Function at Iteration 2647 : 0.05318595014038083
Cost Function at Iteration 2648 : 0.05318054748191022
Cost Function at Iteration 2649 : 0.05317514850248104
Cost Function at Iteration 2650 : 0.05316975319823013
Cost Function at Iteration 2651 : 0.053164361565298024
Cost Function at Iteration 2652 : 0.053158973599831724
Cost Function at Iteration 2653 : 0.05315358929798691
Cost Function at Iteration 2654 : 0.05314820865591792
Cost Function at Iteration 2655 : 0.0531428316697902
Cost Function at Iteration 2656 : 0.05313745833577294
Cost Function at Iteration 2657 : 0.05313208865003994
Cost Function at Iteration 2658 : 0.05312672260877071
Cost Function at Iteration 2659 : 0.053121360208153307
Cost Function at Iterati

Cost Function at Iteration 2929 : 0.051795166092723006
Cost Function at Iteration 2930 : 0.05179066585572666
Cost Function at Iteration 2931 : 0.05178616839393539
Cost Function at Iteration 2932 : 0.05178167370471353
Cost Function at Iteration 2933 : 0.05177718178542251
Cost Function at Iteration 2934 : 0.05177269263342809
Cost Function at Iteration 2935 : 0.051768206246102844
Cost Function at Iteration 2936 : 0.051763722620819914
Cost Function at Iteration 2937 : 0.05175924175495686
Cost Function at Iteration 2938 : 0.05175476364589117
Cost Function at Iteration 2939 : 0.05175028829101085
Cost Function at Iteration 2940 : 0.05174581568769987
Cost Function at Iteration 2941 : 0.05174134583334831
Cost Function at Iteration 2942 : 0.051736878725351064
Cost Function at Iteration 2943 : 0.051732414361106774
Cost Function at Iteration 2944 : 0.05172795273801374
Cost Function at Iteration 2945 : 0.051723493853475966
Cost Function at Iteration 2946 : 0.05171903770490145
Cost Function at Itera

Cost Function at Iteration 3200 : 0.050669158665170834
Cost Function at Iteration 3201 : 0.05066532395864901
Cost Function at Iteration 3202 : 0.05066149142005566
Cost Function at Iteration 3203 : 0.05065766104749099
Cost Function at Iteration 3204 : 0.05065383283907239
Cost Function at Iteration 3205 : 0.05065000679290939
Cost Function at Iteration 3206 : 0.050646182907118406
Cost Function at Iteration 3207 : 0.05064236117981747
Cost Function at Iteration 3208 : 0.05063854160912476
Cost Function at Iteration 3209 : 0.05063472419316334
Cost Function at Iteration 3210 : 0.05063090893005596
Cost Function at Iteration 3211 : 0.05062709581793033
Cost Function at Iteration 3212 : 0.05062328485491302
Cost Function at Iteration 3213 : 0.05061947603913823
Cost Function at Iteration 3214 : 0.05061566936873369
Cost Function at Iteration 3215 : 0.05061186484183713
Cost Function at Iteration 3216 : 0.050608062456585064
Cost Function at Iteration 3217 : 0.05060426221112008
Cost Function at Iteratio

Cost Function at Iteration 3472 : 0.04970020548604789
Cost Function at Iteration 3473 : 0.0496968974558424
Cost Function at Iteration 3474 : 0.0496935911513702
Cost Function at Iteration 3475 : 0.0496902865712367
Cost Function at Iteration 3476 : 0.04968698371405542
Cost Function at Iteration 3477 : 0.049683682578439824
Cost Function at Iteration 3478 : 0.04968038316300089
Cost Function at Iteration 3479 : 0.04967708546635632
Cost Function at Iteration 3480 : 0.049673789487125086
Cost Function at Iteration 3481 : 0.04967049522392178
Cost Function at Iteration 3482 : 0.04966720267536813
Cost Function at Iteration 3483 : 0.04966391184008596
Cost Function at Iteration 3484 : 0.049660622716703164
Cost Function at Iteration 3485 : 0.049657335303833
Cost Function at Iteration 3486 : 0.0496540496001118
Cost Function at Iteration 3487 : 0.04965076560416141
Cost Function at Iteration 3488 : 0.04964748331461322
Cost Function at Iteration 3489 : 0.04964420273009871
Cost Function at Iteration 3490

Cost Function at Iteration 3671 : 0.04907419820787215
Cost Function at Iteration 3672 : 0.04907120803636746
Cost Function at Iteration 3673 : 0.049068219341184355
Cost Function at Iteration 3674 : 0.04906523212119738
Cost Function at Iteration 3675 : 0.049062246375278805
Cost Function at Iteration 3676 : 0.049059262102304386
Cost Function at Iteration 3677 : 0.049056279301154
Cost Function at Iteration 3678 : 0.04905329797070539
Cost Function at Iteration 3679 : 0.049050318109836205
Cost Function at Iteration 3680 : 0.049047339717425587
Cost Function at Iteration 3681 : 0.049044362792357674
Cost Function at Iteration 3682 : 0.049041387333515915
Cost Function at Iteration 3683 : 0.049038413339781865
Cost Function at Iteration 3684 : 0.04903544081004101
Cost Function at Iteration 3685 : 0.049032469743180124
Cost Function at Iteration 3686 : 0.04902950013808319
Cost Function at Iteration 3687 : 0.049026531993644644
Cost Function at Iteration 3688 : 0.049023565308749135
Cost Function at It

Cost Function at Iteration 3937 : 0.048327572845415306
Cost Function at Iteration 3938 : 0.04832493898216716
Cost Function at Iteration 3939 : 0.04832230633220285
Cost Function at Iteration 3940 : 0.04831967489466532
Cost Function at Iteration 3941 : 0.04831704466868614
Cost Function at Iteration 3942 : 0.04831441565339822
Cost Function at Iteration 3943 : 0.04831178784794691
Cost Function at Iteration 3944 : 0.048309161251463
Cost Function at Iteration 3945 : 0.04830653586309177
Cost Function at Iteration 3946 : 0.04830391168197511
Cost Function at Iteration 3947 : 0.048301288707247314
Cost Function at Iteration 3948 : 0.04829866693805285
Cost Function at Iteration 3949 : 0.04829604637353837
Cost Function at Iteration 3950 : 0.04829342701284541
Cost Function at Iteration 3951 : 0.048290808855113485
Cost Function at Iteration 3952 : 0.04828819189949259
Cost Function at Iteration 3953 : 0.04828557614512631
Cost Function at Iteration 3954 : 0.04828296159116733
Cost Function at Iteration 

In [425]:
new_thetas

array([ 2.81134478e+00, -9.51192275e-01, -1.10777953e+00,  2.99401004e-01,
        9.22958115e-01, -7.80098980e-01, -1.24700950e-03,  1.93420492e+00,
       -1.25468681e+00, -7.11269717e-01, -7.67530886e-01, -2.02009590e-01])

In [426]:
prediction = Sigmoid(test.dot(new_thetas))
prediction = np.round(prediction)

print('Percent Accuracy : %s'%(str(np.sum(prediction == ytest)/len(ytest))))


Percent Accuracy : 0.9938128383604021


In [427]:
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
logreg.fit(train,ytrain)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [431]:
skpred = logreg.predict(test)
print('Accuracy of Sklearn s logistic regression classifier on test set: %s'% (logreg.score(test, ytest)))

Accuracy of Sklearn s logistic regression classifier on test set: 0.9953596287703016


In [434]:
#Turns out the SKLearn classifier was slightly better than my classifier (Probably because I could have 
#Increased the learning rate, or let the classifier run for more iterations.

0.9953596287703016