In [1]:
import numpy as np

from sklearn import linear_model
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [627]:
trainfile = open('train.csv')

In [628]:
print trainfile

<open file 'train.csv', mode 'r' at 0x10caa3ed0>


In [629]:
for line in trainfile:
    head = line.rstrip().split(',')
    break

In [630]:
print head

print len(head)

['Choice', 'A_follower_count', 'A_following_count', 'A_listed_count', 'A_mentions_received', 'A_retweets_received', 'A_mentions_sent', 'A_retweets_sent', 'A_posts', 'A_network_feature_1', 'A_network_feature_2', 'A_network_feature_3', 'B_follower_count', 'B_following_count', 'B_listed_count', 'B_mentions_received', 'B_retweets_received', 'B_mentions_sent', 'B_retweets_sent', 'B_posts', 'B_network_feature_1', 'B_network_feature_2', 'B_network_feature_3']
23


In [631]:
X_train_A = []
X_train_B = []
y_train = []

In [632]:
for line in trainfile:
    splitted = line.rstrip().split(',')
    label = int(splitted[0])
    A_features = [float(item) for item in splitted[1:12]]
    B_features = [float(item) for item in splitted[12:]]
    y_train.append(label)
    X_train_A.append(A_features)
    X_train_B.append(B_features)
#     print A_features
trainfile.close()

In [633]:
y_train = np.array(y_train)
X_train_A = np.array(X_train_A)
X_train_B = np.array(X_train_B)

In [2]:
def transform_features(x):
    return np.log(1+x)

In [635]:
for i in range(len(y_train)):
    if y_train[i] == 0:
        y_train[i] = -1

In [636]:
X_train = transform_features(X_train_A) - transform_features(X_train_B)

In [3]:
def normalize(X):
    X_norm = X
    cols = X.shape[1]
    for i in range(cols):
        m = np.mean(X[:, i])
        std = np.std(X[:, i])
        X_norm[:, i] = (X[:, i] - m)/std
    return X_norm

In [438]:
np.random.seed(1)      #Seed the random number generator to preserve the dev/test split
permutation = np.random.permutation(X_train.shape[0])
X_train = X_train[permutation,]
y_train = y_train[permutation,]
X_train = normalize(X_train)

In [378]:
indices = np.random.permutation(X_train.shape[0])
training_idx, test_idx = indices[:5000], indices[5000:]
X_t = X_train[training_idx,:]
X_test = X_train[test_idx, :]
y_t = y_train[training_idx,]
y_test = y_train[test_idx,]

In [379]:
print len(X_t)
print len(X_test)
print len(y_t)

5000
500
5000


In [313]:
X_train = X_t
y_train = y_t

In [439]:
def ExpLoss(X, Y, W, lmda):
    loss = lmda * (W.dot(W))
    yHat = X.dot(W)
    activation = -Y * yHat
    activationExp = np.exp(activation)
    loss += np.sum(activationExp)
    return loss

In [440]:
def ExpLossGradient(x, y, W, lmda):
    grad = (x.dot(W))
    grad = -y * grad
    grad = np.exp(grad)
    grad = -y*x*grad
    Wgrad = 2 * lmda * W
    Wgrad = Wgrad + grad
    return Wgrad

In [441]:
from scipy.special import expit

def prediction(x, W):
    return expit(x.dot(W))

In [442]:
def updateWeights(gradient, learningRate, W):
    W = W - (learningRate * gradient)
    return W

In [443]:
def logisticRegression(X, Y, maxIter, learningRate, lmda):
    W = np.zeros(X.shape[1])
    newLoss = ExpLoss(X, Y, W, lmda)
    prevLoss = 0.0
    count = 0
    
    while (True):
        count += 1
        for i in range(len(Y)):
            #Not converged... so continue
            d = X[i,]
            gradient = ExpLossGradient(d, Y[i], W, lmda)
            W = updateWeights(gradient, learningRate, W)
        prevLoss = newLoss
#         print W
        newLoss = ExpLoss(X, Y, W, lmda)
#         print "Iteration # : ", count, " Loss Value : ", newLoss
        
#         if abs(newLoss - prevLoss) < 0.0001 :
# #             print "Difference in old and new loss less than ", 0.0001
# #             print "Total Iterations till now : ", count
# #             print "prevLoss", prevLoss
# #             print "newLoss", newLoss
#             break
        
        if count == maxIter :
#             print "MaxIterations reached!"
            break
    return W

In [656]:
def runExperiments(X, Y, X_test, Y_test, lmda, learningRate, maxIter = 10):
    W = logisticRegression(X, Y, maxIter, learningRate, lmda)
    nCorrect = 0
    nIncorrect = 0
    for i in range(len(Y_test)):
        y_hat = np.sign(X_test[i,].dot(W))
        if y_hat == Y_test[i]:
            nCorrect += 1
        else:
            nIncorrect += 1
    return (nCorrect * 1.0/(nIncorrect + nCorrect))

In [452]:
W = runExperiments(X_train, y_train, lmda=.0001, learningRate=.001, maxIter=100)

In [453]:
W

array([ 0.26679316,  0.01504388,  0.41644681, -0.4947088 ,  0.19144936,
        0.06897368,  0.09333077, -0.07127917,  0.6296666 ,  0.20083261,
       -0.04606182])

In [380]:
model = linear_model.LogisticRegression(fit_intercept=False)

In [381]:
model.fit(X_t,y_t)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=False,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [195]:
p_train = model.predict_proba(X_train)

In [196]:
y_pred1 = []
y_pred2 = []
for y in p_train:
    if y[0] >= y[1]:
        y_pred1.append(1)
        y_pred2.append(0)
    else:
        y_pred1.append(0)
        y_pred2.append(1)
# p_train = p_train[:,1:2]

# y_train.shape

In [100]:
print len(y_train)

5500


In [197]:
roc_curve(y_train, y_pred1)

(array([ 0.        ,  0.74045235,  1.        ]),
 array([ 0.        ,  0.25981442,  1.        ]),
 array([2, 1, 0]))

In [445]:
testfile = open('test.csv')
#ignore the test header
testfile.next()

X_test_A = []
X_test_B = []
Y_test = []
for line in testfile:
    splitted = line.rstrip().split(',')
    label = int(splitted[0])
    A_features = [float(item) for item in splitted[0:11]]
    B_features = [float(item) for item in splitted[11:]]
    Y_test.append(label)
    X_test_A.append(A_features)
    X_test_B.append(B_features)
testfile.close()

X_test_A = np.array(X_test_A)
X_test_B = np.array(X_test_B)

# transform features in the same way as for training to ensure consistency
X_test = transform_features(X_test_A) - transform_features(X_test_B)
X_test = normalize(X_test)
# compute probabilistic predictions
# p_test = model.predict_proba(X_test)

In [199]:
y_pred1 = []
y_pred2 = []
for y in p_test:
    if y[0] >= y[1]:
        y_pred1.append(1)
        y_pred2.append(0)
    else:
        y_pred1.append(0)
        y_pred2.append(1)
# p_train = p_train[:,1:2]

# y_train.shape

In [119]:
print p_test

[[ 0.6849748   0.3150252 ]
 [ 0.52567042  0.47432958]
 [ 0.90119837  0.09880163]
 ..., 
 [ 0.52983902  0.47016098]
 [ 0.56836426  0.43163574]
 [ 0.58830821  0.41169179]]


In [115]:
print Y_test

[614689, 739581, 3638, 613, 1352348, 1010, 5922518, 580544, 364, 541611, 657021, 541611, 756715, 501043, 48711, 242529, 7360339, 71597, 298668, 185616, 16637, 2696, 146298, 54453, 3106, 627, 627, 96531, 44, 1682535, 676, 508213, 245591, 3867, 143101, 154830, 489, 2774790, 9450, 794, 1611042, 725, 9450, 186557, 444, 703, 366460, 4189, 628, 163145, 3914, 1367, 9679, 9450, 5751, 16978, 103607, 221001, 5501, 78083, 6119, 2470, 4557, 794, 1205166, 592, 638283, 20, 1611042, 119001, 102116, 697, 462071, 254, 245591, 739581, 6375552, 44034, 3892, 38, 377, 364, 119001, 7147, 3742349, 756715, 504004, 4760, 103607, 513424, 378, 1294, 1352348, 2256591, 3832, 3166413, 3302480, 2745543, 823481, 1097135, 85138, 266407, 5996, 598, 119001, 5685, 565, 322682, 3893, 103607, 11147, 108768, 570772, 3832, 242529, 206818, 2260661, 1682535, 1641075, 11018, 501043, 1607541, 209, 1720, 480412, 66689, 130850, 44, 1043, 100836, 245591, 78731, 2759731, 221001, 1043, 3081505, 1404323, 3914, 13373, 3166413, 138235, 

In [124]:
print p_test[:,0]

[ 0.6849748   0.52567042  0.90119837 ...,  0.52983902  0.56836426
  0.58830821]


In [138]:
np.savetxt("foo.csv", p_test[:,0], delimiter=",")

In [139]:
arr = np.arange(1, len(p_test[:,0]))

In [140]:
print arr

[   1    2    3 ..., 5949 5950 5951]


In [142]:
predfile = open('predictions.csv','w+')
header = ['Id', 'Choice']
print >>predfile,','.join(header)
for line in np.concatenate((arr, p_test[:,0]),axis=0):
    print >>predfile, ','.join([str(item) for item in line])

predfile.close()

TypeError: 'numpy.float64' object is not iterable

In [143]:
arr2 = np.concatenate(arr, p_test[:,0])

TypeError: only integer scalar arrays can be converted to a scalar index

In [144]:
print p_test[:,0]

[ 0.6849748   0.52567042  0.90119837 ...,  0.52983902  0.56836426
  0.58830821]


In [145]:
pt = p_test[:,0]

In [150]:
arr2 = np.concatenate((arr, pt), axis = 1)

IndexError: axis 1 out of bounds [0, 1)

In [149]:
print arr2

[ 1.          2.          3.         ...,  0.52983902  0.56836426
  0.58830821]


In [152]:
np.vstack((arr,pt))

ValueError: all the input array dimensions except for the concatenation axis must match exactly

In [153]:
arr.shape

(5951,)

In [154]:
pt.shape

(5952,)

In [200]:
arr = np.arange(1, len(p_test[:,0]) + 1)

In [201]:
arr.shape


(5952,)

In [160]:
np.vstack((arr,pt))

array([[  1.00000000e+00,   2.00000000e+00,   3.00000000e+00, ...,
          5.95000000e+03,   5.95100000e+03,   5.95200000e+03],
       [  6.84974797e-01,   5.25670423e-01,   9.01198365e-01, ...,
          5.29839020e-01,   5.68364264e-01,   5.88308212e-01]])

In [392]:
line = 'Id,Choice'

In [393]:
f1 = open('foo2.csv', 'w')

In [394]:
f1.write(line)

In [395]:
import os

In [396]:
f1.write(os.linesep)

In [207]:
i = 1
for p in pt:
    f1.write(str(i) + ',' + str(p) + os.linesep)
    i+=1
f1.close()

In [355]:
print len(X_test)

5952


In [397]:
y_pred = np.zeros(len(X_test))
for i in range(len(X_test)):
    y_pred[i] = expit(X_test[i,].dot(W))


In [398]:
i = 1
for p in y_pred:
    f1.write(str(i) + ',' + str(p) + os.linesep)
    i+=1
f1.close()

In [382]:
p_test = model.predict_proba(X_test)

In [383]:
p_test

array([[ 0.60056816,  0.39943184],
       [ 0.98891693,  0.01108307],
       [ 0.46441639,  0.53558361],
       [ 0.18954553,  0.81045447],
       [ 0.27975079,  0.72024921],
       [ 0.19727837,  0.80272163],
       [ 0.53333387,  0.46666613],
       [ 0.91673623,  0.08326377],
       [ 0.06295771,  0.93704229],
       [ 0.55996917,  0.44003083],
       [ 0.96378455,  0.03621545],
       [ 0.91186234,  0.08813766],
       [ 0.1232466 ,  0.8767534 ],
       [ 0.97472168,  0.02527832],
       [ 0.98679017,  0.01320983],
       [ 0.36887684,  0.63112316],
       [ 0.03754036,  0.96245964],
       [ 0.89724061,  0.10275939],
       [ 0.39987082,  0.60012918],
       [ 0.6840366 ,  0.3159634 ],
       [ 0.04537507,  0.95462493],
       [ 0.26755523,  0.73244477],
       [ 0.16954777,  0.83045223],
       [ 0.97825785,  0.02174215],
       [ 0.23296044,  0.76703956],
       [ 0.24148151,  0.75851849],
       [ 0.22460834,  0.77539166],
       [ 0.13910695,  0.86089305],
       [ 0.61349244,

In [400]:
y_pred1 = []
y_pred2 = []
for y in y_pred:
    if y >= 0.5:
        y_pred1.append(1)
        y_pred2.append(-1)
    else:
        y_pred1.append(-1)
        y_pred2.append(1)
# p_train = p_train[:,1:2]

# y_train.shape

In [404]:
nCount = 0
for i in range(len(y_test)):
    if (y_pred1[i] == y_test[i]):
        nCount += 1

In [405]:
print nCount

403


In [406]:
print len(y_test)

500


In [588]:
testRec = open('sample_predictions.csv')

In [589]:
for line in testRec:
    head = line.rstrip().split(',')
    break

In [590]:
print head

['Id', 'Choice']


In [591]:
yPred = []
for line in testRec:
    splitted = line.rstrip().split(',')
    val = float(splitted[1])
    print val
    if val >= 0.5:
        yPred.append(1)
    else:
        yPred.append(-1)

0.315025203
0.474329577
0.098801635
0.137932293
0.523677468
0.255949057
0.659494016
0.97994645
0.021005756
0.676974978
0.318313857
0.844445805
0.647216887
0.41657478
0.92952245
0.777816672
0.990191596
0.244764782
0.881047422
0.325483587
0.704887318
0.088215603
0.901005068
0.824401205
0.098874503
0.060453542
0.106568189
0.25996975
0.161303584
0.782041577
0.04153939
0.948324279
0.580100646
0.914081388
0.570560458
0.484581635
0.355313057
0.312683816
0.770956859
0.383298599
0.782155187
0.092388482
0.107835269
0.974358246
0.079098252
0.76619514
0.697668183
0.077460307
0.072112535
0.499693218
0.213130776
0.066122191
0.102236462
0.525900079
0.967639121
0.286637773
0.29582443
0.500683236
0.126242498
0.97896395
0.245721742
0.757027469
0.123598879
0.347595611
0.358299118
0.045798573
0.302041871
0.17765318
0.667914156
0.546186034
0.868511188
0.371125015
0.278314812
0.383495816
0.487140712
0.81468858
0.624178188
0.629058322
0.054003438
0.00884603
0.40032856
0.027221636
0.136743724
0.617692813
0.36

In [478]:
print expit(X_test[1, ].dot(W))
print len(yPred)
print len(X_test)

0.465920689029
5952
5952


In [479]:
print yPred

[-1, -1, -1, -1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, 1, -1, 1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, -1, -1, -1, 1, -1, 1, -1, -1, 1, -1, 1, 1, -1, -1, -1, -1, -1, -1, 1, 1, -1, -1, 1, -1, 1, -1, 1, -1, -1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, -1, 1, 1, -1, -1, 1, 1, -1, -1, 1, 1, 1, -1, 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, 1, -1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, -1, 1, 1, 1, 1, -1, -1, 1, -1, -1, -1, 1, -1, 1, 1, -1, 1, -1, 1, 1, -1, 1, -1, -1, -1, -1, -1, 1, -1, -1, 1, 1, -1, -1, 1, 1, 1, -1, -1, -1, 1, 1, -1, 1, 1, 1, -1, -1, -1, -1, -1, 1, -1, -1, -1, 1, 1, 1, -1, 1, 1, 1, -1, 1, -1, -1, 1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, 1, -1, -1, 1, -1, -1, 1, 1, 1, 1, 1, -1, 1, 1, 1, -1, 1, 1, -1, -1, -1, -1, 1, -1, 1, -1, -1, -1, -1, 1, 1, 1, -1, 1, 1, 1, 1, 1, 1, 1, -1, 1, -1, 1, 1, -1, 1, 1, -1, -1, -1, -1, -1, -1, -1, 1, -1, 1, 1, 1, 1, 1, 1, -1, 1, 1, -1, 1, -1, -1, 1, -1, 1, 1, 1, -1, -1, 1, 1, 1, 1, 1, -1, 

In [486]:
nCorrect = 0
nIncorrect = 0
# X_test
for i in range(len(yPred)):
    y_hat = expit(X_test[i,].dot(W))
    
    if y_hat >= 0.5:
        y_hat = 1
    else :
        y_hat = -1
            
    if y_hat == yPred[i]:
        nCorrect += 1
    else:
        nIncorrect += 1
print (nCorrect * 1.0 /(nCorrect + nIncorrect))

0.986055107527


In [734]:
def kFoldCrossValidation(k, lmda, learningRate, maxIter = 100):
    trainfile = open('train.csv')
    for line in trainfile:
        head = line.rstrip().split(',')
        break
    X_train_A = []
    X_train_B = []
    y_train = []
    for line in trainfile:
        splitted = line.rstrip().split(',')
        label = int(splitted[0])
        A_features = [float(item) for item in splitted[1:12]]
        B_features = [float(item) for item in splitted[12:]]
        y_train.append(label)
        X_train_A.append(A_features)
        X_train_B.append(B_features)
    trainfile.close()
    Y_train = np.array(y_train)
    X_train_A = np.array(X_train_A)
    X_train_B = np.array(X_train_B)
    
    for i in range(len(y_train)):
        if y_train[i] == 0:
            y_train[i] = -1
            
    X_train = transform_features(X_train_A) - transform_features(X_train_B)
    np.random.seed(1)      #Seed the random number generator to preserve the dev/test split
    permutation = np.random.permutation(X_train.shape[0])
    X_train = X_train[permutation,]
    Y_train = Y_train[permutation,]
    X_train = normalize(X_train)
    
#     dt = DecisionTreeClassifier(splitter='best', criterion='gini', max_depth=4)
#     dt.fit(X_train, y_train)
    
    
    testfile = open('test.csv')
    #ignore the test header
    testfile.next()

    X_test_A = []
    X_test_B = []
    Y_test = []
    for line in testfile:
        splitted = line.rstrip().split(',')
        label = int(splitted[0])
        A_features = [float(item) for item in splitted[0:11]]
        B_features = [float(item) for item in splitted[11:]]
        Y_test.append(label)
        X_test_A.append(A_features)
        X_test_B.append(B_features)
    testfile.close()

    X_test_A = np.array(X_test_A)
    X_test_B = np.array(X_test_B)

# # transform features in the same way as for training to ensure consistency
    X_test = transform_features(X_test_A) - transform_features(X_test_B)
    X_test = normalize(X_test)
    
    testRec = open('sample_predictions.csv')
    Y_test = []
    for line in testRec:
        head = line.rstrip().split(',')
        break
        
    for line in testRec:
        splitted = line.rstrip().split(',')
        val = float(splitted[1])
        if val >= 0.5:
            Y_test.append(1)
        else:
            Y_test.append(-1)
    
#     yProb = dt.predict_proba(X_test)
    
#     print dt.max_depth
    
#     f1 = open('foo3.csv', 'w')
#     line = 'Id,Choice'
#     f1.write(line + os.linesep)
#     i = 1
#     for p in yProb:
#         f1.write(str(i) + ',' + str(p[0]) + os.linesep)
#         i+=1
#     f1.close()
    
#     yPredictions = dt.predict(X_test)
# # compute probabilistic predictions
#     nCorrect = 0
#     nIncorrect = 0
#     for i in range(len(yPredictions)):
#         if yPredictions[i] == yPred[i]:
#             nCorrect += 1
#         else:
#             nIncorrect += 1
#     print nCorrect*1.0/(nCorrect + nIncorrect)
            
    splitPoint = (X_train.shape[0] - int(X_train.shape[0]/k))
    acc = np.zeros(k)
    for i in range(k):
        indices = np.random.permutation(X_train.shape[0])
#         training_idx, test_idx = indices[:splitPoint], indices[splitPoint:]
#         X_t = X_train[training_idx,:]
#         X_test = X_train[test_idx, :]
#         Y_t = y_train[training_idx,]
#         Y_test = y_train[test_idx,]
        acc[i] = runExperiments(X_train, Y_train, X_test, Y_test, lmda, learningRate, maxIter)
    
    print "Average Accuracy : ", np.mean(acc)

In [733]:
kFoldCrossValidation(k = 10, lmda=.001, learningRate=0.01, maxIter=10)

Average Accuracy :  0.925739247312


In [763]:
def FeatureSelection():
    trainfile = open('train.csv')
    for line in trainfile:
        head = line.rstrip().split(',')
        break
    X_train_A = []
    X_train_B = []
    y_train = []
    for line in trainfile:
        splitted = line.rstrip().split(',')
        label = int(splitted[0])
        A_features = [float(item) for item in splitted[1:12]]
        B_features = [float(item) for item in splitted[12:]]
        y_train.append(label)
        X_train_A.append(A_features)
        X_train_B.append(B_features)
    trainfile.close()
    y_train = np.array(y_train)
    X_train_A = np.array(X_train_A)
    X_train_B = np.array(X_train_B)
    
    for i in range(len(y_train)):
        if y_train[i] == 0:
            y_train[i] = -1
            
    X_train = transform_features(X_train_A) - transform_features(X_train_B)
    np.random.seed(1)      #Seed the random number generator to preserve the dev/test split
    permutation = np.random.permutation(X_train.shape[0])
    X_train = X_train[permutation,]
    Y_train = y_train[permutation,]
    X_train = normalize(X_train)
    
    
    testfile = open('test.csv')
    #ignore the test header
    testfile.next()

    X_test_A = []
    X_test_B = []
    Y_test = []
    for line in testfile:
        splitted = line.rstrip().split(',')
        label = int(splitted[0])
        A_features = [float(item) for item in splitted[0:11]]
        B_features = [float(item) for item in splitted[11:]]
        X_test_A.append(A_features)
        X_test_B.append(B_features)
    testfile.close()
    
    testRec = open('sample_predictions.csv')
    for line in testRec:
        head = line.rstrip().split(',')
        break
        
    for line in testRec:
        splitted = line.rstrip().split(',')
        val = float(splitted[1])
        if val >= 0.5:
            Y_test.append(1)
        else:
            Y_test.append(-1)
    
    X_test_A = np.array(X_test_A)
    X_test_B = np.array(X_test_B)
# transform features in the same way as for training to ensure consistency
    X_test = transform_features(X_test_A) - transform_features(X_test_B)
    X_test = normalize(X_test)
    
    
    ss = set()
    fs = [x for x in range(11)]
    fs = set(fs)
    bestAccuracy = 0.0
    while (True):
        bestFeature = None
        for feature in fs:
            if feature not in ss:
                ssPrime = ss.copy()
                ssPrime.add(feature)
                print ssPrime
#             accu = runExperiments(X_train,Y_train, X_test, Y_test, lmda = .001, learningRate = .01, maxIter = 10)
                accu = getAccuracy(X_train, Y_train, ssPrime, X_train, Y_train, .001, 0.01, 10)
                print bestAccuracy, accu
                if accu > bestAccuracy:
                    bestFeature = feature
                    bestAccuracy = accu
        if bestFeature is not None:
            bestSet = set()
            bestSet.add(bestFeature)
            ss |= bestSet
        if bestFeature == None or len(ss) == 11:
            break
    print ss

In [764]:
def getAccuracy(X_train, y_train, featureSet, X_test, Y_test, lmda, learningRate, maxIter):
    featIdx = list(featureSet)
    X_t = X_train[:,featIdx]
    X_test = X_test[:,featIdx]
    return runExperiments(X_t, y_train, X_test, Y_test, lmda, learningRate, maxIter)

In [765]:
FeatureSelection()

set([0])
0.0 0.753090909091
set([1])
0.753090909091 0.573272727273
set([2])
0.753090909091 0.76
set([3])
0.76 0.749636363636
set([4])
0.76 0.742363636364
set([5])
0.76 0.642363636364
set([6])
0.76 0.606727272727
set([7])
0.76 0.643636363636
set([8])
0.76 0.753272727273
set([9])
0.76 0.568
set([10])
0.76 0.598181818182
set([0, 2])
0.76 0.760363636364
set([1, 2])
0.760363636364 0.759818181818
set([2, 3])
0.760363636364 0.761454545455
set([2, 4])
0.761454545455 0.761090909091
set([2, 5])
0.761454545455 0.764727272727
set([2, 6])
0.764727272727 0.762545454545
set([2, 7])
0.764727272727 0.762545454545
set([8, 2])
0.764727272727 0.764727272727
set([9, 2])
0.764727272727 0.760909090909
set([2, 10])
0.764727272727 0.762909090909
set([0, 2, 5])
0.764727272727 0.762363636364
set([1, 2, 5])
0.764727272727 0.764727272727
set([2, 3, 5])
0.764727272727 0.761454545455
set([2, 4, 5])
0.764727272727 0.762
set([2, 5, 6])
0.764727272727 0.761818181818
set([2, 5, 7])
0.764727272727 0.761090909091
set([8, 

In [4]:
from sklearn.cluster import KMeans

In [24]:
trainfile = open('train.csv')
for line in trainfile:
    head = line.rstrip().split(',')
    break
X_train_A = []
X_train_B = []
y_train = []
for line in trainfile:
    splitted = line.rstrip().split(',')
    label = int(splitted[0])
    A_features = [float(item) for item in splitted[1:12]]
    B_features = [float(item) for item in splitted[12:]]
    y_train.append(label)
    X_train_A.append(A_features)
    X_train_B.append(B_features)
trainfile.close()
y_train = np.array(y_train)
X_train_A = np.array(X_train_A)
X_train_B = np.array(X_train_B)
    
for i in range(len(y_train)):
    if y_train[i] == 0:
        y_train[i] = -1
            
X_train = transform_features(X_train_A) - transform_features(X_train_B)
np.random.seed(1)      #Seed the random number generator to preserve the dev/test split
permutation = np.random.permutation(X_train.shape[0])
X_train = X_train[permutation,]
Y_train = y_train[permutation,]
X_train = normalize(X_train)
X_train_com1 = X_train[:,[0]]

In [12]:
kmeans = KMeans(n_clusters=8, init = 'k-means++', n_init=15, max_iter=1000, random_state=0).fit(X_train_com1)

In [13]:
kmeans.labels_

array([6, 0, 3, ..., 5, 6, 1], dtype=int32)

In [14]:
print len(kmeans.labels_)

5500


In [5]:
def transformIntoClusters(X_train, clusters = 8):
    for i in range(X_train.shape[1]):
        colToTransform = X_train[:,[i]]
        kmeans = KMeans(n_clusters=clusters, init = 'k-means++', n_init=15, max_iter=5000, random_state=0)
        model = kmeans.fit(colToTransform)
        X_train[:,i] = model.labels_
    return X_train

In [44]:
def TrainNaiveBayes():
    trainfile = open('train.csv')
    for line in trainfile:
        head = line.rstrip().split(',')
        break
    X_train_A = []
    X_train_B = []
    y_train = []
    X_train = []
    for line in trainfile:
        splitted = line.rstrip().split(',')
        label = int(splitted[0])
        A_features = [float(item) for item in splitted[1:]]
        B_features = [float(item) for item in splitted[12:]]
        y_train.append(label)
        X_train_A.append(A_features)
        X_train_B.append(B_features)
        X_train.append(A_features)
    trainfile.close()
    Y_train = np.array(y_train)
    X_train_A = np.array(X_train_A)
    X_train_B = np.array(X_train_B)
    X_train = np.array(X_train)
    
    
#     print X_train.shape[1]
    
#     prob0 = 0.0
#     zeroCount = 0
#     for i in range(len(Y_train)):
#         if y_train[i] == 0:
#             y_train[i] = -1
# #             zeroCount += 1
    
#     prob0 = zeroCount * 1.0 / (len(y_train))
#     prob1 = 1 - prob0
    
#     Prior = [prob0, prob1]
    
#     X_train = transform_features(X_train)
    np.random.seed(1)      #Seed the random number generator to preserve the dev/test split
    permutation = np.random.permutation(X_train.shape[0])
    X_train = X_train[permutation,]
    Y_train = Y_train[permutation,]
    X_train = normalize(X_train)
    X_train = transformIntoClusters(X_train, clusters=7)
    X_train = normalize(X_train)
#     Probab = {'1' : {}, '-1' : {}}
    
#     for i in range(X_train.shape[0]):
#         for j in range(X_train.shape[1]):
#             cluster = X_train[i,j]
#             if y_train[i] == 1:
#                 if j not in Probab['1']:
#                     Probab['1'][j] = {}
#                     if cluster not in Probab['1'][j]:
#                         Probab['1'][j][cluster] = 1.0
#                 else:
#                     if cluster not in Probab['1'][j]:
#                         Probab['1'][j][cluster] = 1.0
#                     else:
#                         Probab['1'][j][cluster] += 1.0
#             else :
#                 if j not in Probab['-1']:
#                     Probab['-1'][j] = {}
#                     if cluster not in Probab['-1'][j]:
#                         Probab['-1'][j][cluster] = 1.0
#                 else:
#                     if cluster not in Probab['-1'][j]:
#                         Probab['-1'][j][cluster] = 1.0
#                     else:
#                         Probab['-1'][j][cluster] += 1.0
#     for j in range(X_train.shape[1]):
#         sum1 = sum(Probab['1'][j].values())
#         sum0 = sum(Probab['-1'][j].values())
#         for key in Probab['1'][j]:
#             Probab['1'][j][key] /= sum1
#         for key in Probab['-1'][j]:
#             Probab['-1'][j][key] /= sum0
          
    return Y_train, X_train

In [45]:
Y_train, X_train = TrainNaiveBayes()

In [46]:
def PredictBayes(x, Probab, Prior):
    Prob0 = Prior[0]
    Prob1 = Prior[1]
    for i in range(len(x)):
        cluster = x[i]
        Prob0 *= Probab['-1'][i][cluster]
        Prob1 *= Probab['1'][i][cluster]
    
    if Prob0 >= Prob1:
        return -1
    return 1

In [158]:
YHat = []
for i in range(len(X_train)):
    YHat.append(PredictBayes(X_train[i,], Probab, Prior))

In [159]:
nCorrect = 0
nIncorrect = 0
for i in range(len(Y_train)):
    if YHat[i] == Y_train[i]:
        nCorrect += 1
    else:
        nIncorrect += 1

print nCorrect * 1.0 / (nCorrect + nIncorrect)

0.475272727273


In [47]:
from sklearn.naive_bayes import GaussianNB

In [48]:
clf = GaussianNB()

In [49]:
clf.fit(X_train, Y_train)

GaussianNB()

In [50]:
yHat = clf.predict(X_train)

print yHat

[1 0 1 ..., 1 0 1]


In [51]:
nCorrect = 0
nIncorrect = 0
for i in range(len(Y_train)):
    if yHat[i] == Y_train[i]:
        nCorrect += 1
    else:
        nIncorrect += 1

print nCorrect * 1.0 / (nCorrect + nIncorrect)

0.718
