In [1]:
import pandas as pd
import preprocess
import regression
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer

In [2]:
# Read both CSV files from ./data; the training frame's last column is later
# split off as the target, the test frame is used as features only.
train_data = pd.read_csv('./data/trainSet.csv')
test_data = pd.read_csv('./data/test set.csv')

In [3]:
# list(test_data), len(test_data)

In [4]:
# list(train_data), len(train_data)

In [5]:
# Split the training matrix: every column but the last is a feature,
# the last column is the target (kept 2-D, one column).
train_values = train_data.values
train_x = train_values[:, :-1]
train_target = train_values[:, -1:]

In [6]:
# Scale the training features two ways with the project-local `preprocess` module.
# z_score_scaling returns three values, unpacked here as the scaled array plus
# `mean` and `std`.
z_scale_processed_x, mean, std = preprocess.z_score_scaling(train_x)
min_max_scaled_x = preprocess.min_max_scaling(train_x)
# The training `mean` / `std` are passed back in when scaling the test set
# (presumably so both sets share the same statistics -- confirm in preprocess.py).
processed_test, _, _ = preprocess.z_score_scaling(test_data.values, mean, std)


In [7]:
# Feature Discretization with kmeans

In [8]:
def k_fold(k, trainer, tester, x, target, *args, **kwargs):
    """Estimate binary-classification accuracy with k-fold cross-validation.

    The rows of ``x`` and ``target`` are shuffled together (using NumPy's global
    random state, so call ``np.random.seed`` first for reproducible folds), cut
    into ``k`` folds of ``samples // k`` rows each, and every fold is held out
    once while the model is fit on the remaining folds.  When ``samples`` is not
    divisible by ``k`` the leftover rows are not used at all.

    Parameters
    ----------
    k : int
        Number of folds; must satisfy ``2 <= k <= samples``.
    trainer : callable
        Called as ``trainer(train_x, train_target, *args, **kwargs)``.  Its
        return value (the trained parameters) is handed to ``tester`` unchanged.
    tester : callable
        Called as ``tester(test_x, params)``; must return exactly one score or
        label per test row.  Values greater than 0.5 count as class 1,
        everything else as class 0.
    x : array-like, shape (samples, features)
        Feature matrix.
    target : array-like, shape (samples,) or (samples, 1)
        Labels; passed to ``trainer`` as a column of shape (rows, 1).
    *args, **kwargs
        Extra arguments forwarded to ``trainer``.

    Returns
    -------
    float
        Mean of the per-fold accuracies.  The per-fold values and the average
        are also printed.

    Raises
    ------
    ValueError
        If ``x`` is not 2-D, ``target`` does not have one entry per row of
        ``x``, ``k`` is outside ``[2, samples]``, or ``tester`` returns the
        wrong number of predictions.
    """
    x = np.asarray(x)
    if x.ndim != 2:
        raise ValueError('x must be 2-D (samples, features), got shape %r' % (x.shape,))
    samples = x.shape[0]
    target = np.asarray(target).reshape(-1, 1)
    if target.shape[0] != samples:
        raise ValueError('target has %d rows but x has %d' % (target.shape[0], samples))
    if k < 2 or k > samples:
        raise ValueError('k must be between 2 and the number of samples (%d), got %r'
                         % (samples, k))

    fold_size = samples // k
    used = fold_size * k
    # Shuffle through an index permutation rather than stacking x and target:
    # no full copy of the data and no dtype coercion between the two arrays.
    order = np.random.permutation(samples)[:used]

    accuracies = []
    for i in range(k):
        start, stop = i * fold_size, (i + 1) * fold_size
        test_idx = order[start:stop]
        # Every fold except the i-th, in fold order.
        train_idx = np.concatenate((order[:start], order[stop:]))

        the = trainer(x[train_idx], target[train_idx], *args, **kwargs)
        scores = np.asarray(tester(x[test_idx], the)).reshape(-1)
        if scores.size != fold_size:
            raise ValueError('tester returned %d predictions for %d test rows'
                             % (scores.size, fold_size))

        predicted = (scores > 0.5).astype(int)
        corrects = int(np.count_nonzero(predicted == target[test_idx].reshape(-1)))
        accuracy = corrects / fold_size
        print('%f at k=%d' % (accuracy, i))
        accuracies.append(accuracy)

    result = sum(accuracies) / k
    print('Average: %f' % result)
    return result

In [21]:
# 5-fold cross-validation of the regularised logistic regression on the
# z-scored features; everything after `train_target` is forwarded to the trainer.
iters = 500
lr = 0.2
k_fold(
    5,
    regression.regression,
    regression.logistic_h,
    z_scale_processed_x,
    train_target,
    regression.logistic_h,
    iters,
    lr,
    lbd=5.0,
    log=True,
    logInterval=125,
)

2.5873417287275706
0.6643015257513767
0.6609301771769782
0.6602388876789526
0.605500 at k=0
3.397101039052818
0.6683400645042892
0.6618265853731926
0.6605565731127802
0.609200 at k=1
2.7235534411831623
0.6645272647228058
0.659916300554506
0.6593062984421857
0.603125 at k=2
2.655693783682454
0.6644451475640342
0.6605751955084778
0.6598511296136765
0.606875 at k=3
2.85293289488634
0.6655182515166114
0.6609652540625447
0.6604146665523453
0.610550 at k=4
Average: 0.607050


0.60705

In [22]:
def shuffle(x, label):
    """Return copies of ``x`` and ``label`` with their rows permuted in unison.

    The two arrays are stacked side by side, the stacked rows are shuffled in
    place with NumPy's global random state, and the last column is split back
    off as the label column (shape ``(rows, 1)``).  The inputs are not modified.
    """
    stacked = np.hstack((x, label))
    np.random.shuffle(stacked)  # shuffles rows of the stacked copy only
    return stacked[:, :-1], stacked[:, -1].reshape(-1, 1)

In [23]:
# Fit the logistic regression on the full training set.
iters = 500
lr = 0.2
# Rows and labels are shuffled together and the fit uses the shuffled pair.
# Previously the shuffled arrays were computed but the unshuffled ones were
# passed to the trainer, leaving the shuffle as dead code.
shuffled_x, shuffled_label = shuffle(z_scale_processed_x, train_target)
the = regression.regression(shuffled_x, shuffled_label, regression.logistic_h, iters, lr, lbd=5.0, log=True)

2.479528125968463
0.6677671836011526
0.660634653431319
0.6597069461473278
0.6595142755955541


In [24]:
result = regression.logistic_h(z_scale_processed_x, the)

In [25]:
result = [1 if e > 0.5 else 0 for e in result]

In [27]:
# Number of training rows whose thresholded prediction equals the label.
matches = np.asarray(result) == train_target.ravel()
accurate = int(np.count_nonzero(matches))
accurate

121644

In [32]:
# Score the scaled test set with the fitted parameters, threshold at 0.5,
# and print how many rows end up in class 1.
test_scores = regression.logistic_h(processed_test, the)
result = [1 if score > 0.5 else 0 for score in test_scores]
print(np.sum(result))

139725


In [None]:
# Write the predictions as a two-column CSV: 1-based row id and 0/1 label.
submission = pd.DataFrame({
    'Id': range(1, len(result) + 1),
    'Predicted': [1 if value > 0.5 else 0 for value in result],
})
submission.to_csv('result.csv', index=False)

In [16]:
import torch
from torch import functional as F

In [70]:
class Net(torch.nn.Module):
    """Fully connected classifier with two hidden layers.

    Parameters
    ----------
    features : int
        Number of input features per sample.
    hidden1, hidden2 : int
        Widths of the first and second hidden layers.
    out : int
        Number of output units (one raw score per class).
    """

    def __init__(self, features, hidden1, hidden2, out):
        super(Net, self).__init__()
        self.layer1 = torch.nn.Linear(features, hidden1)
        self.layer2 = torch.nn.Linear(hidden1, hidden2)
        self.layer3 = torch.nn.Linear(hidden2, out)

    def forward(self, x):
        """Return raw class scores for a batch ``x`` of shape (batch, features).

        A ReLU follows each hidden layer.  Without a non-linearity the three
        stacked ``Linear`` layers collapse into a single affine map, so the
        hidden layers would add no modelling capacity.  No activation is applied
        to the output layer; the scores are left raw.
        """
        h1 = torch.relu(self.layer1(x))
        h2 = torch.relu(self.layer2(h1))
        return self.layer3(h2)
    

In [71]:
net = Net(z_scale_processed_x.shape[1], 260, 127, 2)
net

Net(
  (layer1): Linear(in_features=32, out_features=260, bias=True)
  (layer2): Linear(in_features=260, out_features=127, bias=True)
  (layer3): Linear(in_features=127, out_features=2, bias=True)
)

In [72]:
optimizer = torch.optim.Adam(net.parameters())
loss_func = torch.nn.CrossEntropyLoss()

In [73]:
epoch = 20


def trainer(x, target):
    """Train a fresh copy of the global ``net`` on one fold and return it.

    Each call deep-copies ``net`` and builds a new Adam optimizer, so nothing
    learned on one cross-validation fold carries over to the next.  Training
    the shared ``net`` directly would mean every fold after the first is scored
    on rows the network has already been trained on.  As a consequence the
    global ``net`` itself is left untouched by this function.

    Parameters
    ----------
    x : array-like, shape (rows, features)
        Training features.
    target : array-like, shape (rows, 1)
        Class indices; converted to a 1-D long tensor for the loss.

    Returns
    -------
    torch.nn.Module
        The trained copy, to be passed to ``tester``.
    """
    import copy  # local import: not part of the notebook's import cell

    model = copy.deepcopy(net)
    # Same optimizer type and default hyper-parameters as the notebook's
    # optimizer cell, but bound to this fold's model.
    fold_optimizer = torch.optim.Adam(model.parameters())
    inputs = torch.as_tensor(x, dtype=torch.float32)
    labels = torch.as_tensor(target).long().squeeze(1)
    for _ in range(epoch):
        loss = loss_func(model(inputs), labels)
        fold_optimizer.zero_grad()
        loss.backward()
        fold_optimizer.step()
    return model


def tester(x, *args):
    """Predict a class index for every row of ``x``.

    ``args[0]`` is the model returned by ``trainer``; when it is missing or
    ``None`` the global ``net`` is used instead.  Returns an integer NumPy
    array of shape (rows, 1) holding the index of the highest-scoring class.
    """
    model = args[0] if args and args[0] is not None else net
    with torch.no_grad():  # inference only, no autograd graph needed
        o = model(torch.as_tensor(x, dtype=torch.float32))
    return torch.max(o, 1)[1].numpy().reshape(-1, 1)

In [74]:
k_fold(5, trainer, tester, z_scale_processed_x, train_target)

0.607600 at k=0
0.606700 at k=1
0.606775 at k=2
0.606200 at k=3
0.607800 at k=4
Average: 0.607015


0.607015

In [116]:
te = train_x[:, 2]
kb = KBinsDiscretizer(n_bins=10, encode='onehot-dense', strategy='kmeans')
kb.fit(te.reshape(-1, 1))

KBinsDiscretizer(encode='onehot-dense', n_bins=10, strategy='kmeans')

In [16]:
# One-hot discretise every feature column into 20 uniform-width bins.
samples, features = train_x.shape
discretizer = []  # fitted KBinsDiscretizer per feature, in column order
# Start from an empty (samples, 0) block so the result is well-defined even
# with zero features, collect the encoded columns, and stack them ONCE at the
# end.  Calling np.hstack inside the loop re-copies the growing matrix on every
# iteration.
encoded_columns = [np.empty(shape=(samples, 0))]
for i in range(features):
    kb = KBinsDiscretizer(n_bins=20, encode='onehot-dense', strategy='uniform')
    col = train_x[:, i].reshape(-1, 1)
    kb.fit(col)
    discretizer.append(kb)
    print(i)
    encoded_columns.append(kb.transform(col))
discretized = np.hstack(encoded_columns)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31


In [17]:
discretized.shape

(200000, 640)

In [18]:
# 5-fold cross-validation of the logistic regression on the one-hot
# discretised features; everything after `train_target` goes to the trainer.
iters = 200
lr = 0.5
k_fold(
    5,
    regression.regression,
    regression.logistic_h,
    discretized,
    train_target,
    regression.logistic_h,
    iters,
    lr,
    lbd=5.0,
    log=True,
    logInterval=50,
)

5.164201496857981
0.8493710190342197
0.750037518652131
0.7174333294845987
0.586625 at k=0
7.036856801796076
0.8543927005530155
0.7598158076018899
0.7249947703874418
0.580575 at k=1
6.7882568962446195
0.8268020560109115
0.7555480123263164
0.7236738801810155
0.586275 at k=2
7.013016941453348
0.8625226108011852
0.7483188563307671
0.7133159906789235
0.592300 at k=3
6.537905596745202
0.8403178038529427
0.7469365415608864
0.7125058931226065
0.582900 at k=4
Average: 0.585735


0.5857349999999999

In [40]:
import sklearn
import sklearn.ensemble
import sklearn.svm
import sklearn.linear_model

In [48]:
def sk_train(x, y):
    """Fit an AdaBoost ensemble whose base learner is a linear-kernel SVC.

    ``y`` is flattened with ``ravel()`` before fitting.  Returns the fitted
    classifier.
    """
    base = sklearn.svm.SVC(probability=True, kernel='linear')
    ensemble = sklearn.ensemble.AdaBoostClassifier(base_estimator=base)
    ensemble.fit(x, y.ravel())
    return ensemble

def sk_test(x, classifier):
    """Return ``classifier.predict(x)`` reshaped into a single column."""
    labels = classifier.predict(x)
    return labels.reshape(-1, 1)

In [None]:
k_fold(5, sk_train, sk_test, z_scale_processed_x, train_target)

In [41]:
classifier = sk_train(z_scale_processed_x, train_target)

In [42]:
# Fraction of training rows the fitted classifier labels correctly.
res = classifier.predict(z_scale_processed_x)
accurate = int(np.count_nonzero(res == train_target.ravel()))
accurate / len(res)

0.98812

In [43]:
res = classifier.predict(processed_test)
res.sum()

111613.0

In [32]:
# Write the classifier's test predictions as a two-column CSV:
# 1-based row id and integer label.
forest_submission = pd.DataFrame({
    'Id': range(1, len(res) + 1),
    'Predicted': [int(label) for label in res],
})
forest_submission.to_csv('forest.csv', index=False)