In [1]:
import pandas as pd
import numpy as np
import sys
import getopt
import os
import random
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss
from sklearn import feature_extraction
from joblib import Parallel, delayed


In [2]:
def load_train_data(path):
    """Read the training CSV at ``path`` and split it into model inputs.

    The rows are shuffled in place with numpy's global generator after it is
    reseeded with 2015, so the row order is the same on every call (note that
    this reseeds the process-wide numpy RNG as a side effect).

    Column layout, as implied by the slicing below: the first column holds
    the row ids, the last column holds the class labels, and everything in
    between is treated as numeric features.

    Returns
    -------
    tuple ``(X, y, y_coded, ids)``
        X        -- feature matrix cast to float32
        y        -- labels passed through ``LabelBinarizer``
        y_coded  -- labels passed through ``LabelEncoder``, cast to int32
        ids      -- first column cast to str
    """
    rows = pd.read_csv(path).values.copy()

    # Deterministic row shuffle (fixed seed on the global numpy RNG).
    np.random.seed(seed=2015)
    np.random.shuffle(rows)

    features = rows[:, 1:-1].astype(np.float32)
    labels = rows[:, -1]
    ids = rows[:, 0].astype(str)

    y = LabelBinarizer().fit_transform(labels)
    y_coded = LabelEncoder().fit_transform(labels).astype(np.int32)
    return features, y, y_coded, ids
    
def load_test_data(path):
    """Read the test CSV at ``path``.

    The first column is returned as string ids; every remaining column is
    cast to float32 and returned as the feature matrix. Row order is kept
    exactly as in the file.

    Returns
    -------
    tuple ``(X, ids)``
    """
    table = pd.read_csv(path).values.copy()
    features = table[:, 1:].astype(np.float32)
    row_ids = table[:, 0].astype(str)
    return features, row_ids

def transform_data(X, X_test):
    """Apply ``2 * sqrt(x + 3/8)`` element-wise to train and test features.

    Both matrices are stacked row-wise, transformed together, and then split
    back at the original train/test boundary, so the two outputs share the
    dtype of the stacked array.

    Returns
    -------
    tuple ``(X_transformed, X_test_transformed)``
    """
    n_train = X.shape[0]
    stacked = np.vstack((X, X_test))
    stacked = 2. * np.sqrt(stacked + (3. / 8.))
    train_part = stacked[0:n_train, :]
    test_part = stacked[n_train:, :]
    return train_part, test_part

def train_class(X, y, X_test, i, params=None):
    """Fit a k-NN classifier on label column ``i`` and score ``X_test``.

    Parameters
    ----------
    X : array
        Training features, one row per sample.
    y : 2-D array
        Label matrix; column ``i`` is used as the target for this model
        (in this notebook it is the ``LabelBinarizer`` output, i.e. 0/1).
    X_test : array
        Rows to score.
    i : int
        Index of the label column to model.
    params : sequence, optional
        ``[p, n_neighbors]`` for ``KNeighborsClassifier``. When omitted the
        module-level ``par`` set by the driver loop is used, which is how the
        existing four-argument calls keep working.

    Returns
    -------
    pandas.DataFrame
        A single column named ``'Class_<i+1>_knn8'`` holding the predicted
        probability of label 1 for every row of ``X_test``.
    """
    if params is None:
        # Backward-compatible fallback: hyper-parameters come from the
        # global assigned in the driver loop.
        params = par

    # Shuffle the training rows with the stdlib RNG before fitting.
    order = list(range(X.shape[0]))
    random.shuffle(order)
    yi = y[order, i]

    clf = KNeighborsClassifier(n_neighbors=params[1], weights='distance', p=params[0])
    clf.fit(X[order, :], yi)

    proba = clf.predict_proba(X_test)
    if proba.shape[1] > 1:
        positive = proba[:, 1]
    else:
        # Only one label value was present in the training rows, so
        # predict_proba has a single column; report 1.0 if that value is the
        # positive label and 0.0 otherwise instead of raising IndexError.
        positive = np.full(proba.shape[0], float(clf.classes_[0] == 1))

    return pd.DataFrame({'Class_'+str(i+1)+'_knn8': positive})

In [None]:
# --- Configuration ----------------------------------------------------------
train_file = './data/train.csv'
test_file = './data/test.csv'
pred_file = './data/sampleSubmission.csv'
cv = 0  # 0: fit on all training rows and score the test set; otherwise cross-validate
nfolds = 10
target_col = 'target'

if cv == 0: 
    nfolds = 2

# --- Load and transform the data ---------------------------------------------
X, y, y_coded, ids_train = load_train_data(train_file)
X_test, ids_test = load_test_data(test_file)
X, X_test = transform_data(X, X_test)
num_classes = len(y[0])
num_features = X.shape[1]

# Folds are built in both modes; the training ids are collected in the order
# the validation folds are visited so they line up with the stacked
# out-of-fold predictions produced below.
skf = StratifiedKFold(y_coded, nfolds, random_state=2015)
ids_train_folds = np.empty(0)
for train_index, valid_index in skf:
    ids_train_folds = np.append(ids_train_folds, ids_train[valid_index])

# Ids matching the rows of every prediction frame written by this cell.
ids_out = ids_test if cv == 0 else ids_train_folds

# Each entry is [p, n_neighbors] for one k-NN configuration ("epoch").
pars = [[2,5], [2,20], [2,40], [2,60], [2,80], [2,100], [1,5], [1,20], [1,50], [1,100], [1,70]]
epoch = len(pars)

# --- Train one model set per configuration -----------------------------------
for e in range(epoch):
    print("processing iteration %d" % e)
    seed = 1105 + 20*e
    # Seed the stdlib RNG used by train_class's row shuffle so that every
    # epoch is reproducible across runs.
    random.seed(seed)
    par = pars[e]  # read as a module-level global by train_class

    if cv == 0:
        # One model per class, columns concatenated side by side.
        # A joblib variant would be:
        #   Parallel(n_jobs=6)(delayed(train_class)(X, y, X_test, i) for i in range(num_classes))
        preds_epoch = pd.concat(
            [train_class(X, y, X_test, i) for i in range(num_classes)], axis=1)
    else:
        for count, (train_index, valid_index) in enumerate(skf):
            print("processing fold %d" % (count+1))
            X_train, X_valid = X[train_index], X[valid_index]
            y_train, y_valid = y[train_index], y[valid_index]
            if count == 0:
                actual = y_valid
            else:
                actual = np.append(actual, y_valid, axis=0)

            preds_fold = pd.concat(
                [train_class(X_train, y_train, X_valid, i) for i in range(num_classes)],
                axis=1)

            # Stack out-of-fold predictions in fold order.
            if count == 0:
                preds_epoch = preds_fold.copy()
            else:
                preds_epoch = pd.concat([preds_epoch, preds_fold], ignore_index=True)

            # Running log loss over the folds scored so far.
            print("logloss %s" % log_loss(actual, preds_epoch.values))

    # Save this epoch's predictions together with their ids. The path is
    # kept exactly as before; its directory is created if it is missing so
    # the write cannot fail on a fresh checkout.
    epoch_path = './data/output-knn/' + os.path.splitext(pred_file)[0] + '.epoch' + str(e) + '.csv'
    epoch_dir = os.path.normpath(os.path.dirname(epoch_path))
    if not os.path.isdir(epoch_dir):
        os.makedirs(epoch_dir)
    preds_epoch['id'] = ids_out.astype(float).astype(int)
    preds_epoch.to_csv(epoch_path, index=False)
    preds_epoch = preds_epoch.drop('id', axis=1)

    # Accumulate the sum of predictions across epochs.
    if e == 0:
        preds = preds_epoch.copy()
    else:
        preds = preds.add(preds_epoch, fill_value=0)
    if cv == 1:
        # Log loss of the average over the epochs finished so far.
        preds_epoch = preds.copy()
        preds_epoch = preds_epoch.divide(e+1)
        print("final logloss %s" % log_loss(actual, preds_epoch.values))


# --- Create submission file: average over all epochs --------------------------
preds = preds.divide(epoch)
preds['id'] = ids_out.astype(float).astype(int)
preds.to_csv(os.path.splitext(pred_file)[0] + '.csv', index=False)

processing iteration 0
