In [142]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder as OHE
from sklearn.preprocessing import LabelEncoder as LE
from sklearn import cross_validation as cv
from sklearn import metrics
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier as KNC
from sklearn.linear_model import LogisticRegression as LR
from scipy import sparse
from itertools import combinations
import sys

In [67]:
def get_data():
    X = pd.read_csv("../dataset/train.csv")
    X_test = pd.read_csv("../dataset/test.csv")
    return X, X_test

In [68]:
def get_target(X, X_test):
    y = X.ACTION
    del X['ACTION']
    IDs = X_test.id
    del X_test['id']
    return y, IDs

In [13]:
sample = pd.read_csv("../dataset/sampleSubmission.csv")

# Data Summarization/Visualization

In [14]:
def count_ones(X):
    """
    Counts the number of 1-count categories in each feature
    """
    value_counts = dict()
    for c in all_data.columns:
        value_counts[c] = pd.Series(all_data[c].value_counts())
    ones = list()
    for c in X.columns:
        for i, v in value_counts[c].iteritems():
            if v == 1: ones.append(i)               
    return ones

# Data Preprocessing

In [85]:
from sklearn.preprocessing import OneHotEncoder

def one_hot(all_data):
#    all_data = pd.get_dummies(pd.concat([X, X_test], ignore_index=True))
#    all_data = pd.concat([X, X_test], ignore_index=True)
    enc = OneHotEncoder()
    final_data = pd.DataFrame(enc.fit_transform(all_data))
    return final_data

In [125]:
def one_hot_separate(data):
    keys = []
    for c in data.T:
        uniques = set(list(c))
        keys.append(dict((key, i) for i, key in enumerate(uniques)))
    total = data.shape[0]
    hot_data = []
    for i, c in enumerate(data.T):
        kmap = keys[i]
        num_labels = len(kmap)
        spmat = sparse.lil_matrix((total, num_labels))
        for j, val in enumerate(c):
            if val in kmap:
                    spmat[j, kmap[val]] = 1
        hot_data.append(spmat)
    hot_data = sparse.hstack(hot_data).tocsr()
    return hot_data, keys

In [16]:
def drop_ones(data):
    ones = list()
    for c in data.columns:
        if data[c].sum() == 1:
            ones.append(c)
    return data.drop(ones, axis=1)

In [35]:
def group_data(data, degree=3, hash=hash):
    """   
    Groups all columns of data into all combinations of triples
    """
    new_data = []
    m,n = data.shape
    for indicies in combinations(range(n), degree):
        if 5 in indicies and 7 in indicies:
            pass
        elif 2 in indicies and 3 in indicies:
            pass
        else:
            new_data.append([hash(tuple(v)) for v in data[:,indicies]])
    return np.array(new_data).T

# LinearSVC

In [19]:
from sklearn.svm import LinearSVC

## Data Not Preprocessed

### Default Parameters

In [35]:
lsvc = svm.LinearSVC()
y_pred = pd.DataFrame(lsvc.fit(X, y).predict(X_test))

In [42]:
y_pred.columns = ['ACTION']
test1 = pd.concat([IDs, y_pred], axis=1)
test1.to_csv("../results/test1.csv")

### With Parameters Evaluated

In [None]:
from sklearn import grid_search as GS

parameters = {'C':[0.1, 1, 10, 100, 1000], 'loss': ['squared_hinge', 'hinge']}

## Data Preprocessed (Initial)

In [None]:
all_data = pd.concat([X, X_test], ignore_index=True)
final_data = one_hot(all_data)

final_data = drop_ones(final_data)
X, X_test = final_data[:32769], final_data[32769:]

### Default Parameters

In [10]:

from sklearn.cross_validation import train_test_split as split
X_train, X_val, y_train, y_val = split(X, y, test_size=0.33, random_state=42)

lsvc = svm.LinearSVC()
lsvc.fit(X_train, y_train)
score = lsvc.score(X_val, y_val)
print score

0.946273349362


In [12]:
y_pred = pd.DataFrame(lsvc.fit(X, y).predict(X_test))
y_pred.columns = ['ACTION']
test2 = pd.concat([IDs, y_pred], axis=1)
test2.to_csv("../results/test2.csv", index=False)

## New Approach With Feature Extraction

In [128]:
X, X_test = get_data()
all_data = np.vstack((X.ix[:,1:-1], X_test.ix[:,1:-1]))
num_train = np.shape(X)[0]

### Use Label Encoder Initially Instead of One Hot

In [129]:
le = LE()
for column in range(all_data.shape[1]):
    le.fit(all_data[:, column])
    all_data[:, column] = le.transform(all_data[:, column])

### Get First Order Features

In [130]:
for c in range(all_data.shape[1]):
    le.fit(all_data[:, c])
    all_data[:, c] = le.transform(all_data[:, c])
    uniques = len(set(all_data[:,c]))
    maximum = max(all_data[:,c])
    if maximum < 65534:
        count_map = np.bincount((all_data[:, c]).astype('uint16'))
        for n,i in enumerate(all_data[:, c]):
            if count_map[i] <= 1:
                all_data[n, c] = uniques
            elif count_map[i] == 2:
                all_data[n, c] = uniques+1
    else:
        for n,i in enumerate(all_data[:, c]):
            if (all_data[:, c] == i).sum() <= 1:
                all_data[n, c] = uniques
            elif (all_data[:, c] == i).sum() == 2:
                all_data[n, c] = uniques+1
    uniques = len(set(all_data[:,c]))
    le.fit(all_data[:, c])
    all_data[:, c] = le.transform(all_data[:, c])

### Get Second Order Features

In [131]:
sec = group_data(all_data, degree=2) 
for c in range(sec.shape[1]):
    le.fit(sec[:, c])
    sec[:, c] = le.transform(sec[:, c])
    uniques = len(set(sec[:,c]))
    maximum = max(sec[:,c])
    if maximum < 65534:
        count_map = np.bincount((sec[:, c]).astype('uint16'))
        for n,i in enumerate(sec[:, c]):
            if count_map[i] <= 1:
                sec[n, c] = uniques
            elif count_map[i] == 2:
                sec[n, c] = uniques+1
    else:
        for n,i in enumerate(sec[:, c]):
            if (sec[:, c] == i).sum() <= 1:
                sec[n, c] = uniques
            elif (sec[:, c] == i).sum() == 2:
                sec[n, c] = uniques+1
    uniques = len(set(sec[:,c]))
    le.fit(sec[:, c])
    sec[:, c] = le.transform(sec[:, c])

In [132]:
third = group_data(all_data, degree=3)
for c in range(third.shape[1]):
    le.fit(third[:, c])
    third[:, c] = le.transform(third[:, c])
    uniques = len(set(third[:,c]))
    maximum = max(third[:,c])
    if maximum < 65534:
        count_map = np.bincount((third[:, c]).astype('uint16'))
        for n,i in enumerate(third[:, c]):
            if count_map[i] <= 1:
                third[n, c] = uniques
            elif count_map[i] == 2:
                third[n, c] = uniques+1
    else:
        for n,i in enumerate(third[:, c]):
            if (third[:, c] == i).sum() <= 1:
                third[n, c] = uniques
            elif (third[:, c] == i).sum() == 2:
                third[n, c] = uniques+1
    uniques = len(set(third[:,c]))
    le.fit(third[:, c])
    third[:, c] = le.transform(third[:, c])

In [133]:
# Collect the training features together
y = np.array(X.ACTION)
X = all_data[:num_train]
X_2 = sec[:num_train]
X_3 = third[:num_train]

# Collect the testing features together
X_test = all_data[num_train:]
X_test_2 = sec[num_train:]
X_test_3 = third[num_train:]

allX_train = np.hstack((X, X_2, X_3))
allX_test = np.hstack((X_test, X_test_2, X_test_3))
num_features = allX_train.shape[1]

In [134]:
X_hot = [one_hot_separate(allX_train[:,[i]])[0] for i in range(num_features)]

### Greedy Feature Selection Using AUC Scores

In [140]:
lr = LR(class_weight='balanced', penalty='l2')

def cv_loop(X, y, model, N):
    mean_auc = 0.
    for i in range(N):
        X_train, X_cv, y_train, y_cv = cv.train_test_split(
                                       X, y, test_size=1.0/float(N), 
                                       random_state = i*SEED)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_cv)[:,1]
        auc = metrics.roc_auc_score(y_cv, preds)
        mean_auc += auc
    return mean_auc/N    

score_hist = []
N = 10
good_features = set([])
# Greedy feature selection loop
while len(score_hist) < 2 or score_hist[-1][0] > score_hist[-2][0]:
    scores = []
    for f in range(len(X_hot)):
        if f not in good_features:
            feats = list(good_features) + [f]
            X_partial = sparse.hstack([X_hot[j] for j in feats]).tocsr()
            score = cv_loop(X_partial, y, lr, N)
            scores.append((score, f))
    good_features.add(sorted(scores)[-1][1])
    score_hist.append(sorted(scores)[-1])

In [143]:
# Remove last added feature from good_features
good_features.remove(score_hist[-1][1])
good_features = sorted(list(good_features))
print "Selected features %s" % good_features

Selected features [0, 7, 8, 28, 34, 36, 50, 60, 61, 62, 64, 69]


### Grid Search for best hyperparams (not technically sklearn's grid search)

In [157]:
score_hist = []
X_partial = sparse.hstack([X_hot[j] for j in good_features]).tocsr()
C_list = np.logspace(-5, 5, 10, base=2)
for C in C_list:
    score = cv_loop(X_partial, y, lr, N)
    score_hist.append((score,C))
    print "C: %f Mean AUC: %f" %(C, score)
bestC = sorted(score_hist)[-1][1]
print "Best C value: %f" % (bestC)

C: 0.031250 Mean AUC: 0.909181
C: 0.067504 Mean AUC: 0.909181
C: 0.145816 Mean AUC: 0.909181
C: 0.314980 Mean AUC: 0.909181
C: 0.680395 Mean AUC: 0.909181
C: 1.469734 Mean AUC: 0.909181
C: 3.174802 Mean AUC: 0.909181
C: 6.857952 Mean AUC: 0.909181
C: 14.813995 Mean AUC: 0.909181
C: 32.000000 Mean AUC: 0.909181
Best C value: 32.000000


In [158]:
bestC

32.0

In [161]:
X_best_feats = np.vstack((allX_train[:,good_features], allX_test[:,good_features]))
X_best_feats, keymap = one_hot_separate(X_best_feats)
X_train = X_best_feats[:num_train]
X_test = X_best_feats[num_train:]

In [164]:
model = LR(C=bestC, class_weight='balanced', penalty='l2')
model.fit(X_train, y)
preds = model.predict_proba(X_test)[:,1]

content = ['id,ACTION']
for i, p in enumerate(preds):
    content.append('%i,%f' %(i+1,p))
f = open("finalTestSubmission.csv", 'w')
f.write('\n'.join(content))
f.close()
preds = model.predict_proba(X_train)[:,1]
content = ['id,ACTION']
for i, p in enumerate(preds):
    content.append('%i,%f' %(i+1,p))
f = open("finalTrainSubmission.csv", 'w')
f.write('\n'.join(content))
f.close()