In [1]:
# Import all the necessary libraries
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
import networkx as nx

In [2]:
# Load the train and test data

def load_train_data():
    with open('data/train.csv') as f:
        features = f.next().rstrip().split(',')[1:]
        data = np.loadtxt(f, delimiter=',')
        X, Y = data[:,1:], data[:,:1]
        return features, X, Y.flatten()

def load_test_data():
    with open('data/test.csv') as f:
        features = f.next().rstrip().split(',')[1:]
        return np.loadtxt(f, delimiter=',')

features, X, Y = load_train_data()
test_data = load_test_data()

In [3]:
## Useful functions

# Pairwise transform:
# Divide each row in two parts, one for each person in the comparison and subtract the logs of them.
# A +1 factor is used to have only non-negative logs.
def trans(x):
    return np.log(1 + x[:,:11]) - np.log(1 + x[:,11:] )

# Rolling-hash on the features of a person
def hash(a):
    base = 100
    res = 0
    for i in range(len(a)):
        res += base ** i * a[i]
    return res

# Creates an undirected graph where all the rows, which are comparisons, are represented by edges.
def get_edges(x):
    A = x[:,:11]
    B = x[:,11:]

    mapa = {}
    tot = 0
    edges = []
    adj = {}
    for i in range(len(A)):
        a = hash(list(A[i]))
        if a not in mapa:
            mapa[a] = tot
            adj[tot] = set()
            tot += 1
        b = hash(list(B[i]))
        if b not in mapa:
            mapa[b] = tot
            adj[tot] = set()
            tot += 1
        edges.append((mapa[a], mapa[b]))
        adj[mapa[a]].add(mapa[b])
        adj[mapa[b]].add(mapa[a])
    return edges, tot, adj

# Given the probability predictions, print the output in the format Kaggle can accept.
def generate_test_output(pred_probs):
    with open("test_labeled.csv", "w") as f:
        f.write('Id,Choice\n')
        for i,prob in enumerate(pred_probs):
            f.write(str(i+1)+','+str(0.6 if prob[1] == 0.5 else prob[1])+'\n')
            
# Pairwise transform for simple numbers
def subtractlogs(a, b):
    return np.log(1 + a) - np.log(1 + b)

# Given two sets of features, create a graph of their combination
def create_graph(x1, x2):
    x = np.concatenate((x1, x2))
    edges, n, adj = get_edges(x)
    ug = nx.Graph()
    map(lambda i: ug.add_node(i), range(n))
    map(lambda (a, b): ug.add_edge(a, b), edges)
    return ug

In [4]:
# Create an undirected graph of the combination of the train and test data
ug = create_graph(trans(X), trans(test_data))

In [5]:
# Calculate several graph centrality measures
pr = nx.pagerank(ug)
degc = nx.degree_centrality(ug)
close = nx.closeness_centrality(ug)
load = nx.load_centrality(ug)
bet = nx.betweenness_centrality(ug)
ecc = nx.eccentricity(ug)

In [6]:
def network_features(x1, x2):
    x = np.concatenate((x1, x2))
    edges, n, adj = get_edges(x)
    features = []
    for i in range(len(x)):
        a = edges[i][0]
        b = edges[i][1]
        features.append([
            subtractlogs(degc[a], degc[b]),
            subtractlogs(close[a], close[b]),
            subtractlogs(len(adj[a]), len(adj[b])),
            subtractlogs(load[a], load[b]),
            subtractlogs(bet[a], bet[b]),
            subtractlogs(ecc[a], ecc[b]),
            subtractlogs(pr[a], pr[b])
        ])
    return features[:len(x1)], features[len(x1):]

x_feat, test_x_feat = network_features(trans(X), trans(test_data))

In [7]:
# Combine the original features with the network features
x = trans(X)
test_x = trans(test_data)
x_feat, test_x_feat = network_features(x, test_x)

x = np.concatenate((x, x_feat), axis=1)
test_x = np.concatenate((test_x, test_x_feat), axis=1)

In [8]:
# For every feature, make a negative copy of it, so that there's a balance between all the cases were A < B and B > A.
xx = []
yy = []
for i in range(len(Y)):
    xx.append(x[i])
    yy.append(Y[i])
    xx.append(-x[i])
    yy.append(1 - Y[i])

# For every row, append the ratio following / followers as a new feature
for i in range(len(xx)):
    xx[i] = np.append(xx[i], xx[i][0] / (xx[i][1] + 1)) 
tx = []
for i in range(len(test_x)):
    tx.append(np.append(test_x[i], test_x[i][0] / (test_x[i][0] + 1)))

In [9]:
# First classifier, Bagging + LR
clf = BaggingClassifier(
    LogisticRegression(C=1/.60),
    max_samples=0.3, max_features=0.7, n_estimators=200, n_jobs=-1, random_state=3)
clf.fit(xx, yy)
proba1 = clf.predict_proba(tx)

# Second classifier, Bagging + GBC
clf = BaggingClassifier(
    GradientBoostingClassifier(max_depth=2, max_features=.99, subsample=0.9),
    max_samples=0.3, max_features=0.7, n_estimators=50, n_jobs=-1, random_state=1
)
clf.fit(xx, yy)
proba2 = clf.predict_proba(tx)

# Combine both results as a weigthed average.
generate_test_output(proba1 * .72 + proba2 * .28)