In [129]:
import pickle
import argparse
import time
import itertools
from copy import deepcopy
import numpy as np
from tqdm import tqdm
import pandas as pd
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
import xgboost as xgb

In [120]:
# --- Run configuration ---
cv_iter = 100          # max_iter for the estimator used inside the grid search
train_iter = 1000      # max_iter for the final estimator
clf_type = 'svm_lin'   # model name passed to build_model

category = 'astro-ph'


def _load_pickle(path):
    """Unpickle and return the object stored at `path`, closing the file afterwards."""
    # NOTE(review): pickle can execute arbitrary code while loading; only
    # point this at files you produced yourself.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


author_ind = _load_pickle(category + '_author_ind.pkl')
train_adj_list = _load_pickle(category + '_train_adj_list.pkl')
test_adj_list = _load_pickle(category + '_test_adj_list.pkl')
# Entries that appear in the test adjacency data but not in the training data.
pred_edges = set(test_adj_list) - set(train_adj_list)

# Build dataframes
train_df = pd.read_csv(category + "_train_df.csv")
test_df = pd.read_csv(category + "_test_df.csv")

# Build matrices: the first column is skipped, the last column is the target,
# and every column in between is used as a feature.
train_mat = train_df[train_df.columns[1:-1]].values
train_out = train_df[train_df.columns[-1]].values
test_mat = test_df[test_df.columns[1:-1]].values
test_out = test_df[test_df.columns[-1]].values

In [128]:
def build_model(clf_type, iters, C = None):
    """Instantiate an unfitted classifier of the requested type.

    Parameters
    ----------
    clf_type : str
        'logreg' (LogisticRegression), 'svm_lin' (LinearSVC) or
        'svm_rbf' (SVC, created with probability=True).
    iters : int
        Passed to the estimator as ``max_iter``.
    C : float, optional
        Passed to the estimator as ``C`` when given; when omitted the
        estimator's own default applies.

    Returns
    -------
    The constructed estimator. It is always created with
    ``class_weight='balanced'``.

    Raises
    ------
    ValueError
        If `clf_type` is not one of the supported names.
    """
    params = {'max_iter': iters, 'class_weight': 'balanced'}
    if clf_type == 'logreg':
        model = LogisticRegression
    elif clf_type == 'svm_lin':
        model = LinearSVC
    elif clf_type == 'svm_rbf':
        model = SVC
        # make_predictions calls the probability API for this model type.
        params['probability'] = True
    else:
        # Fail with a clear message instead of "'NoneType' object is not callable".
        raise ValueError(
            "Unknown clf_type %r; expected 'logreg', 'svm_lin' or 'svm_rbf'"
            % (clf_type,))
    if C is not None:
        params['C'] = C
    return model(**params)

def make_predictions(clf, test_mat, test_out, k=None, model_type=None):
    """Score the test rows with a fitted classifier and print top-k accuracy.

    Parameters
    ----------
    clf : fitted estimator
        Must provide ``predict_proba`` when `model_type` is 'logreg' or
        'svm_rbf', and ``decision_function`` otherwise.
    test_mat : array-like
        Feature rows to score.
    test_out : array-like
        Label for each row of `test_mat`; the labels of the selected rows
        are summed to compute the accuracy.
    k : int, optional
        Number of highest-scoring rows to evaluate. Defaults to
        ``len(pred_edges)`` taken from the notebook namespace.
    model_type : str, optional
        Model name used to choose the scoring method and shown in the
        printed message. Defaults to the notebook-level ``clf_type``.

    Returns
    -------
    numpy.ndarray
        One score per test row: column 1 of ``predict_proba`` for the
        probabilistic models, the ``decision_function`` value otherwise.
    """
    # Fall back to the notebook-level settings so existing three-argument
    # calls keep working unchanged.
    if model_type is None:
        model_type = clf_type
    if k is None:
        k = len(pred_edges)

    if model_type in ('logreg', 'svm_rbf'):
        probs = np.asarray(clf.predict_proba(test_mat))[:, 1]
    else:
        probs = clf.decision_function(test_mat)

    # Reverse first, then slice: unlike [-k:], this stays empty when k == 0
    # ([-0:] would select every row).
    top_ind = probs.argsort()[::-1][:max(k, 0)]
    if k > 0:
        accuracy = np.sum(np.asarray(test_out)[top_ind]) / float(k)
    else:
        # Nothing to predict: the ratio is undefined rather than a crash.
        accuracy = float('nan')
    print('Top-k accuracy for %s model: %0.4f' % (model_type, accuracy))
    return probs

In [126]:
# Model selection: 3-fold grid search over C, with the candidate estimator
# limited to cv_iter iterations.
clf = build_model(clf_type, cv_iter)
C = [0.1, 1.0, 10.0]
parameters = {"C": C}
gs = GridSearchCV(clf, param_grid=parameters, cv=3, scoring="accuracy")
gs.fit(train_mat, train_out)

best_clf = gs.best_estimator_
best_accuracy = gs.best_score_
C_opt = gs.best_params_['C']
# Single-argument print call: valid as both a Python 2 statement and a
# Python 3 function call.
print("C_optimal is " + str(C_opt))

# Build a fresh estimator with the selected C and the larger train_iter
# budget, fit it on the training data, then score the test set.
clf = build_model(clf_type, train_iter, C=C_opt)
clf = clf.fit(train_mat, train_out)
make_predictions(clf, test_mat, test_out)

C_optimal is 0.1
Top-k accuracy for svm_lin model: 0.0729


array([-0.90940238, -1.43606789, -1.64985883, ..., -1.36671229,
       -2.15432759, -1.62663014])

In [133]:
# Gradient-boosted trees (xgboost) fitted on the same train/test matrices.
num_trees = 500  # number of boosting rounds handed to xgb.train
params = dict(
    objective="binary:logistic",
    silent=1,
    eval_metric="error",
)
# Not set here (previously commented out): max_depth, min_child_weight,
# seed, lambda, alpha.

train_dmat = xgb.DMatrix(train_mat, label=train_out)
test_dmat = xgb.DMatrix(test_mat)
gbm = xgb.train(params, train_dmat, num_boost_round=num_trees)
preds = gbm.predict(test_dmat)

In [137]:
# Rank the test rows by xgboost score and keep the len(pred_edges) best,
# highest first. Reversing before slicing (instead of slicing with [-k:])
# keeps the result empty when pred_edges is empty; [-0:] would select every row.
top_ind = preds.argsort()[::-1][:len(pred_edges)]

In [138]:
# Display the xgboost scores of the selected top-ranked rows, highest first.
preds[top_ind]

array([ 0.94574654,  0.91005969,  0.90577954, ...,  0.10172259,
        0.10169885,  0.10168405], dtype=float32)