# Influencers in Social Networks
Albert Byun, Hussein Danish, Nick Hamlin, Vincent Chio, Walter Erquinigo Pezo

## Background 

See https://www.kaggle.com/c/predict-who-is-more-influential-in-a-social-network/data for more info on the problem setup. An example solution is available at https://gist.github.com/fhuszar/5372873

In [108]:
# This tells matplotlib not to try opening a new window for each plot.
%matplotlib inline

# General libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.grid_search import GridSearchCV

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report

## Data Ingest

In [109]:
def load_train_data():
    with open('data/train.csv') as f:
        features = f.next().rstrip().split(',')[1:]
        data = np.loadtxt(f, delimiter=',')
        X, Y = data[:,1:], data[:,:1]
        return features, X, Y.flatten()

def load_test_data():
    with open('data/test.csv') as f:
        features = f.next().rstrip().split(',')[1:]
        return np.loadtxt(f, delimiter=',')

def prepare_train_data(X, Y, test_size=1.0/2):
    np.random.seed(0)
    
    shuffle = np.random.permutation(np.arange(X.shape[0]))
    X, Y = X[shuffle], Y[shuffle]
    break_point = int(X.shape[0] * (1 - test_size))
    return X[:break_point], Y[:break_point], X[break_point:], Y[break_point:]


features, X, Y = load_train_data()
train_data, train_labels, dev_data, dev_labels = prepare_train_data(X, Y)
test_data = load_test_data()

train_data = np.array(train_data)
dev_data = np.array(train_data)
test_data = np.array(test_data)

print train_data.shape, dev_data.shape, test_data.shape

(2750, 22) (2750, 22) (5952, 22)


In [110]:
def transform_features(x):
    return np.log(1+x)

def subtract(x):
    return np.array([xx[0:11] - xx[11:] for xx in x])

## KNN

In [111]:
def knn(x_train, y_train, x_test, y_test):
    kn = KNeighborsClassifier(n_neighbors=10)
    kn.fit(x_train, y_train)
    return kn.score(x_test, y_test)

print knn(
    subtract(transform_features(dev_data)), 
    dev_labels, 
    subtract(transform_features(train_data)),
    train_labels)
print knn(
    subtract(dev_data), 
    dev_labels, 
    subtract(train_data),
    train_labels)

0.485090909091
0.494181818182


In [112]:
from sklearn import linear_model

def logistic(x_train, y_train, x_test, y_test):
    clf = logistic = linear_model.LogisticRegression(fit_intercept=False)
    clf.fit(x_train, y_train)
    return clf.score(x_test, y_test)

print logistic(
    subtract(transform_features(dev_data)), 
    dev_labels, 
    subtract(transform_features(train_data)),
    train_labels)
print logistic(
    subtract(dev_data), 
    dev_labels, 
    subtract(train_data),
    train_labels)

0.463272727273
0.450545454545


## ISOMAP

In [113]:
from sklearn import manifold

def isomap(n_neighbors, n_components):
    isomap = manifold.Isomap(n_neighbors, n_components)
    dev_x = isomap.fit_transform(subtract(transform_features(dev_data)))
    train_x = isomap.transform(subtract(transform_features(train_data)))
    return knn(
        dev_x, 
        dev_labels, 
        train_x,
        train_labels)

def isomap2(n_neighbors, n_components):
    isomap = manifold.Isomap(n_neighbors, n_components)
    dev_x = isomap.fit_transform(subtract(transform_features(dev_data)))
    train_x = isomap.transform(subtract(transform_features(train_data)))
    return logistic(
        dev_x, 
        dev_labels, 
        train_x,
        train_labels)
    
for nei in range(2, 5):
    for com in range(3, 8):
        print "knn %d %d %f" % (nei, com, isomap(nei, com))
        print "log %d %d %f" % (nei, com, isomap2(nei, com))   

knn 2 3 0.476364
log 2 3 0.452727
knn 2 4 0.486909
log 2 4 0.498545


KeyboardInterrupt: 

In [115]:
from sklearn.decomposition import PCA

def pca(n_components):
    isomap = PCA(n_components)
    dev_x = isomap.fit_transform(subtract(transform_features(dev_data)))
    train_x = isomap.transform(subtract(transform_features(train_data)))
    return knn(
        dev_x, 
        dev_labels, 
        train_x,
        train_labels)

def pca2(n_components):
    isomap = PCA(n_components)
    dev_x = isomap.fit_transform(subtract(transform_features(dev_data)))
    train_x = isomap.transform(subtract(transform_features(train_data)))
    return logistic(
        dev_x, 
        dev_labels, 
        train_x,
        train_labels)
    
for com in range(2, 10):
    print "knn %d %f" % (com, pca(com))
    print "log %d %f" % (com, pca2(com)) 

knn 2 0.487636
log 2 0.413455
knn 3 0.476364
log 3 0.415273
knn 4 0.490545
log 4 0.447636
knn 5 0.480000
log 5 0.426909
knn 6 0.485818
log 6 0.433818
knn 7 0.486909
log 7 0.441818
knn 8 0.492364
log 8 0.442545
knn 9 0.490182
log 9 0.449818


## Export Results for Kaggle Submission

In [118]:
def generate_test_output(pred_probs):
    with open("test_labeled.csv", "w") as f:
        f.write('Id,Choice\n')
        for i,prob in enumerate(pred_probs):
            f.write(str(i+1)+','+str(prob[1])+'\n')


isomap = manifold.Isomap(4, 7)
x = isomap.fit_transform(subtract(transform_features(X)))
test_x = isomap.transform(subtract(transform_features(test_data)))

clf = logistic = KNeighborsClassifier(n_neighbors=3)
clf.fit(x, Y)
proba = clf.predict_proba(test_x)
    
generate_test_output(proba)

In [None]:
#best

def generate_test_output(pred_probs):
    with open("test_labeled.csv", "w") as f:
        f.write('Id,Choice\n')
        for i,prob in enumerate(pred_probs):
            f.write(str(i+1)+','+str(prob[1])+'\n')


isomap = manifold.Isomap(4, 7)
x = isomap.fit_transform(subtract(transform_features(X)))
test_x = isomap.transform(subtract(transform_features(test_data)))

clf = logistic = linear_model.LogisticRegression(fit_intercept=False)
clf.fit(x, Y)
proba = clf.predict_proba(test_x)
    
generate_test_output(proba)