In [1]:
"""
Michael E. Ramsey
CSCI 5352
Date Created: 11/05/18
Last Edited: 12/05/18

This is a python script to create edge prediction models with the features created by:
"Facebook_Feature_Generator.ipynb".

I implement several different machine learning models and check out aspects of all.
"""

# Get necessary libraries
import sys
import io
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import numpy as np
from random import randint
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
np.set_printoptions(formatter={'float_kind':'{:f}'.format})

  from ._conv import register_converters as _register_converters


In [2]:
"""
Extract all filenames for facebook100 dataset
"""
filepath = "Facebook_Features_97/"

# Get list of filenames that contain edge information
# Had to exclude a bunch of files that I did not need
# Could have done this more efficiently
files = [f for f in listdir(filepath) if isfile(join(filepath, f))]
len(files)

30

In [3]:
"""
Loop through all files and compute accuracy, recall, precision
"""

# Empty data frame to store features
feature_df = pd.DataFrame()

# Number of models we are running
models_running = 6

# Loop though files and store edges
for filename in files:
    
    """
    Load the data 
    """
    # Print for tracking
    print(filename, end = ", ")
    
    # Load the data
    data = pd.read_csv(filepath + filename)
    
    # Create column for month
    data['school'] = [filename[:-4]]*len(data)
    
    # Sample 70% of the rows
    data = data.sample(frac=0.7)

    # Append to data frame
    feature_df = feature_df.append(data)
    


American75.csv, Amherst41.csv, Bowdoin47.csv, Brandeis99.csv, Bucknell39.csv, Caltech36.csv, Colgate88.csv, Hamilton46.csv, Haverford76.csv, Howard90.csv, Johns Hopkins55.csv, Lehigh96.csv, Middlebury45.csv, MIT8.csv, Oberlin44.csv, Pepperdine86.csv, Reed98.csv, Rice31.csv, Rochester38.csv, Santa74.csv, Smith60.csv, Swarthmore42.csv, Trinity100.csv, USFCA72.csv, Vassar85.csv, Vermont70.csv, Wellesley22.csv, Wesleyan43.csv, William77.csv, Williams40.csv, 

In [4]:
""" 
View the data
"""
feature_df.head()

Unnamed: 0.1,Unnamed: 0,Node_1,Node_2,shortest_path,common_neighbors,pref_attach,neighbor_sum,sorensen,cosine_sim,hub_prom,...,same_status,same_major,same_dorm,same_year,local_path001,local_path01,local_path1,edge,label,school
260963,260963,5849,1094,1.0,24.0,2772.0,107.0,0.224299,0.455842,0.545455,...,0.0,0.0,0.0,0.0,24.917,33.17,115.7,1.0,Tr,American75
748383,748383,1841,2841,4.0,0.0,12.0,13.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tr,American75
836603,836603,462,3723,2.0,1.0,3306.0,125.0,0.008,0.017392,0.026316,...,1.0,0.0,0.0,0.0,1.037,1.37,4.7,0.0,Tr,American75
265978,265978,2211,3517,1.0,6.0,2318.0,99.0,0.060606,0.124622,0.157895,...,1.0,1.0,1.0,1.0,6.242,8.42,30.2,1.0,Tr,American75
769650,769650,696,3276,3.0,0.0,740.0,57.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.003,0.03,0.3,0.0,Tr,American75


In [5]:
"""
Process the data
"""

# Delete the first column
feature_df = feature_df.drop(columns = ['Unnamed: 0'])
print('First column dropped')
#feature_df.head()

# Drop lhn and adamic-adar
feature_df = feature_df.drop(columns = ['lhn','adamic_adar'])

# Drop rows with nas
#data = data.replace([np.inf, -np.inf], np.nan)
feature_df = feature_df.dropna()
print('NAs dropped')
#feature_df.head()
#data = data[pd.notnull(data)]

# Standardize the data
temp = feature_df[['same_gender', 'same_status','same_major','same_dorm','same_year','edge','label', 'school']]
print('Extraction complete')
#temp.head()
feature_df = feature_df.drop(columns = ['Node_1','Node_2','same_gender','same_status','same_major','same_dorm','same_year','edge','label','school'])
print('Columns dropped')
#feature_df.head()
names = list(feature_df)
feature_df = feature_df.values #returns a numpy array
print('Converted to numpy')
min_max_scaler = preprocessing.MinMaxScaler()
feature_df_scaled = min_max_scaler.fit_transform(feature_df)
print('Data scaled')
feature_df = pd.DataFrame(feature_df_scaled)
print('Converted to pandas')
#feature_df.head()
feature_df.columns = names
#feature_df.head()
feature_df = feature_df.join(temp.reset_index())
print('Data joined')

# Drop the index column
feature_df = feature_df.drop(columns = ['index'])
feature_df = feature_df.drop(columns = ['shortest_path'])

# Drop shortest path column

First column dropped
NAs dropped
Extraction complete
Columns dropped
Converted to numpy
Data scaled
Converted to pandas
Data joined


In [6]:
""" 
View the data
"""
feature_df.head()

Unnamed: 0,common_neighbors,pref_attach,neighbor_sum,sorensen,cosine_sim,hub_prom,hub_depr,resource_all,local_cluster_sum,local_cluster_prod,...,local_path01,local_path1,same_gender,same_status,same_major,same_dorm,same_year,edge,label,school
0,0.036923,0.001923,0.038889,0.448598,0.455842,0.545455,0.380952,0.024593,0.327409,0.106239,...,0.026662,0.017555,1.0,0.0,0.0,0.0,0.0,1.0,Tr,American75
1,0.0,8e-06,0.004074,0.0,0.0,0.0,0.0,0.0,0.068182,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,Tr,American75
2,0.001538,0.002294,0.045556,0.016,0.017392,0.026316,0.011494,0.000627,0.167842,0.027653,...,0.001101,0.000713,1.0,1.0,0.0,0.0,0.0,0.0,Tr,American75
3,0.009231,0.001608,0.035926,0.121212,0.124622,0.157895,0.098361,0.009088,0.11232,0.012217,...,0.006768,0.004582,0.0,1.0,1.0,1.0,1.0,1.0,Tr,American75
4,0.0,0.000513,0.02037,0.0,0.0,0.0,0.0,0.0,0.191924,0.034353,...,2.4e-05,4.6e-05,1.0,0.0,0.0,0.0,0.0,0.0,Tr,American75


In [7]:
""" 
Separate into train/valid/test
"""
train = feature_df[feature_df['label'] == 'Tr']
valid = feature_df[feature_df['label'] == 'V']
test = feature_df[feature_df['label'] == 'T']

# Separate out values
y_train = train['edge']
s_train = train['school']
x_train = train.drop(columns = ['edge', 'label', 'school'])
y_test = test['edge']
s_test = test['school']
x_test = test.drop(columns = ['edge', 'label', 'school'])
y_valid = valid['edge']
s_valid = valid['school']
x_valid = valid.drop(columns = ['edge', 'label', 'school'])

In [8]:
"""
Logistic Regression Classifier
"""

# Create the model and display the loss function
logistic = SGDClassifier(random_state=345, loss = 'log', max_iter = 10000, tol = .001, verbose = 1)
clf = logistic.fit(x_train, y_train)

# Construct confusion matrix for each group
log_res = np.zeros((len(files),3))
file_names = [x[:-4] for x in files]
counter = 0
for file in file_names:
    min_ind = min(np.where(s_valid == file)[0])
    max_ind = max(np.where(s_valid == file)[0])
    y_pred = clf.predict(x_valid.iloc[min_ind:max_ind])
    mat = confusion_matrix(y_valid.iloc[min_ind:max_ind], y_pred)

    # Compute accuracy, precision, recall
    log_res[counter][0] = (mat[0][0] + mat[1][1])/np.sum(mat)
    log_res[counter][1] = mat[1][1]/(mat[1][1]+mat[1][0])
    log_res[counter][2] = mat[1][1]/(mat[1][1]+mat[0][1])
    counter += 1
    
# Display the results
log_res

-- Epoch 1
Norm: 15.34, NNZs: 18, Bias: -1.466929, T: 10856732, Avg. loss: 0.268100
Total training time: 7.62 seconds.
-- Epoch 2
Norm: 15.31, NNZs: 18, Bias: -1.492893, T: 21713464, Avg. loss: 0.267461
Total training time: 14.14 seconds.
Convergence after 2 epochs took 14.14 seconds


array([[0.898211, 0.834563, 0.955944],
       [0.882869, 0.903487, 0.868862],
       [0.885296, 0.896482, 0.878065],
       [0.896816, 0.875565, 0.915808],
       [0.903073, 0.879855, 0.923077],
       [0.891228, 0.920266, 0.879365],
       [0.894883, 0.886288, 0.901361],
       [0.897772, 0.923632, 0.880734],
       [0.858256, 0.928504, 0.821131],
       [0.867267, 0.876367, 0.861394],
       [0.906899, 0.867356, 0.941235],
       [0.905079, 0.869440, 0.937790],
       [0.898266, 0.895247, 0.901068],
       [0.901745, 0.852622, 0.944231],
       [0.887508, 0.865471, 0.904284],
       [0.894011, 0.895911, 0.895578],
       [0.880383, 0.929487, 0.845481],
       [0.895478, 0.869245, 0.919182],
       [0.909172, 0.873449, 0.941896],
       [0.897059, 0.895278, 0.899044],
       [0.884987, 0.837398, 0.927331],
       [0.871044, 0.908484, 0.843363],
       [0.890266, 0.912591, 0.872203],
       [0.889180, 0.849611, 0.924741],
       [0.891053, 0.886965, 0.894277],
       [0.882565, 0.80006

In [9]:
# Save to csv
mytable = pd.DataFrame(data=log_res,    # values
            index=file_names,    # 1st column as index
            columns=['Accuracy','Precision','Recall'])
mytable.to_csv('log97' + '.csv')

In [10]:
"""
SVM Classifier
"""

# Create the model and display the loss function
svm = SGDClassifier(random_state=345, loss = 'hinge', max_iter = 10000, tol = .001, verbose = 1)
clf = svm.fit(x_train, y_train)

# Construct confusion matrix for each group
svm_res = np.zeros((len(files),3))
file_names = [x[:-4] for x in files]
counter = 0
for file in file_names:
    min_ind = min(np.where(s_valid == file)[0])
    max_ind = max(np.where(s_valid == file)[0])
    y_pred = clf.predict(x_valid.iloc[min_ind:max_ind])
    mat = confusion_matrix(y_valid.iloc[min_ind:max_ind], y_pred)

    # Compute accuracy, precision, recall
    svm_res[counter][0] = (mat[0][0] + mat[1][1])/np.sum(mat)
    svm_res[counter][1] = mat[1][1]/(mat[1][1]+mat[1][0])
    svm_res[counter][2] = mat[1][1]/(mat[1][1]+mat[0][1])
    counter += 1
    
# Display the results
svm_res

-- Epoch 1
Norm: 10.86, NNZs: 18, Bias: -0.917161, T: 10856732, Avg. loss: 0.261778
Total training time: 4.67 seconds.
-- Epoch 2
Norm: 10.82, NNZs: 18, Bias: -0.973807, T: 21713464, Avg. loss: 0.260114
Total training time: 9.28 seconds.
-- Epoch 3
Norm: 10.85, NNZs: 18, Bias: -0.931382, T: 32570196, Avg. loss: 0.260067
Total training time: 13.95 seconds.
Convergence after 3 epochs took 13.95 seconds


array([[0.902946, 0.847471, 0.952903],
       [0.881929, 0.914695, 0.859567],
       [0.888700, 0.916779, 0.869147],
       [0.900124, 0.893634, 0.906667],
       [0.903437, 0.891833, 0.913383],
       [0.885965, 0.930233, 0.864198],
       [0.895810, 0.900780, 0.891504],
       [0.894416, 0.936861, 0.866037],
       [0.855334, 0.941675, 0.809871],
       [0.868954, 0.895150, 0.851240],
       [0.911903, 0.884089, 0.935608],
       [0.909564, 0.886944, 0.930464],
       [0.901272, 0.910014, 0.894737],
       [0.907140, 0.868792, 0.939780],
       [0.889739, 0.883408, 0.893714],
       [0.894956, 0.911524, 0.885199],
       [0.881978, 0.948718, 0.836158],
       [0.900279, 0.885820, 0.913870],
       [0.915061, 0.892237, 0.936036],
       [0.897059, 0.907083, 0.889802],
       [0.889669, 0.857143, 0.918482],
       [0.871044, 0.927550, 0.831624],
       [0.890524, 0.930281, 0.860443],
       [0.893543, 0.867761, 0.916895],
       [0.893939, 0.905243, 0.885230],
       [0.889642, 0.81498

In [11]:
# Save to csv
mytable = pd.DataFrame(data=svm_res,    # values
            index=file_names,    # 1st column as index
            columns=['Accuracy','Precision','Recall'])
mytable.to_csv('svm97' + '.csv')

In [12]:
"""
Perceptron Classifier
"""

# Create the model and display the loss function
svm = SGDClassifier(random_state=345, loss = 'perceptron', max_iter = 30, tol = .001, verbose = 0)
clf = svm.fit(x_train, y_train)

# Construct confusion matrix for each group
pec_res = np.zeros((len(files),3))
file_names = [x[:-4] for x in files]
counter = 0
for file in file_names:
    min_ind = min(np.where(s_valid == file)[0])
    max_ind = max(np.where(s_valid == file)[0])
    y_pred = clf.predict(x_valid.iloc[min_ind:max_ind])
    mat = confusion_matrix(y_valid.iloc[min_ind:max_ind], y_pred)

    # Compute accuracy, precision, recall
    pec_res[counter][0] = (mat[0][0] + mat[1][1])/np.sum(mat)
    pec_res[counter][1] = mat[1][1]/(mat[1][1]+mat[1][0])
    pec_res[counter][2] = mat[1][1]/(mat[1][1]+mat[0][1])
    counter += 1

# Display results
pec_res

array([[0.906628, 0.848788, 0.959500],
       [0.880990, 0.931507, 0.847112],
       [0.891423, 0.939107, 0.858380],
       [0.913978, 0.917454, 0.912209],
       [0.912711, 0.908893, 0.916209],
       [0.880702, 0.943522, 0.847761],
       [0.906748, 0.923449, 0.893242],
       [0.889228, 0.947084, 0.851351],
       [0.849976, 0.968956, 0.789272],
       [0.873453, 0.938604, 0.830975],
       [0.918726, 0.893216, 0.940724],
       [0.917667, 0.896126, 0.937838],
       [0.907977, 0.930318, 0.890853],
       [0.913797, 0.878725, 0.943921],
       [0.902486, 0.920564, 0.887585],
       [0.896089, 0.938290, 0.867950],
       [0.858054, 0.967949, 0.792651],
       [0.906937, 0.901166, 0.913219],
       [0.924340, 0.908189, 0.939494],
       [0.903170, 0.928027, 0.884574],
       [0.896400, 0.860627, 0.928571],
       [0.869154, 0.959009, 0.811290],
       [0.893106, 0.945890, 0.854323],
       [0.916230, 0.906655, 0.925861],
       [0.893458, 0.927369, 0.868468],
       [0.908160, 0.85049

In [13]:
# Save to csv
mytable = pd.DataFrame(data=pec_res,    # values
            index=file_names,    # 1st column as index
            columns=['Accuracy','Precision','Recall'])
mytable.to_csv('pec97' + '.csv')

In [14]:
"""
KNN Classifier
"""

# Create the model and display the loss function
#neigh = KNeighborsClassifier(n_neighbors=3)
#clf = neigh.fit(x_train, y_train)

# Construct confusion matrix
#y_pred = clf.predict(x_valid)
#mat = confusion_matrix(y_valid, y_pred)

# Compute accuracy, precision, recall
#acc = (mat[0][0] + mat[1][1])/np.sum(mat)
#prec = mat[1][1]/(mat[1][1]+mat[1][0])
#rec = mat[1][1]/(mat[1][1]+mat[0][1])

'\nKNN Classifier\n'

In [15]:
"""
Random Forest Classifier
"""

# Create the model and display the loss function
forest = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=345, verbose = 0)
clf = forest.fit(x_train, y_train)

# Construct confusion matrix for each group
rf_res = np.zeros((len(files),3))
file_names = [x[:-4] for x in files]
counter = 0
for file in file_names:
    min_ind = min(np.where(s_valid == file)[0])
    max_ind = max(np.where(s_valid == file)[0])
    y_pred = clf.predict(x_valid.iloc[min_ind:max_ind])
    mat = confusion_matrix(y_valid.iloc[min_ind:max_ind], y_pred)

    # Compute accuracy, precision, recall
    rf_res[counter][0] = (mat[0][0] + mat[1][1])/np.sum(mat)
    rf_res[counter][1] = mat[1][1]/(mat[1][1]+mat[1][0])
    rf_res[counter][2] = mat[1][1]/(mat[1][1]+mat[0][1])
    counter += 1

# Display results
rf_res

array([[0.910310, 0.865648, 0.950260],
       [0.876292, 0.934620, 0.838079],
       [0.887679, 0.941813, 0.850856],
       [0.907155, 0.910472, 0.905637],
       [0.908529, 0.916878, 0.902143],
       [0.863158, 0.963455, 0.812325],
       [0.898406, 0.926421, 0.876891],
       [0.887702, 0.947685, 0.848681],
       [0.845592, 0.964252, 0.786043],
       [0.868954, 0.895991, 0.850679],
       [0.920849, 0.900821, 0.937916],
       [0.916655, 0.904735, 0.928172],
       [0.899191, 0.927088, 0.878443],
       [0.912879, 0.883114, 0.937929],
       [0.897706, 0.912876, 0.885093],
       [0.899679, 0.931227, 0.878639],
       [0.859649, 0.964744, 0.796296],
       [0.902756, 0.906384, 0.901404],
       [0.923269, 0.917760, 0.928956],
       [0.898969, 0.921173, 0.882525],
       [0.896693, 0.896051, 0.898660],
       [0.858290, 0.948522, 0.801773],
       [0.887684, 0.940166, 0.849553],
       [0.903578, 0.892826, 0.914159],
       [0.893218, 0.928812, 0.867086],
       [0.901385, 0.83497

In [16]:
# Save to csv
mytable = pd.DataFrame(data=rf_res,    # values
            index=file_names,    # 1st column as index
            columns=['Accuracy','Precision','Recall'])
mytable.to_csv('rf97' + '.csv')

In [17]:
"""
Adaboost Classifier
"""

# Create the model and display the loss function
adaboy = AdaBoostClassifier(random_state=345)
clf = adaboy.fit(x_train, y_train)

# Construct confusion matrix for each group
ada_res = np.zeros((len(files),3))
file_names = [x[:-4] for x in files]
counter = 0
for file in file_names:
    min_ind = min(np.where(s_valid == file)[0])
    max_ind = max(np.where(s_valid == file)[0])
    y_pred = clf.predict(x_valid.iloc[min_ind:max_ind])
    mat = confusion_matrix(y_valid.iloc[min_ind:max_ind], y_pred)

    # Compute accuracy, precision, recall
    ada_res[counter][0] = (mat[0][0] + mat[1][1])/np.sum(mat)
    ada_res[counter][1] = mat[1][1]/(mat[1][1]+mat[1][0])
    ada_res[counter][2] = mat[1][1]/(mat[1][1]+mat[0][1])
    counter += 1

# Display results
ada_res

array([[0.920042, 0.884615, 0.951814],
       [0.880363, 0.931507, 0.846154],
       [0.886658, 0.939783, 0.850582],
       [0.910877, 0.931828, 0.895422],
       [0.906710, 0.911797, 0.902948],
       [0.877193, 0.966777, 0.829060],
       [0.898035, 0.921219, 0.880014],
       [0.891059, 0.948286, 0.853355],
       [0.844618, 0.954845, 0.789269],
       [0.876125, 0.929913, 0.840172],
       [0.923882, 0.909644, 0.935837],
       [0.919693, 0.911621, 0.927862],
       [0.904277, 0.931241, 0.883925],
       [0.919307, 0.897205, 0.937711],
       [0.909815, 0.931454, 0.892025],
       [0.899112, 0.933086, 0.876397],
       [0.861244, 0.961538, 0.800000],
       [0.902292, 0.900859, 0.905026],
       [0.922555, 0.914569, 0.930400],
       [0.901451, 0.928789, 0.881142],
       [0.902839, 0.887340, 0.917167],
       [0.860652, 0.945663, 0.806504],
       [0.891299, 0.940687, 0.854846],
       [0.906632, 0.904927, 0.909644],
       [0.898990, 0.936989, 0.870809],
       [0.912677, 0.85467

In [18]:
# Save to csv
mytable = pd.DataFrame(data=ada_res,    # values
            index=file_names,    # 1st column as index
            columns=['Accuracy','Precision','Recall'])
mytable.to_csv('ada97' + '.csv')

In [19]:
"""
Naive Bayes Classifier
"""

# Create the model and display the loss function
naive = BernoulliNB()
clf = naive.fit(x_train, y_train)

# Construct confusion matrix for each group
nb_res = np.zeros((len(files),3))
file_names = [x[:-4] for x in files]
counter = 0
for file in file_names:
    min_ind = min(np.where(s_valid == file)[0])
    max_ind = max(np.where(s_valid == file)[0])
    y_pred = clf.predict(x_valid.iloc[min_ind:max_ind])
    mat = confusion_matrix(y_valid.iloc[min_ind:max_ind], y_pred)

    # Compute accuracy, precision, recall
    nb_res[counter][0] = (mat[0][0] + mat[1][1])/np.sum(mat)
    nb_res[counter][1] = mat[1][1]/(mat[1][1]+mat[1][0])
    nb_res[counter][2] = mat[1][1]/(mat[1][1]+mat[0][1])
    counter += 1

# Display results
nb_res

array([[0.815755, 0.980506, 0.737176],
       [0.713122, 0.998755, 0.637014],
       [0.726004, 0.997294, 0.647912],
       [0.740902, 0.992608, 0.661829],
       [0.745226, 0.990926, 0.664881],
       [0.728070, 0.993355, 0.661504],
       [0.736003, 0.995169, 0.654928],
       [0.700031, 0.998196, 0.628788],
       [0.658061, 0.996237, 0.602732],
       [0.710349, 0.992150, 0.635254],
       [0.811524, 0.989656, 0.729045],
       [0.793662, 0.991392, 0.712224],
       [0.741040, 0.993078, 0.660731],
       [0.806474, 0.988681, 0.723340],
       [0.757489, 0.988469, 0.674978],
       [0.743057, 0.993680, 0.665588],
       [0.724083, 0.983974, 0.646316],
       [0.734128, 0.992020, 0.656510],
       [0.794254, 0.989365, 0.713081],
       [0.737968, 0.992003, 0.658493],
       [0.744805, 0.993031, 0.665370],
       [0.679263, 0.997140, 0.607433],
       [0.716757, 0.994277, 0.637638],
       [0.816754, 0.982714, 0.739753],
       [0.720539, 0.994709, 0.642436],
       [0.867058, 0.96896

In [20]:
# Save to csv
mytable = pd.DataFrame(data=nb_res,    # values
            index=file_names,    # 1st column as index
            columns=['Accuracy','Precision','Recall'])
mytable.to_csv('nb97' + '.csv')