In [2]:
"""
Michael E. Ramsey
CSCI 5352
Date Created: 11/05/18
Last Edited: 12/05/18

This notebook builds edge prediction models from the features created by
"Facebook_Feature_Single_Model.ipynb".

Several machine learning models are trained on each network and compared by
accuracy, precision, and recall.
"""

# Get necessary libraries
import sys
import io
from os import listdir
from os.path import isfile, join
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf
import numpy as np
from random import randint
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
# Print NumPy floats in fixed-point notation (six decimal places) rather than
# scientific notation whenever an array is displayed.
np.set_printoptions(formatter=dict(float_kind='{:f}'.format))

  from ._conv import register_converters as _register_converters


In [56]:
"""
Extract all filenames for facebook100 dataset
"""
filepath = "Facebook_Features_70/"

# Collect the per-network feature files that sit directly inside `filepath`.
# - Only regular files ending in ".csv" are kept: later cells read every entry with
#   pd.read_csv and strip the last four characters to obtain the network name.
# - os.listdir returns entries in arbitrary order, so the list is sorted to make the
#   row order of the result tables reproducible across machines and runs.
files = sorted(
    f for f in listdir(filepath)
    if isfile(join(filepath, f)) and f.lower().endswith(".csv")
)
len(files)

30

In [57]:
"""
Loop through all files and compute accuracy, recall, precision
"""

# Column groups found in every feature file
ID_COLUMNS = ['Node_1', 'Node_2']
BINARY_COLUMNS = ['same_gender', 'same_status', 'same_major', 'same_dorm', 'same_year']
PASSTHROUGH_COLUMNS = BINARY_COLUMNS + ['edge', 'label']


def _build_models():
    """Return fresh (tracking name, estimator) pairs in result-column order.

    The position of each pair is the column it fills in `acc`, `prec` and `rec`.
    KNeighborsClassifier(n_neighbors=3) is deliberately not part of the list.
    """
    return [
        ('logistic', SGDClassifier(random_state=345, loss='log', max_iter=10000, tol=.001, verbose=0)),
        ('SVM', SGDClassifier(random_state=345, loss='hinge', max_iter=10000, tol=.001, verbose=0)),
        ('Perceptron', SGDClassifier(random_state=345, loss='perceptron', max_iter=30, tol=.001, verbose=0)),
        ('Random Forest', RandomForestClassifier(n_estimators=100, max_depth=5, random_state=345, verbose=0)),
        ('Adaboost', AdaBoostClassifier(random_state=345)),
        ('Naive Bayes', BernoulliNB()),
    ]


def _load_features(path):
    """Load one feature file and min-max scale its continuous columns.

    Steps: read the CSV, delete the 'Unnamed: 0' column, drop rows containing NaN,
    then scale every column that is neither a node id nor a pass-through column.

    Returns a DataFrame holding the scaled columns followed by the untouched
    same_* indicators, 'edge' and 'label'. Node ids are not included.
    """
    raw = pd.read_csv(path)

    # Delete the first column
    raw = raw.drop(columns=['Unnamed: 0'])

    # Drop rows with NaNs and renumber the rows 0..n-1. drop=True matters: without it
    # the old row index comes back as an extra 'index' column and ends up being used
    # as an (unscaled) feature by every model.
    raw = raw.dropna().reset_index(drop=True)

    continuous = [c for c in raw.columns if c not in ID_COLUMNS + PASSTHROUGH_COLUMNS]

    # Fit the scaler on the training rows only so that the ranges of the validation
    # and test rows do not leak into preprocessing, then apply it to all rows.
    scaler = preprocessing.MinMaxScaler()
    scaler.fit(raw.loc[raw['label'] == 'Tr', continuous])
    scaled = pd.DataFrame(scaler.transform(raw[continuous]), columns=continuous)

    return scaled.join(raw[PASSTHROUGH_COLUMNS])


def _split_xy(data, split_label):
    """Return (features, target) for the rows whose 'label' equals `split_label`."""
    subset = data[data['label'] == split_label]
    return subset.drop(columns=['edge', 'label']), subset['edge']


def _binary_metrics(mat):
    """Return (accuracy, precision, recall) from a 2x2 confusion matrix.

    sklearn.metrics.confusion_matrix puts the true classes on the rows and the
    predicted classes on the columns, so mat[0][1] counts false positives and
    mat[1][0] counts false negatives. Precision or recall is NaN when its
    denominator is zero (no predicted positives / no actual positives).
    Assumes 'edge' is binary with the positive class sorting last -- TODO confirm.
    """
    tn, fp = mat[0][0], mat[0][1]
    fn, tp = mat[1][0], mat[1][1]
    accuracy = (tn + tp) / np.sum(mat)
    precision = tp / (tp + fp) if (tp + fp) else np.nan
    recall = tp / (tp + fn) if (tp + fn) else np.nan
    return accuracy, precision, recall


# Empty arrays to store results: one row per file, one column per model
models_running = len(_build_models())
acc = np.zeros((len(files), models_running))
prec = np.zeros((len(files), models_running))
rec = np.zeros((len(files), models_running))

# Loop through files and calculate results
for file_num, filename in enumerate(files):

    # Print for tracking
    print(filename)

    # Load the data and standardize
    data = _load_features(join(filepath, filename))

    # Separate into train/valid/test
    x_train, y_train = _split_xy(data, 'Tr')
    x_valid, y_valid = _split_xy(data, 'V')
    x_test, y_test = _split_xy(data, 'T')  # held out; not used in this cell

    # Fit every model on the training rows and score it on the validation rows
    for model_num, (model_name, model) in enumerate(_build_models()):
        clf = model.fit(x_train, y_train)

        # Construct confusion matrix
        y_pred = clf.predict(x_valid)
        mat = confusion_matrix(y_valid, y_pred)

        # Compute accuracy, precision, recall
        (acc[file_num][model_num],
         prec[file_num][model_num],
         rec[file_num][model_num]) = _binary_metrics(mat)

        # Print for tracking: names are comma separated, the last one ends the line
        print(model_name, end=", " if model_num < models_running - 1 else "\n")

American75.csv
logistic, SVM, Perceptron, Random Forest, Adaboost, Naive Bayes
Amherst41.csv
logistic, SVM, Perceptron, Random Forest, Adaboost, Naive Bayes
Bowdoin47.csv
logistic, SVM, Perceptron, Random Forest, Adaboost, Naive Bayes
Brandeis99.csv
logistic, SVM, Perceptron, Random Forest, Adaboost, Naive Bayes
Bucknell39.csv
logistic, SVM, Perceptron, Random Forest, Adaboost, Naive Bayes
Caltech36.csv
logistic, SVM, Perceptron, Random Forest, Adaboost, Naive Bayes
Colgate88.csv
logistic, SVM, Perceptron, Random Forest, Adaboost, Naive Bayes
Hamilton46.csv
logistic, SVM, Perceptron, Random Forest, Adaboost, Naive Bayes
Haverford76.csv
logistic, SVM, Perceptron, Random Forest, Adaboost, Naive Bayes
Howard90.csv
logistic, SVM, Perceptron, Random Forest, Adaboost, Naive Bayes
Johns Hopkins55.csv
logistic, SVM, Perceptron, Random Forest, Adaboost, Naive Bayes
Lehigh96.csv
logistic, SVM, Perceptron, Random Forest, Adaboost, Naive Bayes
Middlebury45.csv
logistic, SVM, Perceptron, Random For

In [62]:
"""
Save results to data frame
"""

# Network names: the file names with their four-character extension removed
names = [fname[:-4] for fname in files]

# One column per model, in the order the result arrays were filled
model_columns = ['Logistic Regression', 'SVM', 'Perceptron',
                 'Random Forest', 'Adaboost', 'Naive Bayes']

# Wrap each result array in a data frame (rows = networks, columns = models)
Accuracy = pd.DataFrame(acc, index=names, columns=model_columns)
Precision = pd.DataFrame(prec, index=names, columns=model_columns)
Recall = pd.DataFrame(rec, index=names, columns=model_columns)

# Write each table to its own csv file
for frame, out_name in ((Accuracy, 'Accuracy_70.csv'),
                        (Precision, 'Precision_70.csv'),
                        (Recall, 'Recall_70.csv')):
    frame.to_csv(out_name)