---

# CSCI 3352, Spring 2022
# FINAL PROJECT

<br> 

### Owen Smith, Kyle Ma

<br> 

In [10]:
import os
import numpy as np
import pandas as pd
import networkx as nx
import math
import matplotlib
import matplotlib.pylab as plt
from sklearn.tree import DecisionTreeRegressor
from sklearn import linear_model
from pyvis.network import Network
import csv

In [11]:
### ITERATE THROUGH NETWORKS DIRECTORY, HANDLE FILES ###
# Convert every adjacency-matrix CSV in `directory` into a plain-text edge list
# (one "<row> <col>" line per non-zero matrix entry), written next to the CSV
# with the same base name and a .txt extension.
directory = 'networks'
with os.scandir(directory) as entries:
    for entry in entries:
        filename = entry.name
        name, ext = os.path.splitext(filename)
        # Exact extension match: a prefix match would also accept names such as
        # "x.csv~" and then derive a mangled output name from them.
        if ext != '.csv' or filename == 'references.csv':
            continue
        # skiprows=1 skips the first line of the file; ndmin=2 keeps a matrix
        # with a single data row two-dimensional so it can be scanned by (row, col).
        with open(entry.path, "rb") as matrix_file:
            data = np.loadtxt(matrix_file, delimiter=",", skiprows=1, ndmin=2)
        # np.argwhere returns the non-zero positions in row-major order, i.e. the
        # same order a nested row/column loop would visit them.
        edgelist = ''.join(str(row) + ' ' + str(col) + '\n'
                           for row, col in np.argwhere(data != 0))
        with open(os.path.join(directory, name + '.txt'), 'w') as listfile:
            listfile.write(edgelist)

In [12]:
# takes in a graph G and returns a list of network statistics
def gather_network_stats(G):
    """Compute a fixed list of summary statistics for a networkx graph.

    Parameters
    ----------
    G : networkx graph
        The network to summarise. It must contain at least one node
        (the mean degree divides by the node count).

    Returns
    -------
    list
        ``[n, m, kmean, kmax, C, node_connectivity, MGD, triangles,
        FFL_count, FBL_count, specialized_nodes]`` in exactly that order:
        node count, edge count, mean degree, maximum degree, transitivity,
        node connectivity, mean geodesic distance, triangle count,
        feed-forward loop count, feedback loop count, and the fraction of
        nodes with degree one.
    """
    def compute_MGD(G):
        """Mean shortest-path length over ordered pairs of distinct reachable nodes.

        Returns NaN when no pair of distinct nodes is connected (for example a
        single node or a graph without edges) instead of dividing by zero.
        """
        Z = 0      # number of (source, target) pairs with target != source
        l_sum = 0  # total of their shortest-path lengths

        for lengths in dict(nx.all_pairs_shortest_path_length(G)).values():
            # Each source reports distance 0 to itself; leave that entry out of
            # the pair count (it contributes nothing to the sum).
            Z += len(lengths) - 1
            l_sum += sum(lengths.values())

        if Z == 0:
            return float('nan')
        return l_sum / Z

    def percent_single_degree(G):
        """Fraction (0..1) of nodes whose degree is exactly one."""
        single_degree_count = sum(1 for _, k in G.degree() if k == 1)
        return single_degree_count / G.number_of_nodes()

    def count_FFBL_motifs(G, flag):
        """Count feed-forward (FFL) and feedback (FBL) loops on three distinct nodes.

        Walks every path i -> j -> k and closes it with i -> k (FFL) or
        k -> i (FBL). All three nodes must differ, so self-loops never form a
        motif. Each feedback cycle is met once per starting node, hence the
        division by three. When ``flag == 1`` every FFL and each distinct FBL
        cycle is printed.

        Note: on an undirected graph ``has_edge(i, k)`` and ``has_edge(k, i)``
        are the same test, so both counts are driven purely by triangles.
        """
        want_listing = (flag == 1)
        FFL_count = 0
        FBL_rotations = 0

        FFL_pairs = []       # only filled when a listing is requested
        FBL_pairs = []
        seen_cycles = set()  # edge sets of feedback cycles already listed

        for i in G.nodes():
            for j in G.neighbors(i):  # paths of length 1
                if j == i:
                    continue  # self-loop on i
                for k in G.neighbors(j):  # paths of length 2
                    if k == i or k == j:
                        continue  # back-and-forth step or self-loop on j
                    if G.has_edge(i, k):  # i -> k closes a feed-forward loop
                        FFL_count += 1
                        if want_listing:
                            FFL_pairs.append(((i, j), (j, k), (i, k)))
                    if G.has_edge(k, i):  # k -> i closes a feedback loop
                        FBL_rotations += 1
                        if want_listing:
                            # The three rotations of one cycle share the same
                            # edge set, so it identifies the cycle uniquely.
                            cycle = frozenset(((i, j), (j, k), (k, i)))
                            if cycle not in seen_cycles:
                                seen_cycles.add(cycle)
                                FBL_pairs.append(((i, j), (j, k), (k, i)))

        FBL_count = FBL_rotations // 3  # every cycle was found from each of its 3 nodes

        if want_listing:
            for motif in FFL_pairs:
                print("FFL: ", motif)
            for motif in FBL_pairs:
                print("FBL:", motif)

        return (FFL_count, FBL_count)

    n = G.number_of_nodes()
    m = G.number_of_edges()

    degree_lst = [k for _, k in G.degree()]
    kmean = sum(degree_lst) / n  # mean degree
    kmax = max(degree_lst)

    C = nx.transitivity(G)
    node_connectivity = nx.node_connectivity(G)
    MGD = compute_MGD(G)
    # nx.triangles reports a per-node count, so each triangle is seen three times
    triangles = sum(nx.triangles(G).values()) / 3
    FFL_count, FBL_count = count_FFBL_motifs(G, 0)
    specialized_nodes = percent_single_degree(G)
    return [n, m, kmean, kmax, C, node_connectivity, MGD, triangles, FFL_count, FBL_count, specialized_nodes]

In [4]:
def get_temp(filename, references_path="our_references.csv"):
    """Look up the temperature recorded for a network in the references CSV.

    Parameters
    ----------
    filename : str
        Value to match against the ``Filename`` column.
    references_path : str, optional
        Path of the CSV file holding ``Filename`` and ``Temperature`` columns.
        Defaults to ``"our_references.csv"`` in the working directory.

    Returns
    -------
    float
        The ``Temperature`` of the first row whose ``Filename`` matches.

    Raises
    ------
    IndexError
        If no row has the requested ``Filename``.
    """
    df = pd.read_csv(references_path)
    matches = df.loc[df["Filename"] == filename, "Temperature"]
    if matches.empty:
        # Same exception type an empty .iloc[0] lookup raises, but with a
        # message that names the missing entry.
        raise IndexError(
            "no Temperature entry for " + repr(filename) + " in " + str(references_path)
        )
    return float(matches.iloc[0])

In [20]:
def get_correlation_coefficients(data_array):
    """Correlate every network statistic with temperature.

    Parameters
    ----------
    data_array : list of rows
        Each row holds 13 values in this order: the network name, the eleven
        statistics, and the temperature.

    Returns
    -------
    pandas.Series
        Correlation of each numeric statistic column with ``Temperature``,
        indexed by column name; the ``Temperature`` self-correlation is left out.
    """
    columns = ['network', "nodes", "degrees", 'average degrees', 'highest degree',
               'Cluster Coeff', 'connectivity', 'MGD', 'triangles', 'FFL', 'FBL',
               'special nodes', 'Temperature']
    df = pd.DataFrame(data_array, columns=columns)
    # Correlate numeric columns only: the 'network' column holds names, and
    # recent pandas versions raise when corr() is given non-numeric data.
    correlations = df.select_dtypes(include='number').corr()['Temperature']
    return correlations.drop(labels='Temperature')
    

In [18]:
# iterates through networks directory and reads each in as networkx graph G
# Builds `data`: one row per network of [filename, <statistics...>, temperature].
directory = 'networks'
data = []
show = True  # render only the first network as an interactive pyvis page
# os.scandir yields entries in arbitrary order; sort the names so the rows of
# `data` land in the same order on every run (rows are later picked by position).
for filename in sorted(entry.name for entry in os.scandir(directory)):
    name, ext = os.path.splitext(filename)
    # Only edge-list .txt files are networks; skip a README stored as README.txt.
    if ext != '.txt' or name == 'README':
        continue
    G = nx.read_edgelist(os.path.join(directory, filename))
    network_stats = [filename] + gather_network_stats(G)
    # temperatures are looked up by the file name without its extension
    network_stats.append(get_temp(name))
    data.append(network_stats)
    if show:
        net = Network(notebook = True)
        net.from_nx(G)
        net.show("test.html")
        show = False

In [21]:
# Display how each network statistic in `data` correlates with temperature
get_correlation_coefficients(data)

nodes              0.357052
degrees            0.222279
average degrees   -0.146099
highest degree     0.421098
Cluster Coeff     -0.423891
connectivity       0.296302
MGD               -0.260248
triangles          0.058482
FFL                0.059989
FBL                0.060005
special nodes     -0.169698
Name: Temperature, dtype: float64

In [29]:
# Fit a linear model predicting temperature from structural statistics,
# then compare its prediction for one held-out network with the recorded value.
df = pd.DataFrame(data, columns = ['network', "nodes", "degrees", 'average degrees', 'highest degree', 'Cluster Coeff', 'connectivity', 'MGD', 'triangles', 'FFL', 'FBL', 'special nodes', 'Temperature'])
X = df[['nodes', 'degrees', 'average degrees', 'highest degree', 'Cluster Coeff', 'connectivity', 'MGD']]
y = df['Temperature']

TRAIN_ROWS = 28  # the first TRAIN_ROWS rows (positions 0..27) are used for fitting
TEST_ROW = 29    # position of the held-out network that gets predicted
# NOTE(review): position 28 is neither trained on nor predicted - confirm the gap is intended.

trainingX = X.iloc[0:TRAIN_ROWS]
trainingY = y.iloc[0:TRAIN_ROWS]
regr = linear_model.LinearRegression()
regr.fit(trainingX, trainingY)

# predict the temperature of the held-out network from its statistics;
# the double brackets keep a one-row DataFrame, so the column names match
# the ones the model was fitted with.
predictedTemp = regr.predict(X.iloc[[TEST_ROW]])

print(predictedTemp)
print(df.iloc[TEST_ROW])

[64.40785727]
network            FW_017_03.txt
nodes                        143
degrees                      857
average degrees        11.986014
highest degree               116
Cluster Coeff           0.174472
connectivity                   1
MGD                     2.009948
triangles                 1956.0
FFL                        12474
FBL                         4158
special nodes           0.062937
Temperature                 76.8
Name: 29, dtype: object
