# High school contact and friendship networks

### Group: Abdelmalek Hajjam, Monu Chacko


Data Source: http://www.sociopatterns.org/datasets/high-school-contact-and-friendship-networks/

This dataset corresponds to the contacts and friendship relations between students in a high school in Marseilles, France, in December 2013, as measured through several techniques.

It gives the contacts of the students of nine classes during 5 days in Dec. 2013, as measured by the SocioPatterns infrastructure. The file contains a tab-separated list representing the active contacts during 20-second intervals of the data collection. Each line has the form "t i j Ci Cj", where i and j are the anonymous IDs of the persons in contact, Ci and Cj are their classes, and t identifies the 20-second interval during which this contact was active. If multiple contacts are active in a given interval, you will see multiple lines starting with the same value of t. Time is measured in seconds.

In [1]:
import pandas as pd
import networkx as nx
import matplotlib.pylab as pyplot
import scipy.stats

import nxviz as nv
from pyvis import network as net
import seaborn as sns
%matplotlib inline

In [2]:
# Load the raw contact records. The file has no header row, so the five
# fields of each "t i j Ci Cj" line are named explicitly.
contact_columns = ['timestamp', 'node_from', 'node_to', 'classname_from', 'classname_to']
hight_school = pd.read_csv(
    'High-School_data_2013.csv',
    sep=' ',
    header=None,
    names=contact_columns,
)


In [3]:
# Build the contact graph from the contact table: one edge per
# (node_from, node_to) pair, carrying the 'timestamp' column as an edge attribute.
G = nx.from_pandas_edgelist(
    hight_school,
    source='node_from',
    target='node_to',
    edge_attr='timestamp',
)

In [4]:
# Report the size of the contact graph.
numNodes = G.number_of_nodes()
print("# Nodes: ", numNodes)
print("# Edges: ", G.number_of_edges())

# List every (node, degree centrality) pair. Being the last expression of the
# cell, the list is what the notebook renders as the cell output.
list(nx.degree_centrality(G).items())

# Nodes:  327
# Edges:  5818


[(454, 0.07975460122699386),
 (640, 0.08895705521472393),
 (1, 0.0705521472392638),
 (939, 0.11042944785276074),
 (185, 0.13190184049079756),
 (258, 0.10429447852760737),
 (55, 0.08588957055214724),
 (170, 0.10122699386503067),
 (9, 0.2116564417177914),
 (453, 0.08282208588957056),
 (45, 0.1901840490797546),
 (14, 0.11656441717791412),
 (190, 0.147239263803681),
 (400, 0.06134969325153374),
 (637, 0.07975460122699386),
 (255, 0.09202453987730061),
 (275, 0.20552147239263804),
 (176, 0.18711656441717792),
 (533, 0.06748466257668712),
 (116, 0.1196319018404908),
 (151, 0.1196319018404908),
 (866, 0.15644171779141106),
 (280, 0.08895705521472393),
 (484, 0.0736196319018405),
 (243, 0.11349693251533742),
 (687, 0.08895705521472393),
 (54, 0.12576687116564417),
 (364, 0.12576687116564417),
 (374, 0.11042944785276074),
 (295, 0.10429447852760737),
 (441, 0.11656441717791412),
 (101, 0.13803680981595093),
 (425, 0.09815950920245399),
 (47, 0.046012269938650305),
 (241, 0.10736196319018405),
 

In [5]:
# Same size summary as above, this time via the module-level helpers.
node_count = nx.number_of_nodes(G)
edge_count = nx.number_of_edges(G)
print(f"Number of Nodes: {node_count!s}")
print(f"Number of Edges: {edge_count!s}")

# Connectivity checks kept for reference (disabled):
#nx.is_connected(G)
#nx.connected_components(G)

Number of Nodes: 327
Number of Edges: 5818


Let us examine the density, the diameter, and the maximum degree centrality of the graph.

In [None]:
# Each statistic is computed and printed in turn, so an error in a later
# computation still leaves the earlier results on screen.
graph_density = nx.density(G)
print(f"Density: {graph_density!s}")

# NOTE(review): nx.diameter is documented to require a connected graph —
# confirm connectivity (see nx.is_connected) if this line raises.
graph_diameter = nx.diameter(G)
print(f"Diameter: {graph_diameter!s}")

# Largest degree-centrality value over all nodes.
print("Max degree_centrality: ", max(nx.degree_centrality(G).values()))

Density: 0.10915367441511416


In [None]:
# Print each node id followed by the number of distinct neighbours it has.
for node in G:
    neighbour_total = sum(1 for _ in G.neighbors(node))
    print(f"{node!s} {neighbour_total!s}")

Calculate the eigenvector centrality and degree centrality for each node

In [None]:
# Both calls return a dict keyed by node id; the two computations are
# independent of each other.

# Eigenvector centrality of every node.
eigen_centrality = nx.eigenvector_centrality(G)

# Degree centrality of every node.
deg_centrality = nx.degree_centrality(G)

In [None]:
# Turn the {node: degree centrality} dict into a two-column table
# ('degree_centrality', 'node') ordered by node id.
deg_centrality = (
    pd.DataFrame.from_dict(deg_centrality, orient='index')
    # the single value column is created under the label 0; give it a real name
    .rename(columns={0: 'degree_centrality'})
    # orient='index' put the node ids in the row index; copy them into a column
    .assign(node=lambda frame: frame.index)
    # replace the node-id index with plain row numbers
    .reset_index(drop=True)
    # order rows by node id (the row labels travel with their rows)
    .sort_values(by=['node'])
)

In [None]:
# Turn the {node: eigenvector centrality} dict into a two-column table
# ('eigenvector_centrality', 'node') ordered by node id.
eigen_centrality = (
    pd.DataFrame.from_dict(eigen_centrality, orient='index')
    # the single value column is created under the label 0; give it a real name
    .rename(columns={0: 'eigenvector_centrality'})
    # orient='index' put the node ids in the row index; copy them into a column
    .assign(node=lambda frame: frame.index)
    # replace the node-id index with plain row numbers
    .reset_index(drop=True)
    # order rows by node id (the row labels travel with their rows)
    .sort_values(by=['node'])
)

In [None]:
# Load the per-student metadata (node id, class name, gender). The file is
# tab-separated with no header row, so the columns are named here, and the
# rows are ordered by node id.
metadata_columns = ['node', 'classname', 'gender']
metadata_school = pd.read_csv(
    'HighSchoolMetadata.txt',
    sep='\t',
    header=None,
    names=metadata_columns,
).sort_values(by=['node'])

In [None]:
# Attach each student's class name and gender to both centrality tables.
#
# The lookup is keyed on the node id rather than assigned column-to-column:
# pandas aligns `frame[col] = other_series` on index labels, not on row
# position. The centrality tables carry row labels in graph insertion order,
# the metadata table carries file row numbers, and sort_values() leaves those
# labels untouched, so a plain assignment would pair nodes with the wrong rows.
# Assumes node ids are unique in the metadata (Series.map needs a unique index).
metadata_by_node = metadata_school.set_index('node')
for centrality_frame in (deg_centrality, eigen_centrality):
    node_ids = centrality_frame['node']
    # Nodes missing from the metadata get NaN in both columns.
    centrality_frame['classname'] = node_ids.map(metadata_by_node['classname'])
    centrality_frame['gender'] = node_ids.map(metadata_by_node['gender'])

In [None]:
# Split both centrality tables into one sub-table per gender label
# ('M', 'F', 'Unknown').
deg_gender = deg_centrality['gender']
eigen_gender = eigen_centrality['gender']

# degree centrality, by gender
male_deg = deg_centrality.loc[deg_gender == 'M']
female_deg = deg_centrality.loc[deg_gender == 'F']
unknown_deg = deg_centrality.loc[deg_gender == 'Unknown']

# eigenvector centrality, by gender
male_eigen = eigen_centrality.loc[eigen_gender == 'M']
female_eigen = eigen_centrality.loc[eigen_gender == 'F']
unknown_eigen = eigen_centrality.loc[eigen_gender == 'Unknown']

In [None]:
# 2x3 grid of histograms: top row is degree centrality (DC), bottom row is
# eigenvector centrality (EC); columns are male, female and unknown gender.
fig, axes = pyplot.subplots(nrows=2, ncols=3)
ax1, ax2, ax3, ax4, ax5, ax6 = axes.flatten()

histogram_panels = [
    (ax1, male_deg.degree_centrality, 'M DC'),
    (ax2, female_deg.degree_centrality, 'F DC'),
    (ax3, unknown_deg.degree_centrality, 'Un DC'),
    (ax4, male_eigen.eigenvector_centrality, 'M EC'),
    (ax5, female_eigen.eigenvector_centrality, 'F EC'),
    (ax6, unknown_eigen.eigenvector_centrality, 'Un EC'),
]
for panel_axis, panel_values, panel_title in histogram_panels:
    panel_axis.hist(panel_values)
    panel_axis.set_title(panel_title)

pyplot.show()


In [None]:
# Two-sample t-test comparing the degree centrality of male and female
# students; only the p-value (rounded to 4 decimals) is reported.
male_degree_values = male_deg.degree_centrality
female_degree_values = female_deg.degree_centrality
t_test_degree_centrality = scipy.stats.ttest_ind(male_degree_values, female_degree_values)

degree_p_value = round(t_test_degree_centrality.pvalue, 4)
print(f"t-test results for degree centrality - p-value: {degree_p_value!s}")

In [None]:
# Two-sample t-test comparing the eigenvector centrality of male and female
# students; only the p-value (rounded to 4 decimals) is reported.
male_eigen_values = male_eigen.eigenvector_centrality
female_eigen_values = female_eigen.eigenvector_centrality
t_test_eigenvector_centrality = scipy.stats.ttest_ind(male_eigen_values, female_eigen_values)

eigen_p_value = round(t_test_eigenvector_centrality.pvalue, 4)
print(f"t-test for eigenvector centrality {eigen_p_value!s}")