In [6]:

#The data set can be downloaded from the following link:
# https://drive.google.com/file/d/0B5ok9oCZslVoYXVDM1hUakhRWWM/view?usp=sharing

# Please download the data and unzip into the curret=nt directory.

#Please look at the following link for the explantion of the dataset.
# https://raw.githubusercontent.com/machadob/DATA_620_Web_Analytics/master/DATA_620_Assignment_04

# We will be scrubbing the dowmloaded datasets, using python, to get the nodes, edges and the attributes.
# The way the files are formatted is very raw and we will have to process, restructure the data and clean it before we can load it into a graph network.
# We will also need to truncate the data if it seems too large to fit into memory.
#
# We have many categorical variables in this data set.
# The hypothetical outcome is a prediction that students who have marketing as their major will have a higher degree of centrality than students with other majors.
# This is based on  the observation that people in marketing tend to be more adept at making new contacts and also tend to become leaders.
# We will check if this hypothesis is true or not.

# We will first filter out the people in Marketing and Engineering and treat them as two seperate networks.
#Marketing has a attribute of -13 and Engineering is -11
# The file direct_social_structure.txt in the dataset has the node to node connections. Each line in that
# file has two entries specifying the nodes that are connected. The file is 8GB in size and has 462994069 entries.
# We will truncate the file and use the first 10000 entries for our analysis.

from sets import Set
import networkx as nx
import io
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd

#Download and unzip the dataset to the baseDir. Set the baseDir according to your machine.
baseDir = '/Users/burton/000-Semester_06_CUNY/620_Web_Analytics/Week_04/imc12/'

#We will be creating intermidiate files to clean and process the datasets.

# The function below will truncate the inFile to numberOfLines and write it to outFile
def truncateFile(inFile, outFile,  numberOfLines):
    fOutFile = open(outFile,'w')
    lineCount = 0
    with open(inFile) as fInFile:
        for line in fInFile:
            fOutFile.write(line)
            lineCount = lineCount + 1
            if(lineCount >= numberOfLines):
                break
    fOutFile.close() # you can omit in most cases as the destructor will call it

inFile = baseDir + 'direct_social_structure.txt'
outfile = baseDir + 'relations.txt'
truncateFile(inFile, outfile, 10000)

# We will now extract the ids for the 'major' attribute for all nodes.
attrMajor = []
with open(baseDir + 'attri_type.txt') as fInFile:
    for line in fInFile:
        if(line.__contains__('major')):
            tokens = line.split(' ')
            attrMajor.append(tokens[0])
            # print(tokens[0])
# print(attrMajor)
print("attrMajor size : " +  str(attrMajor.__len__()))



attrMajor size : 270970


In [7]:
# Grab all the ids for 'marketing' majors from the attribute dictionary using the previously saved ids for the majors.
marketing_nodes = []
MAX_NODES = 10
nodeCount = 0
with open(baseDir + 'attri_dic.txt') as fInFile:
    for line in fInFile:
        tokens = line.split(' ')
        if tokens[0] in attrMajor:
            # print(line)
            if(tokens[1].lower().__contains__("marketing")):
                print(line)
                print(tokens[0])
                marketing_nodes.append(tokens[0])
                nodeCount = nodeCount + 1
                if(nodeCount >= MAX_NODES):
                    break
print(marketing_nodes)


-13 "marketing"

-13
-437 "marketing and information systems"

-437
-1450 "marketing management"

-1450
-2814 "marketing & finance"

-2814
-4017 "marketing / internet development"

-4017
-4860 "marketing and technological innovation"

-4860
-6247 "marketing, services and communication management"

-6247
-7503 "marketing and management"

-7503
-7569 "marketing / entertainment, media & technology"

-7569
-7659 "marketing / systems"

-7659
['-13', '-437', '-1450', '-2814', '-4017', '-4860', '-6247', '-7503', '-7569', '-7659']


In [8]:
# Grab all the ids for 'engineering' majors from the attribute dictionary using the previously saved ids for the majors.
engineering_nodes = []
nodeCount = 0
with open(baseDir + 'attri_dic.txt') as fInFile:
    for line in fInFile:
        tokens = line.split(' ')
        if tokens[0] in attrMajor:
            # print(line)
            if(tokens[1].lower().__contains__("engineering")):
                print(line)
                # print(tokens[0])
                engineering_nodes.append(tokens[0])
                nodeCount = nodeCount + 1
                if(nodeCount >= MAX_NODES):
                    break
print(engineering_nodes)



-11 "engineering"

-3675 "engineering and industrial technology"

-3755 "engineering, computer science"

-3838 "engineering management"

-4688 "engineering-graduation"

-6391 "bioengineering"

-6599 "engineering science"

-6732 "bioengineering: biotechnology"

-7238 "engineering physics"

-7454 "engineering, management, public policy"

['-11', '-3675', '-3755', '-3838', '-4688', '-6391', '-6599', '-6732', '-7238', '-7454']


In [9]:
# Grab all the ids for 'marketing' majors from the attribute dictionary using the previously saved ids for the majors.
marketing_nodes = []
MAX_NODES = 10
nodeCount = 0
with open(baseDir + 'attri_dic.txt') as fInFile:
    for line in fInFile:
        tokens = line.split(' ')
        if tokens[0] in attrMajor:
            # print(line)
            if(tokens[1].lower().__contains__("marketing")):
                print(line)
                print(tokens[0])
                marketing_nodes.append(tokens[0])
                nodeCount = nodeCount + 1
                if(nodeCount >= MAX_NODES):
                    break
print(marketing_nodes)


-13 "marketing"

-13
-437 "marketing and information systems"

-437
-1450 "marketing management"

-1450
-2814 "marketing & finance"

-2814
-4017 "marketing / internet development"

-4017
-4860 "marketing and technological innovation"

-4860
-6247 "marketing, services and communication management"

-6247
-7503 "marketing and management"

-7503
-7569 "marketing / entertainment, media & technology"

-7569
-7659 "marketing / systems"

-7659
['-13', '-437', '-1450', '-2814', '-4017', '-4860', '-6247', '-7503', '-7569', '-7659']


In [10]:
#Grab all the marketing nodes from the node-attributes (node_attri.txt) mapping file and marketing_nodes previosly created
marketing_node_ids = []
nodeCount = 0
with open(baseDir + 'node_attri.txt') as fInFile:
    for line in fInFile:
        if(nodeCount >= 100):
            break
        tokens = line.split(' ')
        if tokens[1] in marketing_nodes:
            marketing_node_ids.append(tokens[0])
            nodeCount = nodeCount + 1

            # print(tokens[0])
            # print(line)
unique_marketing_node_ids = Set(marketing_node_ids)
print(len(unique_marketing_node_ids))


100


In [11]:
#Grab all the engineering nodes from the node-attributes (node_attri.txt) mapping file and engineering_nodes previosly created
engineering_node_ids = []
nodeCount = 0
with open(baseDir + 'node_attri.txt') as fInFile:
    for line in fInFile:
        if(nodeCount >= 100):
            break
        tokens = line.split(' ')
        if tokens[1] in engineering_nodes:
            engineering_node_ids.append(tokens[0])
            nodeCount = nodeCount + 1
            # print(tokens[0])
unique_engineering_node_ids = Set(engineering_node_ids)
print(len(unique_engineering_node_ids))


100


In [None]:
# Write out the filtered marketing nodes to the file filtered_marketing_network.txt
# and the filtered engineering nodes to the file filtered_engineering_network.txt
fOutMarketing = open(baseDir + 'filtered_marketing_network.txt' ,'w')
fOutEngineering = open(baseDir + 'filtered_engineering_network.txt' ,'w')
with open(baseDir + 'direct_social_structure.txt') as fInFile:
    for line in fInFile:
        tokens = line.split(' ')
        if tokens[0] in unique_marketing_node_ids:
            fOutMarketing.write(line)
        if tokens[0] in unique_engineering_node_ids:
            fOutEngineering.write(line)
fOutMarketing.close()
fOutEngineering.close()




In [13]:
# We will load a network graphs of Marketing and Engineering people
marketing_graph = nx.read_weighted_edgelist(baseDir + 'filtered_marketing_network.txt', delimiter=' ', create_using=nx.DiGraph())
engineering_graph = nx.read_weighted_edgelist(baseDir + 'filtered_engineering_network.txt', delimiter=' ', create_using=nx.DiGraph())

degreeCentrality_Marketing = nx.degree_centrality(marketing_graph)
degreeCentrality_Engineering = nx.degree_centrality(engineering_graph)
print(np.mean(degreeCentrality_Marketing.values()))
print(np.mean(degreeCentrality_Engineering.values()))
count_marketing = len(degreeCentrality_Marketing.values())
count_engineering = len(degreeCentrality_Engineering.values())



0.000316701134323
0.00035656635195


In [14]:
#We will see if the differences in the mean of the degree centrality is significant.
# Null hypothesis (H0): There is no significant differences in the means of Marketing and Engineering people.

# Null hypothesis (HA): There is a significant difference in the means of Marketing and Engineering people.

mean_DegreeCentrality_Marketing = np.mean(degreeCentrality_Marketing.values())
mean_DegreeCentrality_Engineering = np.mean(degreeCentrality_Engineering.values())
mean_difference = mean_DegreeCentrality_Marketing - mean_DegreeCentrality_Engineering

variance_DegreeCentrality_Marketing = np.var(degreeCentrality_Marketing.values())
variance_DegreeCentrality_Engineering = np.var(degreeCentrality_Engineering.values())

#We will take the average of both the variances for the t test
average_variance  = (variance_DegreeCentrality_Marketing + variance_DegreeCentrality_Engineering)/2
print(average_variance)
# sample_size =
standard_error = (2*average_variance/len(degreeCentrality_Marketing.values()))**.5
#
t = ((mean_DegreeCentrality_Marketing - mean_DegreeCentrality_Engineering) - 0)/standard_error

# The t value comes out to -1.2 with degrees of freedom greater than 5000 (essentially infinite)
# The p value is 0.2 which is much greater than 0.05, so we do not reject the null hypothesis

eigenvectorCentrality_Marketing = nx.eigenvector_centrality(marketing_graph)
eigenvectorCentrality_Engineering = nx.eigenvector_centrality(engineering_graph)
print(np.mean(eigenvectorCentrality_Marketing.values()))
print(np.mean(eigenvectorCentrality_Engineering.values()))

#There is no differeneces between the eigenvector centrality of the two groups, so we do not need a t test to proove that.

# In conclusion, we did not find any significant differences between the Marketing and Engineering groups in terms of Degree Centrality and
# Eigenvector centrality



4.06589430309e-06
0.0
0.0
