In [37]:
import ast
import pickle

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
# % matplotlib inline

---

## Company's Email Network

`node`: person at the company<br>
`edge`: at least one email has been sent between two people.

##### Node Attributes
`Department`: the department in the company which the person belongs to <br>
`ManagementSalary`: whether that person is receiving a management position salary.

In [31]:
# Load the company e-mail graph from a pickled networkx object.
# NOTE: unpickling can execute arbitrary code, so only load this file from a trusted source.
G = nx.read_gpickle('email_prediction.txt')


## Salary Prediction

Identify the people in the network whose node attribute `ManagementSalary` is missing, and predict whether or not each of these individuals is receiving a management position salary.<br>
##### Returns the probability that the corresponding employee is receiving a management position salary.



In [68]:
def salary_predictions(random_state=0):
    """Predict management-salary probabilities for nodes without a label.

    One row of structural features is built per node of the module-level
    graph ``G``. A random forest is trained on the nodes whose
    ``ManagementSalary`` attribute is present and then scores the nodes
    where it is missing.

    Parameters
    ----------
    random_state : int or None, default 0
        Seed forwarded to ``RandomForestClassifier`` so repeated runs return
        identical probabilities. Pass ``None`` for an unseeded forest.

    Returns
    -------
    pandas.Series
        Values from column 1 of ``predict_proba`` (the second entry of
        ``clf.classes_``), indexed by the unlabeled nodes.
    """
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.ensemble import RandomForestClassifier

    # DataFrame setup: every Series below is keyed by node and aligned on the index.
    df = pd.DataFrame(index=list(G.nodes()))
    df['ManagementSalary'] = pd.Series(nx.get_node_attributes(G, 'ManagementSalary'))
    df['Clustering'] = pd.Series(nx.clustering(G))
    df['Betweeness'] = pd.Series(nx.betweenness_centrality(G, normalized=True))
    # dict(...) accepts both a plain mapping and an iterable of (node, degree) pairs.
    df['Degree'] = pd.Series(dict(G.degree()))
    # Closeness is computed once and reused for both feature columns. The second
    # column used to be a separate call that relied on the default value of
    # `normalized` (presumably True in the networkx release this notebook
    # targets -- confirm), which yields the same values. Both columns are kept
    # so the feature set seen by the model is unchanged.
    closeness = pd.Series(nx.closeness_centrality(G, normalized=True))
    df['Closeness'] = closeness
    df['PR'] = pd.Series(nx.pagerank(G))
    df['degree_centrality'] = pd.Series(nx.degree_centrality(G))
    df['closeness_centrality'] = closeness

    # Split on label availability: labeled rows train, unlabeled rows are scored.
    features = ['Clustering', 'Betweeness', 'Degree',
                'Closeness', 'PR', 'degree_centrality', 'closeness_centrality']
    is_labeled = df['ManagementSalary'].notnull()
    X_train = df.loc[is_labeled, features]
    y_train = df.loc[is_labeled, 'ManagementSalary']
    X_test = df.loc[~is_labeled, features]

    # Nothing to score: return an empty result instead of failing inside the scaler.
    if X_test.empty:
        return pd.Series([], index=X_test.index, dtype=float)

    # The scaler is fitted on training rows only and then applied to the test rows.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = RandomForestClassifier(
        n_estimators=100, max_depth=10, random_state=random_state
    ).fit(X_train_scaled, y_train)
    predictions = clf.predict_proba(X_test_scaled)[:, 1].tolist()
    return pd.Series(predictions, X_test.index)

# Display the predicted probabilities for the nodes whose ManagementSalary is missing.
salary_predictions()

1       0.016228
2       0.930000
5       1.000000
8       0.309424
14      0.036547
18      0.117443
27      0.062402
30      0.817143
31      0.367361
34      0.060000
37      0.020000
40      0.134173
45      0.020851
54      0.323044
55      0.530241
60      0.211002
62      1.000000
65      0.960000
77      0.051165
79      0.095565
97      0.010000
101     0.000000
103     0.456991
108     0.034227
113     0.100000
122     0.010000
141     0.268445
142     1.000000
144     0.000323
145     0.484155
          ...   
913     0.000000
914     0.000000
915     0.000000
918     0.010541
923     0.000000
926     0.025154
931     0.000323
934     0.000000
939     0.000000
944     0.000000
945     0.000000
947     0.010357
950     0.024244
951     0.000000
953     0.000323
959     0.000000
962     0.000000
963     0.148022
968     0.000357
969     0.002157
974     0.003874
984     0.000000
987     0.011969
989     0.000357
991     0.000357
992     0.000000
994     0.000000
996     0.0000

## New Connections Prediction
Identify the edges in `future_connections` whose `Future Connection` value is missing, and predict whether or not each of these edges will become a future connection.
##### Returns the probability of the corresponding edge being a future connection.

In [63]:
# Load the candidate edges. Column 0 stores each node pair as a string such as
# "(6, 840)"; ast.literal_eval turns it into a tuple while only accepting Python
# literals, so a crafted CSV cell cannot execute code the way the built-in eval could.
future_connections = pd.read_csv('Future_Connections.csv', index_col=0, converters={0: ast.literal_eval})


Unnamed: 0,Future Connection,Preferential Attachment,Common Neighbours,Jaccard Coefficient,Resource Allocation,Adamic Adar,cn_soundarajan_hopcroft,ra_soundarajan_hopcroft
"(6, 840)",0.0,2070,9,0.07377,0.136721,2.110314,9,0.0
"(4, 197)",0.0,3552,2,0.015504,0.008437,0.363528,2,0.0
"(620, 979)",0.0,28,0,0.0,0.0,0.0,0,0.0
"(519, 872)",0.0,299,2,0.060606,0.039726,0.507553,2,0.0
"(382, 423)",0.0,205,0,0.0,0.0,0.0,0,0.0


In [67]:
def new_connections_predictions(random_state=0):
    """Predict the probability that each unlabeled candidate edge will form.

    Link-prediction scores are computed on the module-level graph ``G`` for
    every node pair in the index of the module-level ``future_connections``
    frame. A random forest is trained on the rows whose ``Future Connection``
    value is present and then scores the rows where it is missing.

    Side effects
    ------------
    * The feature columns are written into the global ``future_connections``
      frame (recomputed on every call).
    * Every node of ``G`` gets a ``'community'`` attribute copied from its
      ``'Department'`` attribute; the Soundarajan-Hopcroft scores read it.

    Parameters
    ----------
    random_state : int or None, default 0
        Seed forwarded to ``RandomForestClassifier`` so repeated runs return
        identical probabilities. Pass ``None`` for an unseeded forest.

    Returns
    -------
    pandas.Series
        Values from column 1 of ``predict_proba`` (the second entry of
        ``clf.classes_``), indexed by the unlabeled node pairs.
    """
    from sklearn.preprocessing import MinMaxScaler
    from sklearn.ensemble import RandomForestClassifier

    pairs = future_connections.index

    def pair_scores(index_fn):
        # Each networkx link-prediction function yields (u, v, score) triples
        # for the pairs it is given; keep only the score.
        return [score for _, _, score in index_fn(G, pairs)]

    # DataFrame setup
    future_connections['Preferential Attachment'] = pair_scores(nx.preferential_attachment)
    future_connections['Common Neighbours'] = [
        len(list(nx.common_neighbors(G, u, v))) for u, v in pairs
    ]
    future_connections['Jaccard Coefficient'] = pair_scores(nx.jaccard_coefficient)
    future_connections['Resource Allocation'] = pair_scores(nx.resource_allocation_index)
    future_connections['Adamic Adar'] = pair_scores(nx.adamic_adar_index)

    # Community assignment must happen before the two Soundarajan-Hopcroft
    # scores below, which read the 'community' node attribute.
    # NOTE(review): `G.node` is the node-attribute accessor used throughout this
    # notebook; newer networkx releases may spell it `G.nodes` -- confirm before upgrading.
    for node in G.nodes():
        G.node[node]['community'] = G.node[node]['Department']
    future_connections['cn_soundarajan_hopcroft'] = pair_scores(nx.cn_soundarajan_hopcroft)
    future_connections['ra_soundarajan_hopcroft'] = pair_scores(nx.ra_index_soundarajan_hopcroft)

    # Split on label availability: labeled rows train, unlabeled rows are scored.
    features = ['Preferential Attachment', 'Common Neighbours',
                'Jaccard Coefficient', 'Resource Allocation', 'Adamic Adar',
                'cn_soundarajan_hopcroft', 'ra_soundarajan_hopcroft']
    is_labeled = future_connections['Future Connection'].notnull()
    X_train = future_connections.loc[is_labeled, features]
    y_train = future_connections.loc[is_labeled, 'Future Connection']
    X_test = future_connections.loc[~is_labeled, features]

    # Nothing to score: return an empty result instead of failing inside the scaler.
    if X_test.empty:
        return pd.Series([], index=X_test.index, dtype=float)

    # The scaler is fitted on training rows only and then applied to the test rows.
    scaler = MinMaxScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf = RandomForestClassifier(
        n_estimators=100, max_depth=10, random_state=random_state
    ).fit(X_train_scaled, y_train)
    predictions = clf.predict_proba(X_test_scaled)[:, 1].tolist()
    return pd.Series(predictions, X_test.index)

# Display the predicted probabilities for the node pairs whose 'Future Connection' is missing.
new_connections_predictions()

(107, 348)    0.027101
(542, 751)    0.012506
(20, 426)     0.577256
(50, 989)     0.012506
(942, 986)    0.012840
(324, 857)    0.012593
(13, 710)     0.131673
(19, 271)     0.147741
(319, 878)    0.012456
(659, 707)    0.012533
(49, 843)     0.012605
(208, 893)    0.012392
(377, 469)    0.014032
(405, 999)    0.016320
(129, 740)    0.012518
(292, 618)    0.018848
(239, 689)    0.012371
(359, 373)    0.016191
(53, 523)     0.419443
(276, 984)    0.012333
(202, 997)    0.012554
(604, 619)    0.032482
(270, 911)    0.012456
(261, 481)    0.060907
(200, 450)    0.987796
(213, 634)    0.012474
(644, 735)    0.032666
(346, 553)    0.012414
(521, 738)    0.011816
(422, 953)    0.014594
                ...   
(672, 848)    0.012456
(28, 127)     0.990988
(202, 661)    0.012414
(54, 195)     0.999994
(295, 864)    0.012523
(814, 936)    0.012419
(839, 874)    0.012840
(139, 843)    0.012593
(461, 544)    0.013818
(68, 487)     0.013216
(622, 932)    0.012615
(504, 936)    0.017355
(479, 528) 