# Analysing predictions on a network with its metrics and node atributes

In [2]:
import networkx as nx
import pandas as pd
import numpy as np
import pickle

---

## Data: Company Emails

Data is a company's email network where each node corresponds to a person at the company, and each edge indicates that at least one email has been sent between two people.

The network also contains the node attributes Department and ManagmentSalary.

In [3]:
G = pickle.load(open('assets/email_prediction_NEW.txt', 'rb'))

print(f"Graph with {len(nx.nodes(G))} nodes and {len(nx.edges(G))} edges")

Graph with 1005 nodes and 16706 edges


### Salary Prediction

The next code predicts salary of employees wtih nan data.


Score used will be AUC

In [15]:
list(G.nodes(data=True))[:5] # print the first 5 nodes

[(0, {'Department': 1, 'ManagementSalary': 0.0}),
 (1, {'Department': 1, 'ManagementSalary': nan}),
 (581, {'Department': 3, 'ManagementSalary': 0.0}),
 (6, {'Department': 25, 'ManagementSalary': 1.0}),
 (65, {'Department': 4, 'ManagementSalary': nan})]

In [95]:


def salary_predictions():
    from sklearn.preprocessing import StandardScaler
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score
    from sklearn.model_selection import GridSearchCV

# Define the parameter grid
    param_grid = {
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': ['sqrt', 'log2', None]
}
    df = pd.DataFrame(index=G.nodes)
    df['Department'] = pd.Series(nx.get_node_attributes(G, 'Department'))
    df['ManagementSalary'] = pd.Series(nx.get_node_attributes(G, 'ManagementSalary'))
    scaler = StandardScaler()
    df["DepartmentNorm"] = scaler.fit_transform(df['Department'].values.reshape(-1, 1)).flatten()
    df['degree'] =pd.Series(nx.degree_centrality(G))
    df['closeness'] =pd.Series(nx.closeness_centrality(G))
    
    # YOUR CODE HERE
    df_nan=df[df["ManagementSalary"].isnull()]
    df_train=df[df["ManagementSalary"].notnull()]
    #engagement_model(df_train, "DepartmentNorm", "ManagementSalary")
    X=np.stack((df_train["DepartmentNorm"].values, df_train['degree'].values, df_train["closeness"].values), axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X, df_train["ManagementSalary"].values)
    #clf = RandomForestClassifier()
    clf = RandomForestClassifier().fit(X_train, y_train)
    
    X_nan=np.stack((df_nan["DepartmentNorm"].values, df_nan['degree'].values, df_nan["closeness"].values), axis=1)
    df_nan["ManagementSalary"]=clf.predict_proba(X_nan)[:,1]
#     grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='roc_auc', cv=3)
#     grid_search.fit(X, df_train["ManagementSalary"].values)
#     #'max_depth': 3, 'max_features': 'log2', 'min_samples_leaf': 5, 'min_samples_split': 2}

#     # Best parameters and score
#     print("Best Parameters:", grid_search.best_params_)
#     print("Best Cross-Validated Score:", grid_search.best_score_)
    
    return df_nan["ManagementSalary"]#roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
salary_predictions()

1      0.01
65     0.73
18     0.15
215    0.45
283    1.00
       ... 
691    0.00
788    0.13
944    0.00
798    0.00
808    0.00
Name: ManagementSalary, Length: 252, dtype: float64

###  New Connections Prediction

Next code will predict future connections between employees of the network. The future connections information has been loaded into the variable `future_connections`. The index is a tuple indicating a pair of nodes that currently do not have a connection, and the `Future Connection` column indicates if an edge between those two nodes will exist in the future, where a value of 1.0 indicates a future connection.

In [5]:
future_connections = pd.read_csv('assets/Future_Connections.csv', index_col=0, converters={0: eval})
future_connections.head(10)

Unnamed: 0,Future Connection
"(6, 840)",0.0
"(4, 197)",0.0
"(620, 979)",0.0
"(519, 872)",0.0
"(382, 423)",0.0
"(97, 226)",1.0
"(349, 905)",0.0
"(429, 860)",0.0
"(309, 989)",0.0
"(468, 880)",0.0


In [43]:
def stack(df, features):
    return tuple(df[feature].values for feature in features)
def new_connections_predictions():
    from sklearn.preprocessing import StandardScaler, LabelEncoder
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import roc_auc_score
    df=future_connections
    #df["common_n"]=
    
    hopcroft=nx.ra_index_soundarajan_hopcroft(G, community="Department")
    #hopcroft=nx.resource_allocation_index(G)
    arr_hopcroft=np.array(list(hopcroft))
    idx=[(int(a[0]),int(a[1])) for a in arr_hopcroft]
    hop=arr_hopcroft[:,2]
    df["hopcroft"]=pd.Series(hop, index=idx)
    
    #close_n=[(e[0],e[1],len(list(nx.common_neighbors(G, e[0], e[1])))) for e in nx.non_edges(G)]
    close_n=list(nx.cn_soundarajan_hopcroft(G, community="Department"))
    arr_cn=np.array(close_n)
    idx=[(int(a[0]),int(a[1])) for a in arr_cn]
    cn=arr_cn[:,2]
    df["cn"]=pd.Series(cn, index=idx)
    
    fc="Future Connection"
    df_nan=df[df[fc].isnull()]
    df_train=df[df[fc].notnull()]
    # YOUR CODE HERE
    X=np.stack(stack(df_train, ["hopcroft", "cn"]), axis=1)
    y=df_train["Future Connection"].values
    
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    #clf = RandomForestClassifier()
    clf = RandomForestClassifier().fit(X_train, y_train)
    
    X_nan=np.stack(stack(df_nan, ["hopcroft", "cn"]), axis=1)
    df_nan["Future Connection"]=clf.predict_proba(X_nan)[:,1]
    
    return df_nan["Future Connection"]#roc_auc_score(y_test, clf.predict_proba(X_test)[:,1])
new_connections_predictions()

(107, 348)    0.026732
(542, 751)    0.012729
(20, 426)     0.610093
(50, 989)     0.012729
(942, 986)    0.012729
                ...   
(165, 923)    0.012729
(673, 755)    0.012729
(939, 940)    0.012729
(555, 905)    0.012729
(75, 101)     0.016281
Name: Future Connection, Length: 122112, dtype: float64

In [44]:
ans_prob_preds = new_connections_predictions()
assert type(ans_prob_preds) == pd.core.series.Series, "You must return a Pandas series"
assert len(ans_prob_preds) == 122112, "The series must be of length 122112"
