In [1]:
from loading import load_data_from_csv, filter_edgelist_and_nodes
from graphs import create_graph_with_attributes
from predictions import train_test_split, test_duplicator, predict_from_neighbors, ceating_dataframe_for_prediction, check_accuracy
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier 

In [2]:
# Load the node table and the edge list from the CSV files under data/.
# The node/edge counts recorded below this cell are presumably printed by the loader itself
# (nothing in this cell prints) -- TODO confirm in loading.py.
nodes, edgelist = load_data_from_csv(edgelist_path='data/edgelist.csv', node_path='data/node_data.csv')

Number of nodes in the graph: 434590
Number of edges in the graph: 3984828


In [3]:
# Create the train and test samples. `train_test_split` is the project helper imported from
# `predictions`, not scikit-learn's function of the same name.
# The test sample still contains the target variables; they are not used for building the graph.
# In order to create a graph with all the nodes, an "empty" version of the test set is also created
# (presumably `test_duplicator` blanks the 'age' and 'gender' columns -- confirm in predictions.py).
# NOTE(review): no random seed is passed to the split in this cell; whether the split is
# reproducible depends on the helper -- verify.
train, test = train_test_split(nodes)
empty_test = test_duplicator(test, target=['age', 'gender'])
train_test = pd.concat([train, empty_test]) # train nodes + emptied test nodes: node table for the full graph

In [4]:
# Filter out edges that are not in the train set, and also filter out nodes without edges.
train_filt, edgelist_filt = filter_edgelist_and_nodes(train, edgelist)
# Do the same for the train_test dataset, which also contains the test nodes without target info.
train_ext_filt, edgelist_ext_filt = filter_edgelist_and_nodes(train_test, edgelist)

In [5]:
# Create one graph per node dataframe: G from the train nodes, G2 from train plus the emptied test nodes.
# NOTE(review): G is built from the unfiltered `train` frame together with the filtered edge list,
# whereas G2 uses the filtered `train_ext_filt`; `train_filt` is not used in any visible cell.
# Confirm whether `train_filt` was intended here.
G = create_graph_with_attributes(train, edgelist_filt)
G2 = create_graph_with_attributes(train_ext_filt, edgelist_ext_filt)

## Prediction based on neighbors

In [6]:
# Benchmark: prediction based on neighbors, using G2 (the graph built from train plus the emptied test nodes).
# Long-running cell: the progress bar recorded below shows roughly 9.5 minutes for 43,459 items.
predicted_df = predict_from_neighbors(G2, train_ext_filt, test)

100%|██████████| 43459/43459 [09:38<00:00, 75.10it/s] 


In [7]:
# Evaluate the neighbor-based benchmark on gender: true test labels vs. predicted labels.
# assumes predicted_df rows are in the same order as test -- TODO confirm in predict_from_neighbors.
y_test = test.gender
y_pred = predicted_df.gender
check_accuracy(y_test, y_pred)

Confusion matrix:
[[ 9546 10951]
 [10618 12344]]

The accuracy score is 0.5036931360592742


## Prediction based on triads
For each node, the number of triangles and the average gender of the other two members are calculated; these are then used as features for the logistic regression and the decision tree.

Using only triads that have gender info for both other members.

In [8]:
# Create the feature dataframes for the triad-based models: one for fitting (from G and the
# train nodes) and one for prediction (from G2 and the test nodes).
# TODO: remove [0:10000] limit, run again on the whole dataset
# NOTE(review): the limit is not visible in this cell, so it presumably lives inside
# `ceating_dataframe_for_prediction`; the recorded progress bars show 10,000 items per call,
# taking roughly 9.5 and 12.75 minutes respectively.
# NOTE(review): the first call passes the unfiltered `train`, the second the filtered
# `train_ext_filt` -- confirm this asymmetry is intended.
regression_df = ceating_dataframe_for_prediction(G, train)
test_df = ceating_dataframe_for_prediction(G2, train_ext_filt, test)

100%|██████████| 10000/10000 [09:34<00:00, 17.40it/s]
100%|██████████| 10000/10000 [12:45<00:00, 13.07it/s]


In [9]:
# Create the train and test sets for the logistic regression and the decision tree.
# The triad feature columns are named once so that both design matrices stay in sync.
feature_cols = ['mm', 'mf', 'ff']
y_train, X_train = regression_df['gender'], regression_df[feature_cols]
X_test = test_df[feature_cols]
# Take as many true labels as there are feature rows instead of a hard-coded 10000, so this
# cell stays aligned with X_test once the temporary row limit upstream is removed.
# assumes test_df rows correspond, in order, to the leading rows of test -- TODO confirm.
y_test = test.gender.iloc[:len(X_test)]
# Fail loudly here rather than produce a misaligned evaluation further down.
assert len(y_test) == len(X_test), "test labels and test features must have the same number of rows"

In [10]:
# Fit both triad-based classifiers on the training features and predict gender for the test features.
# The decision tree permutes features randomly when searching for splits, so its seed is fixed
# to make the predictions reproducible across kernel restarts.
RANDOM_STATE = 42

regmodel = LogisticRegression()
regmodel.fit(X_train, y_train)
y_pred_reg = regmodel.predict(X_test)

clf = DecisionTreeClassifier(random_state=RANDOM_STATE)
clf.fit(X_train, y_train)
y_pred_clf = clf.predict(X_test)

In [11]:
# Evaluate the logistic regression predictions against the true test labels.
check_accuracy(y_test, y_pred_reg)

Confusion matrix:
[[1672 3017]
 [1023 4288]]

The accuracy score is 0.596


In [12]:
# Evaluate the decision tree predictions against the same true test labels.
check_accuracy(y_test, y_pred_clf)

Confusion matrix:
[[1768 2921]
 [1219 4092]]

The accuracy score is 0.586


## Summary
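Based on the accuracy scores, both triad-based models (logistic regression: 0.596, decision tree: 0.586) predict gender better than the neighbor-based benchmark (0.504). Note that the triad-based models were evaluated on only the first 10,000 test nodes, whereas the benchmark was evaluated on all 43,459 test nodes, so the comparison is indicative rather than exact.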