In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score, roc_curve, \
    roc_auc_score, confusion_matrix, ConfusionMatrixDisplay, balanced_accuracy_score
import pandas as pd
import networkx as nx
from tqdm import tqdm
import multiprocessing as mp

In [None]:
# Load the preprocessed inputs: `friends` (its `user1` column later defines the
# user universe) and `links_train` (user1/user2 pairs with an `is_friends` label).
friends = pd.read_csv('processed/friends.csv')
links_train = pd.read_csv('processed/links_train.csv')

In [None]:
# `friends` and `links_train` are already loaded by the first data cell, so the
# CSVs are not read from disk a second time here.

# Only pairs labelled as friends become edges of the graph.
# NOTE(review): the graph features for training pairs are later computed on this
# same graph, i.e. positive training pairs are themselves edges of G — confirm
# this is acceptable for the model.
filtered_data = links_train[links_train['is_friends'] == 1]

# Undirected friendship graph; it is read as a global by the feature functions.
G = nx.Graph()
G.add_edges_from(filtered_data[['user1', 'user2']].values)

In [None]:
# NOTE(review): this repeats the graph construction from the previous cell
# verbatim (same `filtered_data`, same edges); it yields an identical `G`.
G = nx.Graph()

G.add_edges_from(filtered_data[['user1', 'user2']].values)

In [None]:
def common_neighbors(user1, user2):
    """Count the neighbours that `user1` and `user2` share in the global graph `G`.

    Returns 0 when either user is not a node of `G`. Membership is checked
    explicitly instead of relying on the exception NetworkX raises for an
    unknown node, which differs between functions and library versions.
    """
    if user1 not in G or user2 not in G:
        return 0
    # Works whether common_neighbors yields an iterator or a set-like view.
    return sum(1 for _ in nx.common_neighbors(G, user1, user2))


def resource_allocation_index(user1, user2):
    """Return the resource-allocation index of the pair in the global graph `G`.

    Returns 0 when either user is not a node of `G`. Membership is checked
    explicitly instead of relying on the exception NetworkX raises for an
    unknown node, which differs between functions and library versions.
    """
    if user1 not in G or user2 not in G:
        return 0
    # The generator yields one (u, v, score) triple per requested pair.
    _, _, score = next(nx.resource_allocation_index(G, [(user1, user2)]))
    return score


def jaccard_coefficient(user1, user2):
    """Return the Jaccard coefficient of the pair's neighbourhoods in the global graph `G`.

    Returns 0 when either user is not a node of `G`. Membership is checked
    explicitly (consistently with the sibling feature functions) instead of
    catching `KeyError`, because the exception raised for an unknown node
    differs between NetworkX versions.
    """
    if user1 not in G or user2 not in G:
        return 0
    # The generator yields one (u, v, score) triple per requested pair.
    _, _, score = next(nx.jaccard_coefficient(G, [(user1, user2)]))
    return score


def adamic_adar_index(user1, user2):
    """Return the Adamic-Adar index of the pair in the global graph `G`.

    Returns 0 when either user is not a node of `G`. Membership is checked
    explicitly instead of relying on the exception NetworkX raises for an
    unknown node, which differs between functions and library versions.
    """
    if user1 not in G or user2 not in G:
        return 0
    # The generator yields one (u, v, score) triple per requested pair.
    _, _, score = next(nx.adamic_adar_index(G, [(user1, user2)]))
    return score


def preferential_attachment(user1, user2):
    """Return the preferential-attachment score of the pair in the global graph `G`.

    Returns 0 when either user is not a node of `G`. Membership is checked
    explicitly instead of relying on the exception NetworkX raises for an
    unknown node, which differs between functions and library versions.
    """
    if user1 not in G or user2 not in G:
        return 0
    # The generator yields one (u, v, score) triple per requested pair.
    _, _, score = next(nx.preferential_attachment(G, [(user1, user2)]))
    return score


def has_path(user1, user2):
    """Return True when `user1` and `user2` are connected in the global graph `G`.

    The previous implementation only checked that both users were nodes of `G`,
    which is true for any two users with at least one friend each, connected or
    not. Users missing from `G` are reported as not connected.

    Note: this runs a graph search per pair, so it costs more than the plain
    membership test it replaces.
    """
    if user1 not in G or user2 not in G:
        return False
    return nx.has_path(G, user1, user2)

## Other features

In [None]:
# Per-user profile attributes and posts from the preprocessed CSVs.
profiles = pd.read_csv('processed/profiles.csv')
posts = pd.read_csv('processed/posts.csv')

In [None]:
import ast


def parse_list_string(list_string):
    """Parse the textual representation of a Python list, e.g. ``"[1, 2]"``.

    Returns the parsed collection, or an empty list when the input cannot be
    parsed (malformed text, NaN, None, ...) or does not describe a list-like
    literal. Only the errors `ast.literal_eval` is documented to raise are
    caught, so unrelated failures such as KeyboardInterrupt still propagate.
    """
    try:
        parsed = ast.literal_eval(list_string)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    # A scalar literal such as "42" is not a list; callers iterate over the result.
    return parsed if isinstance(parsed, (list, tuple, set)) else []

# Scalar attributes are carried over unchanged.
profiles_clean = profiles[['id', 'city', 'sex']].copy()

cols = ['schools', 'universities', 'faculties']

# List-valued attributes are stored as text: parse each one and drop None entries.
for col in cols:
    parsed = profiles[col].apply(parse_list_string)
    profiles_clean[col] = parsed.map(lambda arr: [el for el in arr if el is not None])

In [None]:
# Replace every remaining NaN in profiles_clean with the literal 2.
# NOTE(review): the same sentinel is applied to all columns (e.g. `city` and
# `sex`), where 2 may collide with a real code — confirm this is intended.
profiles_clean = profiles_clean.fillna(2)

In [None]:
# Sanity check: per-column count of missing values after the fill above.
profiles_clean.isnull().sum()

In [None]:
# Replace missing values in `posts` with 0.
# NOTE(review): `posts` is not referenced again in the visible cells.
posts = posts.fillna(0)

In [None]:
def _as_set(values):
    """Convert a collection to a set; None or a float (e.g. NaN) counts as empty."""
    if values is None or isinstance(values, float):
        return set()
    return set(values)


def shared_elements(list1, list2):
    """Return how many distinct elements `list1` and `list2` have in common.

    A missing value (None or NaN, as produced by a left merge for a user with
    no profile row) is treated as an empty collection instead of raising
    TypeError.
    """
    return len(_as_set(list1) & _as_set(list2))

In [None]:
# Enable tqdm's pandas integration.
from tqdm import tqdm
tqdm.pandas()
# pandarallel provides the `parallel_apply` used when building profile features.
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

In [None]:
def sex_relation(sex1, sex2):
    """Encode the ordered pair of sex codes as a category in 0..4.

    (1, 1) -> 0, (1, 2) -> 1, (2, 1) -> 2, (2, 2) -> 3; any other
    combination -> 4.
    """
    if sex1 == 1:
        if sex2 == 1:
            return 0
        if sex2 == 2:
            return 1
    elif sex1 == 2:
        if sex2 == 1:
            return 2
        if sex2 == 2:
            return 3
    return 4

In [None]:
# Build profile-based pair features for every training pair.
links_train_features = links_train.copy()

# Attach each user's profile. The first merge adds un-suffixed profile columns;
# the second merge collides with them, so they end up as *_user1 / *_user2.
links_train_features = links_train_features.merge(profiles_clean, left_on='user1', right_on='id', how='left')
links_train_features = links_train_features.merge(profiles_clean, left_on='user2', right_on='id', suffixes=('_user1', '_user2'), how='left')

# Number of schools / universities / faculties the two users have in common.
links_train_features['shared_schools'] = links_train_features.parallel_apply(lambda row: shared_elements(row['schools_user1'], row['schools_user2']), axis=1)
links_train_features['shared_universities'] = links_train_features.parallel_apply(lambda row: shared_elements(row['universities_user1'], row['universities_user2']), axis=1)
links_train_features['shared_faculties'] = links_train_features.parallel_apply(lambda row: shared_elements(row['faculties_user1'], row['faculties_user2']), axis=1)
# Despite its name, `same_sex` stores the 0..4 category returned by sex_relation.
links_train_features['same_sex'] = links_train_features.parallel_apply(lambda row: sex_relation(row['sex_user1'], row['sex_user2']), axis=1).astype(int)
# 1 when both users have equal `city` values, else 0.
links_train_features['same_city'] = (links_train_features['city_user1'] == links_train_features['city_user2']).astype(int)

In [None]:
# Keep only the pair keys and the derived features (note: `cols` is reused here
# with a different meaning than in the profile-cleaning cell).
cols = ['user1', 'user2', 'shared_schools', 'shared_universities', 'shared_faculties', 'same_city', 'same_sex']

links_train_features = links_train_features[cols]

In [None]:
# Preview the first rows of the profile-based features.
links_train_features.head()

In [None]:
def calculate_similarity(user_pair):
    """Compute all graph features for one ``(user1, user2)`` pair.

    Returns a flat tuple: the two user ids followed by the outputs of
    common_neighbors, resource_allocation_index, jaccard_coefficient,
    adamic_adar_index, preferential_attachment and has_path, in that order.
    """
    first, second = user_pair
    scorers = (
        common_neighbors,
        resource_allocation_index,
        jaccard_coefficient,
        adamic_adar_index,
        preferential_attachment,
        has_path,
    )
    return (first, second, *(score(first, second) for score in scorers))

In [None]:
from itertools import combinations

# Kept because it was defined here originally; not used in the visible cells.
records = []
# Distinct user ids taken from friends.user1, in ascending order.
users = sorted(set(friends.user1.tolist()))
# Every unordered pair exactly once, as (smaller id, larger id). combinations()
# yields the same pairs in the same order as the previous nested enumerate loop
# without visiting and discarding the other half of the n x n grid.
user_pairs = list(combinations(users, 2))

In [None]:
# Use half of the cores, but never fewer than one process: on a single-core
# machine cpu_count() // 2 is 0 and Pool() raises ValueError.
n_workers = max(1, mp.cpu_count() // 2)
# Roughly 50 chunks in total; with fewer than 50 pairs the integer division
# gives 0, which imap_unordered rejects, so clamp to 1.
chunksize = max(1, len(user_pairs) // 50)

with mp.Pool(n_workers) as pool:
    results = list(
        tqdm(
            pool.imap_unordered(
                calculate_similarity,
                user_pairs,
                chunksize=chunksize
            ),
            total=len(user_pairs)
        )
    )

# Collect the results (row order is arbitrary because of imap_unordered)
graph_features = pd.DataFrame(results, columns=['user1', 'user2', 'common_neighbors', 'resource_allocation', 'jaccard_coefficient', 'adamic_adar', 'preferential_attachment', 'has_path'])

In [None]:
# Only the graph features are used; the merge with the profile-based features
# below is currently disabled.
# df = pd.merge(graph_features, links_train_features,  how='inner', on=['user1','user2'])
df = graph_features

In [None]:
# Inner join: training pairs without a matching row in `df` are dropped.
# NOTE(review): `df` only holds pairs built from sorted `friends.user1` ids with
# the smaller id first, so pairs in the other order, or involving other users,
# will not match — confirm the coverage is as expected.
train = pd.merge(links_train, df,  how='inner', on=['user1', 'user2'])

In [None]:
# Name of the label column.
target = 'is_friends'

In [None]:
# Everything except the label is used as model input.
# NOTE(review): this keeps the `user1` / `user2` id columns in X as features —
# confirm that is intended.
X = train.drop(columns=[target])
y = train[target]

In [None]:
from sklearn.model_selection import train_test_split
 
# Stratified 80/20 split. The fixed random_state makes the split, and therefore
# every metric reported below, reproducible across kernel restarts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Class balance of the training split.
y_train.value_counts()

In [None]:
from sklearn.utils.class_weight import compute_class_weight
from catboost import CatBoostClassifier, Pool

# "Balanced" weights: inversely proportional to the class frequencies in y_train.
classes = np.unique(y_train)
weights = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
# Plain Python scalars (rather than NumPy scalars) as keys and values.
class_weights = dict(zip(classes.tolist(), weights.tolist()))

model = CatBoostClassifier(
    iterations=1000,
    depth=4,
    learning_rate=0.001,
    l2_leaf_reg=0.1,
    task_type="GPU",
    devices="0",
    # Previously the weights were computed but never handed to the model.
    class_weights=class_weights,
)

# use_best_model keeps the iteration that scores best on the validation set.
model.fit(
    X_train, y_train, eval_set=(X_val, y_val), use_best_model=True,
    plot=True, verbose=False
)

## Check

In [None]:
# Class probabilities for both splits, paired with their ground-truth labels.
preds_train, preds_val = (model.predict_proba(part) for part in (X_train, X_val))
gt_train, gt_val = y_train, y_val

In [None]:
# Balanced accuracy on the training split (most probable class per row).
balanced_accuracy_score(y_true=gt_train, y_pred=np.argmax(preds_train, axis=1))

In [None]:
# Balanced accuracy on the validation split (most probable class per row).
balanced_accuracy_score(y_true=gt_val, y_pred=np.argmax(preds_val, axis=1))

In [None]:
# params = dict(average='weighted')
params = dict()

# Hard labels (most probable class) are shared by every threshold-based metric.
labels_train = preds_train.argmax(axis=1)
labels_val = preds_val.argmax(axis=1)

print(f'prec train: {precision_score(gt_train, labels_train, **params):.3f} valid: {precision_score(gt_val, labels_val, **params):.3f}')
print(f'recall train: {recall_score(gt_train, labels_train, **params):.3f} valid: {recall_score(gt_val, labels_val, **params):.3f}')
print(f'f1 train: {f1_score(gt_train, labels_train, **params):.3f} valid: {f1_score(gt_val, labels_val, **params):.3f}')
# ROC AUC is computed from the probability of the positive class, not from labels.
print(f'roc auc train: {roc_auc_score(gt_train, preds_train[:, 1], **params):.3f} valid: {roc_auc_score(gt_val, preds_val[:, 1], **params):.3f}')

print(f'\nbalanced acc train: {balanced_accuracy_score(gt_train, labels_train):.3f} valid: {balanced_accuracy_score(gt_val, labels_val):.3f}')

In [None]:
# ROC curve on the training split, with the diagonal as the chance baseline.
# Labels and a title are added so the figure can be read on its own.
fpr, tpr, tr = roc_curve(y_train, preds_train[:, 1])
_ = plt.plot(fpr, tpr, label='model')
_ = plt.plot((0, 1), (0, 1), label='chance')
_ = plt.xlabel('False positive rate')
_ = plt.ylabel('True positive rate')
_ = plt.title('ROC curve (train)')
_ = plt.legend()

In [None]:
# ROC curve on the validation split, with the diagonal as the chance baseline.
# Labels and a title are added so the figure can be read on its own.
fpr, tpr, tr = roc_curve(y_val, preds_val[:, 1])
_ = plt.plot(fpr, tpr, label='model')
_ = plt.plot((0, 1), (0, 1), label='chance')
_ = plt.xlabel('False positive rate')
_ = plt.ylabel('True positive rate')
_ = plt.title('ROC curve (validation)')
_ = plt.legend()

In [None]:
# Confusion matrix of the hard predictions on the validation split.
cm_val = confusion_matrix(y_val, preds_val.argmax(axis=1))
ConfusionMatrixDisplay(cm_val).plot()

In [None]:
# Confusion matrix of the hard predictions on the training split.
cm_train = confusion_matrix(y_train, preds_train.argmax(axis=1))
ConfusionMatrixDisplay(cm_train).plot()

In [None]:
import shap

# Explain the model on the validation split. X_val is used directly instead of
# rebinding the globals `X` / `y`, which hold the full training matrices;
# overwriting them made re-running the split cell operate on the validation
# subset only.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_val)

shap.summary_plot(shap_values, X_val, plot_type="bar")

In [None]:
def print_stats(model, border):
    """Print `border` followed by the train and validation balanced accuracy.

    `border` is only echoed in the output; the scores come from
    `model.predict` on the global X_train / X_val splits.
    """
    train_score, val_score = (
        balanced_accuracy_score(labels, model.predict(features))
        for features, labels in ((X_train, y_train), (X_val, y_val))
    )
    print(f'{border} {train_score} {val_score}')

In [None]:
# for border in np.arange(0.1, 1, 0.1):
#     model.set_probability_threshold(border)
#     print_stats(model, border)

## Predict

In [None]:
# Column names of the feature table (including the user1 / user2 keys).
features = list(df.columns)

In [None]:
links_test = pd.read_csv('processed/links_test.csv')

# Left join so that every test pair keeps its row and its position. The previous
# inner join silently dropped pairs without precomputed features, after which
# the sequential ID column no longer lined up with the rows of links_test.
test = links_test.merge(df, on=['user1', 'user2'], how='left')

# Pairs missing from the precomputed table (e.g. listed with the larger id
# first, or involving a user outside friends.user1) get their graph features
# computed directly.
feature_cols = [col for col in df.columns if col not in ('user1', 'user2')]
missing = test[feature_cols].isna().any(axis=1)
if missing.any():
    fallback = pd.DataFrame(
        [
            calculate_similarity(pair)
            for pair in zip(test.loc[missing, 'user1'], test.loc[missing, 'user2'])
        ],
        columns=graph_features.columns,
        index=test.index[missing],
    )
    for col in feature_cols:
        test.loc[missing, col] = fallback[col]

# The NaNs introduced by the left join widen the dtypes; restore those the
# model saw during training.
test = test.astype({col: df[col].dtype for col in feature_cols})
assert len(test) == len(links_test), 'test rows must stay aligned with links_test'

test_pred = model.predict_proba(test[features]).argmax(axis=1)
print('friends percentage: ', sum(test_pred) / len(test_pred))
print('friends count: ', sum(test_pred))
test_pred = test[['user1', 'user2']].assign(is_friends=test_pred)
# ID is the row position, which now matches the row order of links_test.
test_pred.assign(ID=list(range(len(test_pred))))[['ID', 'is_friends']].to_csv('pred.csv', index=False)