In [None]:
from sklearn.metrics import log_loss
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import pandas as pd

# Rows that the trained model will score later in this cell.
predict_data = pd.read_csv('../data/training/predict/predict_data.csv')

# Labelled examples: every row of the positive file gets label 1,
# every row of the negative file gets label 0.
positive_data = pd.read_csv('../data/training/positive/positive_data.csv').assign(label=1)
negative_data = pd.read_csv('../data/training/negative/negative_data.csv').assign(label=0)

# Stack both classes, shuffle all rows with a fixed seed so the order is
# repeatable, and rebuild a clean 0..n-1 index.
data = (
    pd.concat([positive_data, negative_data], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

def _clean_features(frame):
    """Return a copy of ``frame`` with a cleaned ``distance`` and an added ``abs_len_ratio``.

    The same cleaning is applied to the labelled data and to the rows to be
    predicted, so both go through this single helper.

    Steps:
      * ``distance``: +/-inf is replaced by the sentinel -1.
      * ``abs_len_ratio``: ``source_abstract_length / target_abstract_length``.
        A zero target length makes the division produce inf (or NaN for 0/0);
        inf is replaced by -1, values are clipped to [-1e10, 1e10], and any
        remaining NaN is filled with -1.

    The absolute length columns are left in place; the ratio is an extra
    column, not a replacement.
    """
    infinities = [float('inf'), -float('inf')]

    # replace inf distance with -1
    distance = frame['distance'].replace(infinities, -1)

    # ratio of abstract lengths, added alongside the absolute lengths
    ratio = frame['source_abstract_length'] / frame['target_abstract_length']
    ratio = ratio.replace(infinities, -1)
    ratio = ratio.clip(lower=-1e10, upper=1e10)
    ratio = ratio.fillna(-1)  # Fill NaN values with -1

    return frame.assign(distance=distance, abs_len_ratio=ratio)


data = _clean_features(data)
predict_data = _clean_features(predict_data)

# Columns fed to the model, in the order the scaler and the network see them.
feature_columns = [
    'source_corerank',
    'target_corerank',
    'source_pagerank',
    'target_pagerank',
    'source_k_core',
    'target_k_core',
    'distance',
    'dot_product',
    'euclidean_distance',
    'cosine_similarity',
    'abs_len_ratio',
    'source_abstract_length',
    'target_abstract_length',
    'jaccard_score',
]

X = data[feature_columns].values
y = data['label']

# Hold out 20% of the rows for evaluation; the seed fixes which rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the standardization from the training rows only, then apply that
# same fitted transform to both partitions.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Small MLP: three hidden layers of 5 units each, ReLU activations, Adam solver.
# random_state pins weight initialization and batch sampling so that the
# metrics and exported predictions are repeatable from run to run, matching
# the seed already used for the shuffle and the train/test split.
# warm_start only matters if fit() is called again on this same object, in
# which case training continues from the previous weights.
mlp = MLPClassifier(
    hidden_layer_sizes=(5, 5, 5),
    solver='adam',
    activation='relu',
    warm_start=True,
    random_state=42,
)
mlp.fit(X_train, y_train)

# Column 1 of predict_proba is the probability of the positive class (label 1).
y_prob = mlp.predict_proba(X_test)[:, 1]

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Hard class predictions on the held-out rows. Four metrics below need them,
# so run the network once instead of once per metric.
y_pred = mlp.predict(X_test)

print("Predicted probabilities:", y_prob)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))
# Log loss is computed from the positive-class probabilities, not the hard labels.
print("Log Loss:", log_loss(y_test, y_prob))

# Same features, in the same order, as the matrix the model was trained on.
predict_columns = [
    'source_corerank',
    'target_corerank',
    'source_pagerank',
    'target_pagerank',
    'source_k_core',
    'target_k_core',
    'distance',
    'dot_product',
    'euclidean_distance',
    'cosine_similarity',
    'abs_len_ratio',
    'source_abstract_length',
    'target_abstract_length',
    'jaccard_score',
]

# Reuse the scaler fitted on the training rows; it is not refitted here.
X_predict = scaler.transform(predict_data[predict_columns].values)

# Positive-class probability for every row of predict_data.
res = mlp.predict_proba(X_predict)[:, 1]

# ID is the 0-based row position in predict_data; Label is the probability.
resdf = pd.DataFrame({'ID': range(len(res)), 'Label': res})

resdf[['ID', 'Label']].to_csv('../predictions/mlp_predictions.csv', index=False)

Predicted probabilities: [9.86795625e-01 3.82082058e-02 2.75350555e-02 4.63318149e-04
 4.62135406e-03 6.78704144e-03 9.96025999e-01 9.96166449e-01
 6.15290826e-01 9.98830852e-01 2.26985987e-01 9.90365231e-01
 1.34429247e-04 9.97251691e-01 1.17406073e-01 7.34158083e-02
 9.95601558e-01 2.90581154e-03 1.20993278e-02 4.68648331e-01
 1.17188589e-03 9.93495533e-01 4.38575326e-02 3.31801787e-02
 9.97471488e-01 9.95611130e-01 4.07763628e-03 7.91879482e-02
 5.92839421e-03 1.18038145e-04 4.43827196e-03 4.76342922e-02
 1.80195316e-04 9.89235237e-01 7.87778449e-03 9.98970907e-01
 9.98101114e-01 1.37322350e-03 2.99655620e-05 4.32193180e-03
 1.40970498e-03 6.33245714e-01 9.88905854e-01 4.88878094e-01
 9.98832283e-01 9.98956079e-01 3.63881964e-02 3.66666251e-04
 2.83163081e-02 2.98045151e-02 3.46912319e-02 9.93833161e-01
 1.85844685e-02 3.49225879e-01 5.81113695e-01 4.00722218e-01
 9.98850458e-01 4.55318093e-03 9.98829691e-01 2.39259018e-03
 1.55683595e-02 3.85255829e-03 3.31129901e-01 2.67322186e-04