In [2]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# --- Load the raw tables ------------------------------------------------------
# Rows to score at the end of the cell; they carry no label.
predict_data = pd.read_csv('../data/training/predict/predict_data.csv')

# Labelled examples: every row of the positive file is tagged 1, every row of the
# negative file is tagged 0.
positive_data = pd.read_csv('../data/training/positive/positive_data.csv').assign(label=1)
negative_data = pd.read_csv('../data/training/negative/negative_data.csv').assign(label=0)

# Stack both classes, then shuffle all rows with a fixed seed so the order is
# reproducible, and renumber the index from 0.
data = (
    pd.concat([positive_data, negative_data], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Replace +inf / -inf in `distance` with the sentinel -1, identically for the
# labelled frame and the frame to be scored.
for frame in (data, predict_data):
    frame['distance'] = frame['distance'].replace([float('inf'), -float('inf')], -1)

# Feature columns fed to the classifier. Declared once so that the training matrix
# and the prediction matrix are always built from the same columns in the same order.
FEATURE_COLUMNS = [
    'distance',
    'dot_product',
    'euclidean_distance',
    'cosine_similarity',
    'source_abstract_length',
    'target_abstract_length',
    'jaccard_score',
]

X = data[FEATURE_COLUMNS].values
y = data['label']

# Split the data into training and testing sets (20% held out, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training split only and then
# re-used, unchanged, for the test split and for the rows to be scored below.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define and train the Gaussian Naive Bayes classifier.
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
# Legacy alias: this cell used to call the model `mlp`; kept in case a later cell
# still refers to that name.
mlp = nb_model

y_prob = nb_model.predict_proba(X_test)[:, 1]  # Probability of class 1 (citation)
y_pred = nb_model.predict(X_test)  # Hard labels, computed once and shared by all metrics

print("Predicted probabilities:", y_prob)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1 Score:", f1_score(y_test, y_pred))

# Score the unlabelled rows with the scaler fitted on the training split.
X_predict = scaler.transform(predict_data[FEATURE_COLUMNS].values)

# Export one row per scored example: `ID` is the 0-based row position in
# `predict_data`, and `Label` holds the predicted probability of class 1
# (not a hard 0/1 label).
res = nb_model.predict_proba(X_predict)[:, 1]
resdf = (
    pd.DataFrame(res, columns=['probability'])
    .reset_index()
    .rename(columns={'index': 'ID', 'probability': 'Label'})
)

resdf[['ID', 'Label']].to_csv('../predictions/nb_predictions.csv', index=False)

Predicted probabilities: [7.02524449e-01 7.04602953e-03 7.63436026e-01 ... 1.97974744e-07
 3.47769742e-01 8.37129638e-03]
Accuracy: 0.8846358064530895
Precision: 0.990243275455119
Recall: 0.7766724250908441
F1 Score: 0.8705504693177879
