In [9]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import  StandardScaler
import pandas as pd
# Unlabelled rows that are scored once the classifier has been trained.
predict_data = pd.read_csv('../data/training/predict/predict_data.csv')

# Labelled rows: positives are tagged with label 1, negatives with label 0.
positive_data = pd.read_csv('../data/training/positive/positive_data.csv').assign(label=1)
negative_data = pd.read_csv('../data/training/negative/negative_data.csv').assign(label=0)

# Stack both classes, shuffle the rows with a fixed seed and renumber the
# index from 0 so the row order is reproducible.
data = (
    pd.concat([positive_data, negative_data], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# An infinite 'distance' (of either sign) is replaced by -1, in the training
# frame and the prediction frame alike.
infinite_values = [float('inf'), float('-inf')]
for frame in (data, predict_data):
    frame['distance'] = frame['distance'].replace(infinite_values, -1)


# Feature columns fed to the classifier: the distance/similarity measures,
# both abstract lengths and the Jaccard score.
feature_columns = [
    'distance', 'dot_product', 'euclidean_distance', 'cosine_similarity',
    'source_abstract_length', 'target_abstract_length', 'jaccard_score',
]
X = data[feature_columns].values
y = data['label']  # Target: 1 for rows from the positive file, 0 for the negative file

# Hold out 20% of the rows as a test split (seeded for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training split only
# and then applied unchanged to the test split, so no test statistics leak
# into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define and train the MLP (three hidden layers of 5 units each).
# random_state is pinned so that weight initialisation and adam's batch
# sampling are reproducible, matching the seeded shuffle and split above it.
mlp = MLPClassifier(
    hidden_layer_sizes=(5, 5, 5),
    solver='adam',
    activation='relu',
    max_iter=1000,
    random_state=42,
)
mlp.fit(X_train, y_train)

# Probability of class 1 (citation) for every row of the held-out split.
# NOTE(review): y_test is never compared against these probabilities, so no
# quality metric is reported here.
y_prob = mlp.predict_proba(X_test)[:, 1]

# Print the probabilities
print("Predicted probabilities:", y_prob)

# Same feature columns, in the same order, as the ones the model was trained on.
predict_feature_columns = [
    'distance', 'dot_product', 'euclidean_distance', 'cosine_similarity',
    'source_abstract_length', 'target_abstract_length', 'jaccard_score',
]

# Scale the prediction features with the scaler already fitted on the
# training split, then take the probability of class 1 (citation).
X_predict = scaler.transform(predict_data[predict_feature_columns].values)
res = mlp.predict_proba(X_predict)[:, 1]

# Two-column frame: 'ID' is the 0-based row position in the prediction
# data, 'Label' is the predicted probability.
resdf = pd.Series(res, name='Label').rename_axis('ID').reset_index()

# Write the ID/Label pairs to the predictions CSV without the frame index.
resdf.to_csv('../predictions/predictions.csv', columns=['ID', 'Label'], index=False)

Predicted probabilities: [0.97572401 0.03611321 0.98860035 ... 0.02223196 0.79974549 0.13997358]
