In [11]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd


positive_data = pd.read_csv('../data/training/positive/positive_data.csv')
#add a column with number of common authors
positive_data['num_of_common_authors'] = positive_data.apply(
    lambda row: len(set(row['source_authors'].split(',')).intersection(set(row['target_authors'].split(',')))), axis=1)
positive_data['label'] = 1

negative_data = pd.read_csv('../data/training/negative/negative_data.csv')
negative_data['label'] = 0
negative_data['num_of_common_authors'] = positive_data.apply(
    lambda row: len(set(row['source_authors'].split(',')).intersection(set(row['target_authors'].split(',')))), axis=1)

data = pd.concat([positive_data, negative_data], ignore_index=True)
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data.head()

Unnamed: 0,source,target,cosine_similarity,source_abstract,target_abstract,source_abstract_length,target_abstract_length,source_authors,target_authors,source_num_authors,target_num_authors,num_of_common_authors,label
0,103793,91378,0.0,The main characteristic of collaborative filte...,,131,0,"Yin-Fu Huang,San-Des Lin","Junting Ye,Steven Skiena",2,2,0.0,0
1,28572,81931,0.105705,As a step towards developing zero-shot task ge...,In addition to identifying the content within ...,150,154,"Junhyuk Oh,Satinder Singh,Honglak Lee,Pushmeet...","Scott Reed,Yi Zhang,Yuting Zhang,Honglak Lee",4,4,1.0,1
2,89777,89777,1.0,Clustering high dimensional data is a big chal...,Clustering high dimensional data is a big chal...,122,122,"Lifei Chen,Qingshan Jiang,Shengrui Wang","Lifei Chen,Qingshan Jiang,Shengrui Wang",3,3,,0
3,36023,36115,0.224333,"We trained a large, deep convolutional neural ...","Previously, no-reference (NR) stereoscopic 3D ...",150,243,"Alex Krizhevsky,Ilya Sutskever,Geoffrey E. Hinton","Heeseok Oh,Sewoong Ahn,Jongyoo Kim,Sanghoon Lee",3,4,0.0,1
4,35522,38898,0.400628,We present a novel and flexible approach to th...,Variable and feature selection have become the...,208,114,"Simon Perkins,Kevin Lacker,James Theiler","Isabelle Guyon,André Elisseeff",3,2,0.0,1


In [12]:
data['num_of_common_authors'] = data['num_of_common_authors'].fillna(0)
data.head(20)

Unnamed: 0,source,target,cosine_similarity,source_abstract,target_abstract,source_abstract_length,target_abstract_length,source_authors,target_authors,source_num_authors,target_num_authors,num_of_common_authors,label
0,103793,91378,0.0,The main characteristic of collaborative filte...,,131,0,"Yin-Fu Huang,San-Des Lin","Junting Ye,Steven Skiena",2,2,0.0,0
1,28572,81931,0.105705,As a step towards developing zero-shot task ge...,In addition to identifying the content within ...,150,154,"Junhyuk Oh,Satinder Singh,Honglak Lee,Pushmeet...","Scott Reed,Yi Zhang,Yuting Zhang,Honglak Lee",4,4,1.0,1
2,89777,89777,1.0,Clustering high dimensional data is a big chal...,Clustering high dimensional data is a big chal...,122,122,"Lifei Chen,Qingshan Jiang,Shengrui Wang","Lifei Chen,Qingshan Jiang,Shengrui Wang",3,3,0.0,0
3,36023,36115,0.224333,"We trained a large, deep convolutional neural ...","Previously, no-reference (NR) stereoscopic 3D ...",150,243,"Alex Krizhevsky,Ilya Sutskever,Geoffrey E. Hinton","Heeseok Oh,Sewoong Ahn,Jongyoo Kim,Sanghoon Lee",3,4,0.0,1
4,35522,38898,0.400628,We present a novel and flexible approach to th...,Variable and feature selection have become the...,208,114,"Simon Perkins,Kevin Lacker,James Theiler","Isabelle Guyon,André Elisseeff",3,2,0.0,1
5,25812,39486,0.573884,Thumbnails play such an important role in onli...,"In video captioning task, the best practice ha...",210,180,"Yale Song,Miriam Redi,Jordi Vallmitjana,Alejan...","Yangyu Chen,Shuhui Wang,Weigang Zhang,Qingming...",4,4,0.0,1
6,19595,24813,0.236664,Face recognition performance degrades signific...,We present a new system for biometric recognit...,211,148,"Muhammad Uzair,Arif Mahmood,Ajmal Mian,Chris M...","Fernando Alonso-Fernandez,Josef Bigun",4,2,0.0,1
7,46502,46502,1.0,Real-world datasets often have representations...,Real-world datasets often have representations...,151,151,"Hao Wang,Yan Yang,Tianrui Li","Hao Wang,Yan Yang,Tianrui Li",3,3,0.0,0
8,109664,4817,0.0,The paper presents a mixed signal CMOS feedfor...,,90,0,"J. Liu,M.A. Brooke,K. Hirotsu","Wanzheng Zhu,Chao Zhang,Shuochao Yao,Xiaobin G...",3,5,0.0,0
9,7288,82073,0.187624,Two of the most critical requirements in suppo...,"This paper presents eigenPulse, a new method f...",118,162,"P.J. Phillips,Hyeonjoon Moon,S.A. Rizvi,P.J. R...","John M. Irvine,Steven A. Israel,W. Todd Scrugg...",4,4,0.0,1


In [None]:

# Use cosine similarity and abstract lengths as features
X = data[['num_of_common_authors','source_num_authors','target_num_authors','cosine_similarity', 'source_abstract_length',
          'target_abstract_length']].values  # Features: similarity and abstract lengths
y = data['label']  # Target: all positive examples

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define and train the MLP
mlp = MLPClassifier(hidden_layer_sizes=(5,5), solver='adam', activation='relu', max_iter=1000)
mlp.fit(X_train, y_train)

# Predict probabilities
y_prob = mlp.predict_proba(X_test)[:, 1]  # Probability of class 1 (citation)

# Print the probabilities
print("Predicted probabilities:", y_prob)


Predicted probabilities: [9.97579476e-01 9.51371672e-01 7.03243337e-01 ... 1.26583698e-04
 5.99898195e-01 2.48991607e-05]


In [14]:
import pandas as pd

predict_data = pd.read_csv('../data/training/predict/predict_data.csv')


In [15]:
X_predict = predict_data[
    ['num_of_common_authors','source_num_authors','target_num_authors','cosine_similarity', 'source_abstract_length',
     'target_abstract_length']].values  # Features: similarity and abstract lengths
X_predict = scaler.transform(X_predict)  # Standardize the features using the same scaler
# Predict probabilities
res = mlp.predict_proba(X_predict)[:, 1]  # Probability of class 1 (citation)
resdf = pd.DataFrame(res, columns=['probability'])
resdf = resdf.reset_index()
# Rename the 'index' column to 'ID'
resdf = resdf.rename(columns={'index': 'ID', 'probability': 'Label'})

# Save the updated DataFrame to a CSV file
resdf[['ID', 'Label']].to_csv('../predictions/predictions.csv', index=False)