In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import pandas as pd

positive_data = pd.read_csv('../data/training/positive/positive_data.csv')
# add a column with number of common authors
positive_data['num_of_common_authors'] = positive_data.apply(
    lambda row: len(set(row['source_authors'].split(',')).intersection(set(row['target_authors'].split(',')))), axis=1)
positive_data['label'] = 1

negative_data = pd.read_csv('../data/training/negative/negative_data.csv')
negative_data['label'] = 0
negative_data['num_of_common_authors'] = positive_data.apply(
    lambda row: len(set(row['source_authors'].split(',')).intersection(set(row['target_authors'].split(',')))), axis=1)

data = pd.concat([positive_data, negative_data], ignore_index=True)
data = data.sample(frac=1, random_state=42).reset_index(drop=True)
data['num_of_common_authors'] = data['num_of_common_authors'].fillna(0)

# Use cosine similarity and abstract lengths as features
X = data[['num_of_common_authors','source_num_authors','target_num_authors','cosine_similarity', 'source_abstract_length',
          'target_abstract_length']].values  # Features: similarity and abstract lengths
y = data['label']  # Target: all positive examples

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define and train the Logistic Regression model
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

# Predict probabilities
y_prob = logreg.predict_proba(X_test)[:, 1]  # Probability of class 1 (citation)

# Print the probabilities
print("Predicted probabilities:", y_prob)

import pandas as pd

predict_data = pd.read_csv('../data/training/predict/predict_data.csv')

X_predict = predict_data[
    ['num_of_common_authors','source_num_authors','target_num_authors','cosine_similarity', 'source_abstract_length',
     'target_abstract_length']].values  # Features: similarity and abstract lengths
X_predict = scaler.transform(X_predict)  # Standardize the features using the same scaler
# Predict probabilities
res = logreg.predict_proba(X_predict)[:, 1]  # Probability of class 1 (citation)
resdf = pd.DataFrame(res, columns=['probability'])
resdf = resdf.reset_index()
# Rename the 'index' column to 'ID'
resdf = resdf.rename(columns={'index': 'ID', 'probability': 'Label'})

# Save the updated DataFrame to a CSV file
resdf[['ID', 'Label']].to_csv('../predictions/predictions.csv', index=False)


Predicted probabilities: [0.60075649 0.58343488 0.53490282 ... 0.56360738 0.5251049  0.62016097]
