In [1]:
from pathlib import Path

import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the two example tables and tag every row with its class label:
# 1 for rows from the positive file, 0 for rows from the negative file.
positive_data = pd.read_csv('../data/training/positive/positive_data.csv').assign(label=1)
negative_data = pd.read_csv('../data/training/negative/negative_data.csv').assign(label=0)

# Stack both tables into one frame, shuffle all rows with a fixed seed so the
# order is reproducible across runs, then renumber the index from 0.
data = (
    pd.concat([positive_data, negative_data], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

In [5]:

# Columns fed to the model: author counts, the two similarity scores and the
# abstract lengths. Defined once so that training and prediction always select
# the same columns in the same order (the fitted scaler depends on that order).
FEATURE_COLUMNS = [
    'source_num_authors', 'target_num_authors', 'jaccard_similarity', 'similarity',
    'source_abstract_length', 'target_abstract_length',
]
PREDICTIONS_PATH = Path('../predictions/logistic_regression_predictions.csv')

X = data[FEATURE_COLUMNS].values
y = data['label']

# Split the data into training and testing sets (80/20, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler is fitted on the training split only and
# then re-used as-is for the held-out split and for the prediction data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Define and train the logistic regression model
log_reg = LogisticRegression(max_iter=1000, C=0.5, solver='liblinear')
log_reg.fit(X_train, y_train)

# Predict probabilities on the held-out split
y_prob = log_reg.predict_proba(X_test)[:, 1]  # Probability of class 1 (citation)

# Print the probabilities
print("Predicted probabilities:", y_prob)

# Score the held-out split so the quality of the probabilities is visible,
# not just their raw values (0.5 means no better than chance).
print("Held-out ROC AUC:", roc_auc_score(y_test, y_prob))

# Predict on new data, applying the scaler fitted on the training split
predict_data = pd.read_csv('../data/training/predict/predict_data.csv')
X_predict = predict_data[FEATURE_COLUMNS].values
X_predict = scaler.transform(X_predict)
res = log_reg.predict_proba(X_predict)[:, 1]

# Save results to a CSV file: ID is the 0-based row position in predict_data,
# Label is the predicted probability of class 1.
resdf = pd.DataFrame({'ID': range(len(res)), 'Label': res})
# to_csv does not create missing directories, so make sure the target exists.
PREDICTIONS_PATH.parent.mkdir(parents=True, exist_ok=True)
resdf.to_csv(PREDICTIONS_PATH, index=False)

Predicted probabilities: [0.48957318 0.48986002 0.49660541 ... 0.50840543 0.49334812 0.50579233]
