In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import re
from tqdm.notebook import tqdm


def clean_text(text):
    """Normalise one raw text field before it is embedded.

    Missing values are mapped to an empty string; anything else is converted
    to ``str``, stripped of every character that is neither a word character
    nor whitespace, and lower-cased.

    Args:
        text: The raw cell value (string, number, None, NaN, ...).

    Returns:
        str: The cleaned text, or '' when the input is a missing value.
    """
    # str(None) / str(float('nan')) would otherwise inject the literal words
    # 'none' / 'nan' into the text, so treat missing scalars as empty text.
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        return ''
    text = str(text)
    # Remove punctuation (underscores count as word characters and are kept)
    text = re.sub(r'[^\w\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    return text

In [2]:
# Load the journal corpus; the first CSV column becomes the index.
journal_abstract_data = pd.read_csv('economic_journals_abstracts_df.csv', index_col=[0])

# Normalise both free-text columns with clean_text.
for text_column in ('Title', 'Abstract'):
    journal_abstract_data[text_column] = journal_abstract_data[text_column].apply(clean_text)

# Single text field per paper: cleaned title and abstract joined by a space.
journal_abstract_data['text'] = (
    journal_abstract_data['Title'] + ' ' + journal_abstract_data['Abstract']
).fillna('')

# Two-stage split: hold out 0.1% of all rows as the test set, then 10% of the
# remainder as the validation set (both stages use the same fixed seed).
train_data, test_data = train_test_split(
    journal_abstract_data, test_size=0.001, random_state=42
)
train_data, val_data = train_test_split(
    train_data, test_size=0.1, random_state=42
)

# Give every split a fresh 0..n-1 index.
train_data, val_data, test_data = (
    split.reset_index(drop=True) for split in (train_data, val_data, test_data)
)

print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

Training set size: 133737
Validation set size: 14860
Test set size: 149


In [3]:
from sentence_transformers import SentenceTransformer

# Pre-trained sentence encoder used for the embeddings below
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the combined title/abstract text of every training paper
train_texts = train_data['text'].tolist()
train_embeddings = model.encode(train_texts, show_progress_bar=True)

Batches:   0%|          | 0/4180 [00:00<?, ?it/s]

In [4]:
from sklearn.cluster import KMeans

# One cluster per distinct journal in the training split
n_clusters = train_data['Journal_Name'].nunique()
# n_init is set explicitly to 10 (the default in effect for this run) so the
# clustering does not change when the library default changes, and the
# "default n_init" warning is no longer emitted.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=0)

# Fit k-means with the embeddings
kmeans.fit(train_embeddings)

  super()._check_params_vs_input(X, default_n_init=10)


In [18]:
def recommend_journal_cluster(new_text, model, kmeans, train_data, top_x):
    """Recommend journals by looking at the cluster nearest to a new text.

    Args:
        new_text: Text of the paper to place.
        model: Encoder exposing ``encode(list_of_texts)``.
        kmeans: Fitted clusterer exposing ``predict``.
        train_data: DataFrame with 'Journal_Name' and 'cluster' columns.
        top_x: Maximum number of journals to return.

    Returns:
        list: Names of the journals that occur most often in the predicted
        cluster, most frequent first, at most ``top_x`` of them.
    """
    # Embed the single text and keep its (only) vector
    query_vector = model.encode([new_text])[0]

    # predict expects a 2-D array, hence the reshape to one row
    nearest_cluster = kmeans.predict(query_vector.reshape(1, -1))[0]

    # Count the journals of the training papers that share this cluster
    in_cluster = train_data['cluster'] == nearest_cluster
    journal_counts = train_data.loc[in_cluster, 'Journal_Name'].value_counts()
    return journal_counts.head(top_x).index.tolist()


# Store each training paper's fitted cluster label; recommend_journal_cluster
# filters on this column
train_data['cluster'] = kmeans.labels_

# Example usage
new_paper_text = "An in-depth analysis of financial trends and their implications on global markets."
recommended_journals = recommend_journal_cluster(
    new_text=new_paper_text,
    model=model,
    kmeans=kmeans,
    train_data=train_data,
    top_x=3,
)
print(f"The recommended journals for the new paper are: {recommended_journals}")

The recommended journals for the new paper are: ['Journal of International Money and Finance ', 'Economic Modelling ', 'Journal of Banking and Finance ']


In [19]:
# One record per test paper: its text, the recommended journals, the true journal
results = []
total_predictions = len(test_data)

for _, paper in tqdm(test_data.iterrows(), total=total_predictions, desc="Evaluating"):
    # Top-3 journals of the cluster nearest to this paper
    top_journals = recommend_journal_cluster(paper['text'], model, kmeans, train_data, top_x=3)
    results.append({
        'text': paper['text'],
        'predicted_journals': top_journals,
        'actual_journal': paper['Journal_Name'],
    })

# A prediction counts as correct when the true journal is among the recommended ones
correct_predictions = sum(
    record['actual_journal'] in record['predicted_journals'] for record in results
)

# Share of test papers whose true journal was recommended
accuracy = correct_predictions / total_predictions
print(f"Accuracy: {accuracy:.2f}")

# Tabular form of the per-paper results for easier analysis
results_df = pd.DataFrame(results)

Evaluating:   0%|          | 0/149 [00:00<?, ?it/s]

Accuracy: 0.45


In [20]:
def print_full_results(results_df, max_rows=None):
    """
    Print the text, predicted journals, and actual journal for entries in the results DataFrame.

    Args:
    results_df (DataFrame): A DataFrame containing the columns 'text', 'predicted_journals',
        and 'actual_journal'.
    max_rows (int, optional): If given, only the first `max_rows` entries are printed.
        Defaults to None, which prints every entry.
    """
    # Restrict the output only when a limit was requested
    rows_to_show = results_df if max_rows is None else results_df.head(max_rows)
    for index, row in rows_to_show.iterrows():
        print(f"Paper Text: {row['text']}")
        print(f"Predicted Journal: {row['predicted_journals']}")
        print(f"Actual Journal: {row['actual_journal']}")
        # Visual separator between entries
        print("-" * 100)

# Print every evaluated paper together with its predicted and actual journals.
print_full_results(results_df)

Paper Text: representation of measurement error in marketing variables review of approaches and extension to threefacet designs abstract this paper explores approaches for modeling measurement error in marketing research including random method and measure specific sources of error the following approaches are considered classic confirmatory factor analysis secondorder models panel models additive traitmethod models correlated uniqueness models covariance components analysis additive traitmethodmeasure specificerror models and the direct product model where traits and methods interact finally a threefacet multiplicative model is addressed wherein latent variables underlying a phenomenon under investigation are shown to interact with multiple methods and occasions of measurement the threefacet model is illustrated on a study of consumer attitudes toward losing weight explicitly conducted for this paper
Predicted Journal: ['Junior Common Room', 'Ecological Economics ', 'Journal of Econom