In [37]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import re
from tqdm.notebook import tqdm


def clean_text(text):
    text = str(text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Convert text to lowercase
    text = text.lower()
    return text

In [38]:
journal_abstract_data = pd.read_csv('economic_journals_abstracts_df.csv', index_col=[0])
journal_abstract_data['Title'] = journal_abstract_data['Title'].apply(clean_text)
journal_abstract_data['Abstract'] = journal_abstract_data['Abstract'].apply(clean_text)
journal_abstract_data['text'] = journal_abstract_data['Title'] + ' ' + journal_abstract_data['Abstract']
journal_abstract_data['text'] = journal_abstract_data['text'].fillna('')
train_data, test_data = train_test_split(journal_abstract_data, test_size=0.001, random_state=42)
train_data, val_data = train_test_split(train_data, test_size=0.1, random_state=42)
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)
test_data = test_data.reset_index(drop=True)

print(f"Training set size: {len(train_data)}")
print(f"Validation set size: {len(val_data)}")
print(f"Test set size: {len(test_data)}")

Training set size: 133737
Validation set size: 14860
Test set size: 149


In [39]:
journal_abstract_data.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Journal_Website,Journal_Name,Volume_Issue,Title,Authors,Abstract,text
0,0.0,0.0,Elsevier,Journal of Empirical Finance,"Volume 75, Issue 1",expensive anomalies,"Deniz Anginer a, Sugata Ray b, H. Nejat Seyhun...",anomalies have higher returns when they are c...,expensive anomalies anomalies have higher ret...
1,1.0,1.0,Elsevier,Journal of Empirical Finance,"Volume 75, Issue 1",climate change concerns and mortgage lending,"Tinghua Duan a, Frank Weikai Li b",abnormally high local temperature leads to el...,climate change concerns and mortgage lending ...
2,2.0,2.0,Elsevier,Journal of Empirical Finance,"Volume 75, Issue 1",technological disparity and its impact on mark...,"Kiseo Chung a, Seoyoung Kim b",we document substantial technological dispari...,technological disparity and its impact on mark...
3,3.0,3.0,Elsevier,Journal of Empirical Finance,"Volume 75, Issue 1",the effect of investor attention on stock pric...,"Ting-Hsuan Chen, Kai-Sheng Chen",stock crash concerns the study addresses the ...,the effect of investor attention on stock pric...
4,4.0,4.0,Elsevier,Journal of Empirical Finance,"Volume 75, Issue 1",tail risks and private equity performance,"Hrvoje Kurtović, Garen Markarian",we explore key determinants of private equity...,tail risks and private equity performance we ...


In [40]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)  # Adjust the number of features as needed

# Fit and transform the 'text' column of the training data
tfidf_train_features = tfidf_vectorizer.fit_transform(train_data['text'])

# Transform the 'text' column of the validation and test data
tfidf_val_features = tfidf_vectorizer.transform(val_data['text'])
tfidf_test_features = tfidf_vectorizer.transform(test_data['text'])

In [41]:
def find_distinct_terms(journal, df, tfidf_matrix, features):
    # Filter the DataFrame for the specific journal
    journal_df = df[df['Journal_Name'] == journal]

    # Use the filtered DataFrame's index to access the TF-IDF matrix
    journal_tfidf = tfidf_matrix[journal_df.index]

    # Calculate the average TF-IDF score for each term in the journal
    avg_scores = np.mean(journal_tfidf, axis=0).A1

    # Pair terms with their average scores
    term_scores = [(features[idx], avg_scores[idx]) for idx in range(len(features))]

    # Sort terms by their score, descending
    sorted_terms = sorted(term_scores, key=lambda x: x[1], reverse=True)

    return sorted_terms

In [45]:
features = tfidf_vectorizer.get_feature_names_out()

# Applying the function to each journal and storing results
distinct_terms = {}
for journal in train_data['Journal_Name'].unique():
    distinct_terms[journal] = find_distinct_terms(journal, train_data, tfidf_train_features, features)

In [43]:
def recommend_journal(new_paper_text, tfidf_vectorizer, distinct_terms, features):
    # Generate TF-IDF for the new paper
    new_paper_tfidf = tfidf_vectorizer.transform([new_paper_text]).toarray()

    # Initialize a dictionary to hold similarity scores
    similarity_scores = {}

    # Compare the new paper with each journal's profile
    for journal, terms in distinct_terms.items():
        # Create a vector for the journal's profile
        journal_vector = np.zeros(len(features))
        for term, score in terms:
            if term in features:
                index = np.where(features == term)[0][0]
                journal_vector[index] = score

        # Calculate cosine similarity
        similarity = cosine_similarity([journal_vector], new_paper_tfidf)[0][0]
        similarity_scores[journal] = similarity
    
    # Find the journal with the highest similarity score
    recommended_journal = max(similarity_scores, key=similarity_scores.get)
    
    return recommended_journal

In [46]:
new_paper_text = "An in-depth analysis of financial trends and their implications on global markets."
recommended_journal = recommend_journal(new_paper_text, tfidf_vectorizer, distinct_terms, features)
print(f"The recommended journal for the new paper is: {recommended_journal}")

The recommended journal for the new paper is: Journal of International Money and Finance 


In [47]:
results = []  # Initialize a list to store the results

correct_predictions = 0
total_predictions = len(test_data)

for index, row in tqdm(test_data.iterrows(), total=total_predictions, desc="Evaluating"):
    actual_journal = row['Journal_Name']
    new_paper_text = row['text']
    predicted_journal = recommend_journal(new_paper_text, tfidf_vectorizer, distinct_terms, features)

    # Store the results
    results.append({
        'text': new_paper_text,
        'predicted_journal': predicted_journal,
        'actual_journal': actual_journal
    })

    if predicted_journal == actual_journal:
        correct_predictions += 1

accuracy = correct_predictions / total_predictions
print(f"Accuracy: {accuracy}")

# Convert the results list to a DataFrame for easier analysis
results_df = pd.DataFrame(results)

Evaluating:   0%|          | 0/149 [00:00<?, ?it/s]

Accuracy: 0.2550335570469799


In [48]:
results_df.iloc

<pandas.core.indexing._iLocIndexer at 0x7f8ba9946c60>

In [32]:
def print_full_results(results_df):
    """
    Print the text, predicted journal, and actual journal for each entry in the results DataFrame.

    Args:
    results_df (DataFrame): A DataFrame containing the columns 'text', 'predicted_journal', and 'actual_journal'.
    """
    for index, row in results_df.iterrows():
        print(f"Paper Text: {row['text']}")
        print(f"Predicted Journal: {row['predicted_journal']}")
        print(f"Actual Journal: {row['actual_journal']}")
        print("-" * 100)

# Now, you can call this function to print the details.
print_full_results(results_df)


Paper Text: representation of measurement error in marketing variables review of approaches and extension to threefacet designs abstract this paper explores approaches for modeling measurement error in marketing research including random method and measure specific sources of error the following approaches are considered classic confirmatory factor analysis secondorder models panel models additive traitmethod models correlated uniqueness models covariance components analysis additive traitmethodmeasure specificerror models and the direct product model where traits and methods interact finally a threefacet multiplicative model is addressed wherein latent variables underlying a phenomenon under investigation are shown to interact with multiple methods and occasions of measurement the threefacet model is illustrated on a study of consumer attitudes toward losing weight explicitly conducted for this paper
Predicted Journal: Journal of Econometrics 
Actual Journal: Journal of Econometrics 


In [1]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for each paper in the dataset
train_embeddings = model.encode(train_data['text'].tolist(), show_progress_bar=True)

NameError: name 'train_data' is not defined

In [74]:
from sklearn.cluster import KMeans

# Assuming you want to create a cluster for each journal
n_clusters = train_data['Journal_Name'].nunique()
kmeans = KMeans(n_clusters=n_clusters, random_state=0)

# Fit k-means with the embeddings
kmeans.fit(train_embeddings)

  super()._check_params_vs_input(X, default_n_init=10)


In [88]:
def recommend_journal_cluster(new_text, model, kmeans, train_data):
    # Generate the embedding for the new text (should be float32 by default)
    new_embedding = model.encode([new_text])[0]


    # Find the nearest cluster
    cluster = kmeans.predict(new_embedding.reshape(1, -1))[0]

    # Recommend the most common journal in the nearest cluster
    recommended_journal = train_data[train_data['cluster'] == cluster]['Journal_Name'].mode()[0]
    return recommended_journal


# Assign cluster labels to your training data for later use
train_data['cluster'] = kmeans.labels_

# Example usage
new_paper_text = "An in-depth analysis of financial trends and their implications on global markets."
recommended_journal = recommend_journal_cluster(new_paper_text, model, kmeans, train_data)
print(f"The recommended journal for the new paper is: {recommended_journal}")

The recommended journal for the new paper is: Journal of International Money and Finance 


In [91]:
# Make sure to assign cluster labels to your train_data if you haven't done so
train_data['cluster'] = kmeans.labels_

# Initialize a list to store the results
results = []

correct_predictions = 0
total_predictions = len(test_data)

for index, row in tqdm(test_data.iterrows(), total=total_predictions, desc="Evaluating"):
    actual_journal = row['Journal_Name']
    new_paper_text = row['text']
    # Use the recommend_journal_cluster function for prediction
    predicted_journal = recommend_journal_cluster(new_paper_text, model, kmeans, train_data)

    # Store the results
    results.append({
        'text': new_paper_text,
        'predicted_journal': predicted_journal,
        'actual_journal': actual_journal
    })

    # Increment correct predictions if the prediction matches the actual journal
    if predicted_journal == actual_journal:
        correct_predictions += 1

# Calculate the accuracy
accuracy = correct_predictions / total_predictions
print(f"Accuracy: {accuracy}")

# Convert the results list to a DataFrame for easier analysis
results_df = pd.DataFrame(results)

Evaluating:   0%|          | 0/117 [00:00<?, ?it/s]

Accuracy: 0.2905982905982906


In [92]:
print_full_results(results_df)

Paper Text: distributional assumptions and a test of the dual labor market hypothesis recent application of the switching regression model to allocate workers into the primary and secondary labor markets is considered to be the best solution to the classification problem of the empirical testing of the dual labor market theory in such models normality of the error terms is assumed this paper adopts the switching regression model to test the dual labor market theory by assuming different distributions of the error terms the test results strongly support the dual labor market theory regardless of the assumption one makes about the error terms however the results indicate that different distribution can lead to different percentage distributions of workers in the two segments in particular the normal distribution generates more workers in the primary segment than the nonnormal distributions therefore care must be taken not to generalize the type of industries or occupations that fall unde