In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from pathlib import Path
import glob
import re
import numpy as np
import json

In [None]:
# Load the abstracts dataset from the working directory.
# index_col=[0] makes the first CSV column the row index instead of a data column.
df = pd.read_csv('economic_journals_abstracts_df.csv', index_col=[0])
def clean_text(text):
    """Normalise one raw text cell for vectorisation.

    Punctuation is removed and the result is lowercased. Anything that is not
    a word character (letter, digit, underscore) or whitespace is dropped, so
    hyphenated words are fused ("risk-return" -> "riskreturn").

    Parameters
    ----------
    text : object
        Raw cell value. Non-string values are converted with ``str``.
        Missing values (``None`` or a float NaN) are treated as empty text.

    Returns
    -------
    str
        The cleaned text; ``''`` for a missing value.
    """
    # A missing cell must not become the literal token 'nan' / 'none',
    # which would otherwise be counted as a real word downstream.
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Convert text to lowercase
    return text.lower()

# Run the same cleaner over both free-text columns
for column in ('Title', 'Abstract'):
    df[column] = df[column].map(clean_text)

# One document per article: cleaned title, a single space, then the cleaned abstract
df['text'] = df['Title'].str.cat(df['Abstract'], sep=' ')

In [None]:
# Sanity check after cleaning: print the (rows, columns) shape,
# then show the first five rows as the cell's rich output.
print(df.shape)
df.head()

(148746, 9)


Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Journal_Website,Journal_Name,Volume_Issue,Title,Authors,Abstract,text
0,0.0,0.0,Elsevier,Journal of Empirical Finance,"Volume 75, Issue 1",expensive anomalies,"Deniz Anginer a, Sugata Ray b, H. Nejat Seyhun...",anomalies have higher returns when they are c...,expensive anomalies anomalies have higher ret...
1,1.0,1.0,Elsevier,Journal of Empirical Finance,"Volume 75, Issue 1",climate change concerns and mortgage lending,"Tinghua Duan a, Frank Weikai Li b",abnormally high local temperature leads to el...,climate change concerns and mortgage lending ...
2,2.0,2.0,Elsevier,Journal of Empirical Finance,"Volume 75, Issue 1",technological disparity and its impact on mark...,"Kiseo Chung a, Seoyoung Kim b",we document substantial technological dispari...,technological disparity and its impact on mark...
3,3.0,3.0,Elsevier,Journal of Empirical Finance,"Volume 75, Issue 1",the effect of investor attention on stock pric...,"Ting-Hsuan Chen, Kai-Sheng Chen",stock crash concerns the study addresses the ...,the effect of investor attention on stock pric...
4,4.0,4.0,Elsevier,Journal of Empirical Finance,"Volume 75, Issue 1",tail risks and private equity performance,"Hrvoje Kurtović, Garen Markarian",we explore key determinants of private equity...,tail risks and private equity performance we ...


In [None]:
# Calculates IDF using each individual journal

# One vectorizer object, re-fitted from scratch on every journal's documents,
# so the IDF weights are always relative to that journal alone
tfidf = TfidfVectorizer(stop_words='english', max_features=10000)

# journal name -> [(term, summed TF-IDF weight), ...], highest weight first
tfidf_results = {}

for journal in df['Journal_Name'].unique():
    # Rows belonging to this journal only
    journal_data = df.loc[df['Journal_Name'] == journal]

    # Learn this journal's vocabulary and vectorise its documents
    tfidf_matrix = tfidf.fit_transform(journal_data['text'])
    features = tfidf.get_feature_names_out()

    # Column totals: one summed weight per vocabulary term
    sums = tfidf_matrix.sum(axis=0)

    # Pair every term with its total, then rank from most to least important
    scores = list(zip(features, sums.A1))
    scores.sort(key=lambda pair: pair[1], reverse=True)
    tfidf_results[journal] = scores

# `tfidf_results` now holds the full ranked term list of every journal.
# Show the ten strongest terms of the first journal as an example:
first_journal = list(tfidf_results)[0]
print(f"Top terms in {first_journal}:")
for term, score in tfidf_results[first_journal][:10]:
    print(f"{term} ({score})")

Top terms in Journal of Empirical Finance :
risk (29.227100659785975)
model (28.578033438034034)
volatility (27.131185865167712)
returns (25.104975436746265)
market (24.14117105261009)
stock (21.97087365451767)
firms (20.957727000254856)
models (16.767135878911272)
trading (16.332257502777022)
information (15.261786683945044)


In [None]:
# Vectorizer that is re-fitted per journal (journal-local IDF)
tfidf = TfidfVectorizer(stop_words='english', max_features=10000)

# journal name -> its 20 highest-scoring (term, score) pairs
tfidf_results = {}

for journal in df['Journal_Name'].unique():
    # Documents published in the current journal
    journal_data = df.loc[df['Journal_Name'] == journal]

    # Fit on this journal only and vectorise its documents
    tfidf_matrix = tfidf.fit_transform(journal_data['text'])
    features = tfidf.get_feature_names_out()

    # Total TF-IDF weight of each term across the journal's documents
    sums = tfidf_matrix.sum(axis=0)

    # Rank (term, total) pairs from highest to lowest total
    sorted_scores = sorted(
        zip(features, sums.A1),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Keep only the 20 strongest terms
    tfidf_results[journal] = sorted_scores[:20]

# Persist the per-journal top terms as JSON
with open('tfidf_results.json', 'w') as json_file:
    json.dump(tfidf_results, json_file)

In [None]:
# Vectorizer that is re-fitted per journal (journal-local IDF)
tfidf = TfidfVectorizer(stop_words='english', max_features=10000)

# One wide record per journal: Journal, Term_1..Term_k, Score_1..Score_k (k <= 20)
all_tfidf_results = []

for journal in df['Journal_Name'].unique():
    # Documents published in the current journal
    journal_data = df.loc[df['Journal_Name'] == journal]

    # Fit on this journal only and vectorise its documents
    tfidf_matrix = tfidf.fit_transform(journal_data['text'])
    features = tfidf.get_feature_names_out()

    # Total TF-IDF weight of each term across the journal's documents
    sums = tfidf_matrix.sum(axis=0)

    # Rank (term, total) pairs from highest to lowest total and keep the top 20
    sorted_scores = sorted(
        zip(features, sums.A1),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top_terms = sorted_scores[:20]

    # Build the record: all term columns first, then all score columns,
    # so the resulting CSV column order is Journal, Term_*, Score_*
    journal_dict = {'Journal': journal}
    for i, (term, score) in enumerate(top_terms):
        journal_dict[f'Term_{i+1}'] = term
    for i, (term, score) in enumerate(top_terms):
        journal_dict[f'Score_{i+1}'] = score

    all_tfidf_results.append(journal_dict)

# Assemble every record into a single table
tfidf_df = pd.DataFrame(all_tfidf_results)

# Destination of the exported table
csv_file_path = 'tfidf_individual_journal_for_idf.csv'

# Write it out without the positional index column
tfidf_df.to_csv(csv_file_path, index=False)

In [None]:
# Calculates IDF using ALL journals: a single vectorizer is fitted on the whole
# corpus, so document frequencies are shared across journals.

# Initialize the TF-IDF Vectorizer (English stop words removed, at most 10,000 terms kept)
tfidf = TfidfVectorizer(stop_words='english', max_features=10000)

# Fit and transform the data for the entire dataset.
# Row i of tfidf_matrix is the i-th document of df['text'] *by position*.
tfidf_matrix = tfidf.fit_transform(df['text'])
# Vocabulary terms, in the same order as the columns of tfidf_matrix
features = tfidf.get_feature_names_out()

# Function to find distinct terms for each journal
def find_distinct_terms(journal, data=None, matrix=None, feature_names=None, top_n=None):
    """Rank vocabulary terms by their mean TF-IDF weight within one journal.

    Parameters
    ----------
    journal : str
        Value of the ``Journal_Name`` column selecting the journal's rows.
    data : pandas.DataFrame, optional
        Frame with a ``Journal_Name`` column. Defaults to the notebook-level ``df``.
    matrix : sparse matrix, optional
        TF-IDF matrix whose i-th row belongs to the i-th row (by position) of
        ``data``. Defaults to the notebook-level ``tfidf_matrix``.
    feature_names : sequence of str, optional
        Term for each matrix column. Defaults to the notebook-level ``features``.
    top_n : int or None, optional
        Keep only the ``top_n`` best terms; ``None`` (default) keeps all of them.

    Returns
    -------
    list of (term, mean_score) tuples
        Sorted by mean score, highest first. Empty if the journal has no rows.
    """
    # Fall back to the notebook globals so one-argument calls keep working.
    data = df if data is None else data
    matrix = tfidf_matrix if matrix is None else matrix
    feature_names = features if feature_names is None else feature_names

    # The matrix is addressed by row *position*. The DataFrame index holds
    # *labels*, which need not be 0..n-1, so build positions from a boolean mask.
    row_positions = np.flatnonzero((data['Journal_Name'] == journal).to_numpy())
    if row_positions.size == 0:
        # No documents: averaging would divide by zero and yield NaN scores.
        return []

    # Extract the TF-IDF rows of this journal
    journal_tfidf = matrix[row_positions]

    # Average weight per term, flattened to a plain 1-D array
    avg_scores = np.asarray(journal_tfidf.mean(axis=0)).ravel()

    # Pair terms with their average score and sort descending
    term_scores = list(zip(feature_names, avg_scores))
    sorted_terms = sorted(term_scores, key=lambda pair: pair[1], reverse=True)

    return sorted_terms[:top_n]

# Rank the terms of every journal: journal name -> ranked (term, score) list
distinct_terms = {
    journal: find_distinct_terms(journal)
    for journal in df['Journal_Name'].unique()
}

# Show the ten best-ranked terms of the first journal as an example
first_journal = list(distinct_terms)[0]
print(f"Top terms in {first_journal}:")
for term, score in distinct_terms[first_journal][:10]:
    print(f"{term} ({score})")

In [None]:
# Initialize the TF-IDF Vectorizer (English stop words removed, at most 10,000 terms kept)
tfidf = TfidfVectorizer(stop_words='english', max_features=10000)

# Fit and transform the data for the entire dataset, so IDF is computed over all journals.
# Row i of tfidf_matrix is the i-th document of df['text'] *by position*.
tfidf_matrix = tfidf.fit_transform(df['text'])
# Vocabulary terms, in the same order as the columns of tfidf_matrix
features = tfidf.get_feature_names_out()

# Function to find distinct terms for each journal
def find_distinct_terms(journal, data=None, matrix=None, feature_names=None, top_n=20):
    """Return the highest-ranked terms of one journal by mean TF-IDF weight.

    Parameters
    ----------
    journal : str
        Value of the ``Journal_Name`` column selecting the journal's rows.
    data : pandas.DataFrame, optional
        Frame with a ``Journal_Name`` column. Defaults to the notebook-level ``df``.
    matrix : sparse matrix, optional
        TF-IDF matrix whose i-th row belongs to the i-th row (by position) of
        ``data``. Defaults to the notebook-level ``tfidf_matrix``.
    feature_names : sequence of str, optional
        Term for each matrix column. Defaults to the notebook-level ``features``.
    top_n : int or None, optional
        Number of terms to keep (default 20); ``None`` keeps all of them.

    Returns
    -------
    list of (term, mean_score) tuples
        Sorted by mean score, highest first, truncated to ``top_n`` entries.
        Empty if the journal has no rows.
    """
    # Fall back to the notebook globals so one-argument calls keep working.
    data = df if data is None else data
    matrix = tfidf_matrix if matrix is None else matrix
    feature_names = features if feature_names is None else feature_names

    # The matrix is addressed by row *position*. The DataFrame index holds
    # *labels*, which need not be 0..n-1, so build positions from a boolean mask.
    row_positions = np.flatnonzero((data['Journal_Name'] == journal).to_numpy())
    if row_positions.size == 0:
        # No documents: averaging would divide by zero and yield NaN scores.
        return []

    # Extract the TF-IDF rows of this journal
    journal_tfidf = matrix[row_positions]

    # Average weight per term, flattened to a plain 1-D array
    avg_scores = np.asarray(journal_tfidf.mean(axis=0)).ravel()

    # Pair terms with their average score and sort descending
    term_scores = list(zip(feature_names, avg_scores))
    sorted_terms = sorted(term_scores, key=lambda pair: pair[1], reverse=True)

    return sorted_terms[:top_n]  # Return the top terms (20 by default)

# Collect the top 20 terms of every journal: journal name -> (term, score) list
distinct_terms = {
    journal: find_distinct_terms(journal)
    for journal in df['Journal_Name'].unique()
}

# Persist the corpus-wide-IDF results as JSON
with open('tfidf_results_fullidf.json', 'w') as json_file:
    json.dump(distinct_terms, json_file)