In [1]:
import pandas as pd
# Load the combined submissions dataset from CSV into a DataFrame.
# NOTE(review): the path is absolute, so this cell only runs where /work/final exists.
combined_df = pd.read_csv('/work/final/final_combined_data.csv')

In [3]:
# Replace missing identity fields with a column-specific placeholder.
# A cell counts as missing when it is a real NaN/None or when its text form
# is 'nan' in any letter case.
for col_name, retracted_label in (
    ('Name_Category', "Retracted Name"),
    ('Institution', "Retracted Institution"),
):
    column_values = combined_df[col_name]
    looks_like_nan = column_values.astype(str).str.lower().eq('nan')
    combined_df.loc[looks_like_nan | column_values.isna(), col_name] = retracted_label


In [5]:
# Show the distinct values of both label columns together with the Python type
# of every value (useful for spotting non-string entries such as float NaN).
name_category_values = combined_df['Name_Category'].unique()
print("Name_Category unique values:")
print(name_category_values)
print("\nName_Category value types:")
for label in name_category_values:
    print(f"{label}: {type(label)}")

institution_values = combined_df['Institution'].unique()
print("\nInstitution unique values:")
print(institution_values)
print("\nInstitution value types:")
for label in institution_values:
    print(f"{label}: {type(label)}")

Name_Category unique values:
['Top' 'Bottom' 'Random' 'Retracted Name']

Name_Category value types:
Top: <class 'str'>
Bottom: <class 'str'>
Random: <class 'str'>
Retracted Name: <class 'str'>

Institution unique values:
['Massachusetts Institute of Technology;' 'Harvard University;'
 'University of Warwick;'
 'London School of Economics and Political Science;'
 'University of Tokyo;' 'University of Cape Town;'
 'Nanyang Technological University;' 'Chulalongkorn University;'
 'Universiti Malaya;' 'Retracted Institution']

Institution value types:
Massachusetts Institute of Technology;: <class 'str'>
Harvard University;: <class 'str'>
University of Warwick;: <class 'str'>
London School of Economics and Political Science;: <class 'str'>
University of Tokyo;: <class 'str'>
University of Cape Town;: <class 'str'>
Nanyang Technological University;: <class 'str'>
Chulalongkorn University;: <class 'str'>
Universiti Malaya;: <class 'str'>
Retracted Institution: <class 'str'>


In [7]:
# Cells holding the literal text "None" are swapped for the same placeholders
# used for missing values; every other cell is left as it is.
for col_name, retracted_label in {
    'Name_Category': "Retracted Name",
    'Institution': "Retracted Institution",
}.items():
    combined_df[col_name] = combined_df[col_name].replace(to_replace="None", value=retracted_label)

In [9]:
# Re-inspect the distinct Name_Category labels after the replacements above.
combined_df['Name_Category'].unique()

array(['Top', 'Bottom', 'Random', 'Retracted Name'], dtype=object)

In [13]:
import pandas as pd
import re

# Researcher lookup tables.
# Each dictionary maps a full name to a (gender, category label) tuple.
# Within every table the male names come first, followed by the female names;
# that insertion order is the order in which the names are later iterated.
top_researchers = {
    **dict.fromkeys(
        (
            'Andrei Shleifer',
            'Daron Acemoglu',
            'James J. Heckman',
            'Joseph E. Stiglitz',
            'John List',
        ),
        ('male', 'top-male-researcher'),
    ),
    **dict.fromkeys(
        (
            'Carmen M. Reinhart',
            'Janet Currie',
            'Esther Duflo',
            'Asli Demirguc-Kunt',
            'Marianne Bertrand',
        ),
        ('female', 'top-female-researcher'),
    ),
}

bottom_researchers = {
    **dict.fromkeys(
        (
            'Harold Huibing Zhang',
            'Lin Zhou',
            'Andrei Zlate',
            'Ulf Zoelitz',
            'Asaf Zussman',
        ),
        ('male', 'bottom-male-researcher'),
    ),
    **dict.fromkeys(
        (
            'Lu Yang',
            'Anzelika Zaiceva',
            'Aleksandra Zdzienicka',
            'Qiankun Zhou',
            'Vera Zipperer',
        ),
        ('female', 'bottom-female-researcher'),
    ),
}

random_names = {
    **dict.fromkeys(
        (
            'Bruce S. Green',
            'Alejandro L. James',
            'Billie J. Abels',
            'Paul A. Jenkins',
            'Gary L. Bodie',
        ),
        ('male', 'random-male-researcher'),
    ),
    **dict.fromkeys(
        (
            'Gail J. Doan',
            'Shirley S. Hodgkins',
            'Pattie K. Reinhardt',
            'Tara R. Weber',
            'Tabitha J. Cox',
        ),
        ('female', 'random-female-researcher'),
    ),
}

def extract_author(text):
    """
    Extract the author name from a submission's text.

    Searches for the first ``Author: <name>;`` field and returns ``<name>``
    with surrounding whitespace removed.

    Parameters:
    text: The submission text. Any value that is not a ``str`` (``None``,
        ``NaN``, numbers, ...) is treated as having no author.

    Returns:
    str or None: The stripped author name, or ``None`` when ``text`` is not a
        string or contains no semicolon-terminated ``Author:`` field.
    """
    # Guard on the type instead of only on missing values: a stray numeric
    # cell would otherwise make re.search raise TypeError.
    if not isinstance(text, str):
        return None
    match = re.search(r'Author:\s*([^;]+);', text)
    return match.group(1).strip() if match else None

def process_dataframe(df):
    """
    Add author, gender and researcher-category columns to a copy of ``df``.

    The author name is extracted from each row's 'Submission' text and compared
    against the names in ``top_researchers``, ``bottom_researchers`` and
    ``random_names``. Matching is a case-insensitive *substring* test, and the
    names are applied in dictionary order, so if one author string contained
    several listed names the last one applied would win. Rows without a match
    keep the placeholder 'Retracted Name' in both new label columns.

    Parameters:
    df (pandas.DataFrame): DataFrame with a 'Submission' column. It is not
        modified; all changes are made on a copy.

    Returns:
    tuple: (processed DataFrame, statistics dictionary). The processed frame
        gains the columns 'Author', 'gender' and 'gender_name_category'. The
        statistics dictionary has the keys 'total_researchers' (row count),
        'matched_researchers' (rows whose gender is not the placeholder),
        'gender_distribution' and 'category_distribution' (value counts as
        plain dicts).

    Raises:
    KeyError: If ``df`` has no 'Submission' column.
    """
    if 'Submission' not in df.columns:
        raise KeyError("process_dataframe requires a 'Submission' column")

    # Work on a copy so the caller's DataFrame is left untouched.
    processed_df = df.copy()

    # Extract author names from Submission field
    print("Extracting author names...")
    processed_df['Author'] = processed_df['Submission'].apply(extract_author)

    # Single lookup table covering all three name lists.
    all_researchers = {**top_researchers, **bottom_researchers, **random_names}

    # Every row starts as "Retracted Name" and is overwritten only on a match.
    processed_df['gender'] = 'Retracted Name'
    processed_df['gender_name_category'] = 'Retracted Name'

    # Update gender and category for matching names
    print("\nMatching names...")
    for name, (gender, category) in all_researchers.items():
        # regex=False: names contain '.' and '-' which must be taken literally.
        mask = processed_df['Author'].str.contains(name, case=False, na=False, regex=False)
        # Vectorised count; the built-in sum() would loop over every row in Python.
        matches = int(mask.sum())
        if matches > 0:
            processed_df.loc[mask, 'gender'] = gender
            processed_df.loc[mask, 'gender_name_category'] = category
            print(f"Found {matches} matches for {name}")

    # Count matched rows from a boolean mask instead of building a filtered frame.
    is_matched = processed_df['gender'] != 'Retracted Name'
    stats = {
        'total_researchers': len(processed_df),
        'matched_researchers': int(is_matched.sum()),
        'gender_distribution': processed_df['gender'].value_counts().to_dict(),
        'category_distribution': processed_df['gender_name_category'].value_counts().to_dict()
    }

    return processed_df, stats

# Usage:
# processed_df, stats = process_dataframe(combined_df)
#
# # Print statistics
# print("\nProcessing Statistics:")
# print(f"Total researchers: {stats['total_researchers']}")
# print(f"Matched researchers: {stats['matched_researchers']}")
# 
# print("\nGender Distribution:")
# for gender, count in stats['gender_distribution'].items():
#     print(f"{gender}: {count}")
#     
# print("\nCategory Distribution:")
# for category, count in stats['category_distribution'].items():
#     print(f"{category}: {count}")
#
# # Print sample of matched rows
# matched_rows = processed_df[processed_df['gender'] != 'Retracted Name']
# if not matched_rows.empty:
#     print(matched_rows[['Author', 'gender', 'gender_name_category']].head())

In [15]:
# Run the enrichment step and keep both the processed frame and its statistics.
# NOTE(review): combined_df is rebound to the returned copy, so re-running this
# cell feeds the already-processed frame back into process_dataframe.
combined_df, stats = process_dataframe(combined_df)

Extracting author names...

Matching names...
Found 300 matches for Andrei Shleifer
Found 300 matches for Daron Acemoglu
Found 300 matches for James J. Heckman
Found 300 matches for Joseph E. Stiglitz
Found 300 matches for John List
Found 300 matches for Carmen M. Reinhart
Found 300 matches for Janet Currie
Found 300 matches for Esther Duflo
Found 300 matches for Asli Demirguc-Kunt
Found 300 matches for Marianne Bertrand
Found 300 matches for Harold Huibing Zhang
Found 300 matches for Lin Zhou
Found 300 matches for Andrei Zlate
Found 300 matches for Ulf Zoelitz
Found 300 matches for Asaf Zussman
Found 300 matches for Lu Yang
Found 300 matches for Anzelika Zaiceva
Found 300 matches for Aleksandra Zdzienicka
Found 300 matches for Qiankun Zhou
Found 300 matches for Vera Zipperer
Found 300 matches for Bruce S. Green
Found 300 matches for Alejandro L. James
Found 300 matches for Billie J. Abels
Found 300 matches for Paul A. Jenkins
Found 300 matches for Gary L. Bodie
Found 300 matches for G

In [17]:
# Display the processed frame (the notebook renders a truncated preview of the first and last rows).
combined_df

Unnamed: 0,Submission_id,Paper_id,Submission,Original_Publication,Name_Category,Institution,top5_desk_1,top5_desk_2,top5_desk_3,top5_desk_mean,...,grant_mean,citation_impact_1,citation_impact_2,citation_impact_3,citation_impact_mean,Author,gender,gender_name_category,pub_quality,pub_quality_numeric
0,S_000001,1,A submission with the following details: Title...,Journal of Political Economy,Top,Massachusetts Institute of Technology;,10.0,9.0,10.0,9.67,...,8.67,55.0,150.0,78.0,94.33,Andrei Shleifer,male,top-male-researcher,High,1.0
1,S_000002,1,A submission with the following details: Title...,Journal of Political Economy,Top,Harvard University;,10.0,10.0,9.0,9.67,...,9.00,150.0,250.0,265.0,221.67,Andrei Shleifer,male,top-male-researcher,High,1.0
2,S_000003,1,A submission with the following details: Title...,Journal of Political Economy,Top,University of Warwick;,10.0,9.0,9.0,9.33,...,9.00,120.0,75.0,125.0,106.67,Andrei Shleifer,male,top-male-researcher,High,1.0
3,S_000004,1,A submission with the following details: Title...,Journal of Political Economy,Top,London School of Economics and Political Science;,9.0,9.0,10.0,9.33,...,9.33,85.0,100.0,120.0,101.67,Andrei Shleifer,male,top-male-researcher,High,1.0
4,S_000005,1,A submission with the following details: Title...,Journal of Political Economy,Top,University of Tokyo;,8.0,9.0,8.0,8.33,...,9.00,198.0,245.0,120.0,187.67,Andrei Shleifer,male,top-male-researcher,High,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9025,S_009026,25,A submission with the following details: Title...,Journal of Applied Economics and Business,Retracted Name,Retracted Institution,5.0,4.0,4.0,4.33,...,5.67,75.0,50.0,180.0,101.67,,Retracted Name,Retracted Name,Unknown,
9026,S_009027,26,A submission with the following details: Title...,Journal of Applied Economics and Business,Retracted Name,Retracted Institution,0.0,1.0,1.0,0.67,...,3.67,45.0,75.0,15.0,45.00,,Retracted Name,Retracted Name,Unknown,
9027,S_009028,30,A submission with the following details: Title...,GPT-o1,Retracted Name,Retracted Institution,8.0,9.0,8.0,8.33,...,8.00,75.0,250.0,200.0,175.00,,Retracted Name,Retracted Name,Fake AI Paper,0.0
9028,S_009029,29,A submission with the following details: Title...,GPT-o1,Retracted Name,Retracted Institution,8.0,9.0,9.0,8.67,...,8.67,75.0,150.0,85.0,103.33,,Retracted Name,Retracted Name,Fake AI Paper,0.0


In [23]:
def add_publication_quality(combined_df):
    """
    Label every row with the quality tier of its original publication.

    Two columns are added to ``combined_df`` itself (the frame is modified in
    place) and the same object is returned:

    * ``pub_quality`` - 'High', 'Medium', 'Low', 'Fake AI Paper', or 'Unknown'
      for any publication that is not listed below.
    * ``pub_quality_numeric`` - numeric code of the tier: High=1, Medium=0,
      Low=2, Fake AI Paper=0, Unknown=missing.

    Tiers:
    High quality: Journal of Political Economy, Econometrica, The Quarterly Journal of Economics
    Medium quality: Economica, Oxford Bulletin of Economics and Statistics, European Economic Review
    Low quality: Asian Economic and Financial Review (AEFR), Business and Economics Journal,
        Journal of Applied Economics and Business
    Fake AI Paper: GPT-o1

    A short summary (tier counts and the first rows) is printed as a side effect.
    """
    # Tier -> publications belonging to it.
    tier_members = {
        'Low': [
            'Asian Economic and Financial Review (AEFR)',
            'Business and Economics Journal',
            'Journal of Applied Economics and Business',
        ],
        'Medium': [
            'Economica',
            'Oxford Bulletin of Economics and Statistics',
            'European Economic Review',
        ],
        'High': [
            'Journal of Political Economy',
            'Econometrica',
            'The Quarterly Journal of Economics',
        ],
        'Fake AI Paper': [
            'GPT-o1',
        ],
    }

    # Inverted view (publication -> tier) so each row needs a single lookup.
    tier_by_publication = {
        publication: tier
        for tier, publications in tier_members.items()
        for publication in publications
    }

    # Anything outside the table falls back to 'Unknown'.
    combined_df['pub_quality'] = combined_df['Original_Publication'].apply(
        lambda publication: tier_by_publication.get(publication, 'Unknown')
    )

    # Numeric encoding of the tier.
    # NOTE(review): 'Medium' and 'Fake AI Paper' share the code 0 and 'Low' (2)
    # sorts above 'High' (1) - confirm this is what downstream analysis expects.
    quality_map = {'High': 1, 'Medium': 0, 'Low': 2, 'Fake AI Paper': 0, 'Unknown': None}
    combined_df['pub_quality_numeric'] = combined_df['pub_quality'].map(quality_map)

    # Quick textual summary of the result.
    print("\nPublication Quality Distribution:")
    print(combined_df['pub_quality'].value_counts())
    print("\nSample of results:")
    print(combined_df[['Original_Publication', 'pub_quality', 'pub_quality_numeric']].head())

    return combined_df

# Apply the function to combined_df (the frame is modified in place and the same object is returned)
combined_df = add_publication_quality(combined_df)


Publication Quality Distribution:
pub_quality
High             2709
Medium           2709
Low              2709
Fake AI Paper     903
Name: count, dtype: int64

Sample of results:
           Original_Publication pub_quality  pub_quality_numeric
0  Journal of Political Economy        High                  1.0
1  Journal of Political Economy        High                  1.0
2  Journal of Political Economy        High                  1.0
3  Journal of Political Economy        High                  1.0
4  Journal of Political Economy        High                  1.0


In [27]:
# Text label for each paper: the Paper_id rendered as a string with a 'Paper_' prefix.
combined_df['Paper_id_n'] = combined_df['Paper_id'].astype(str).radd('Paper_')


In [33]:
# Tally the number of rows per Original_Publication value.
print(combined_df['Original_Publication'].value_counts())

Original_Publication
Journal of Political Economy                   903
Econometrica                                   903
The Quarterly Journal of Economics             903
Economica                                      903
Oxford Bulletin of Economics and Statistics    903
European Economic Review                       903
Asian Economic and Financial Review (AEFR)     903
Business and Economics Journal                 903
Journal of Applied Economics and Business      903
GPT-o1                                         903
Name: count, dtype: int64


In [31]:
# Tally the number of rows per Paper_id_n label.
print(combined_df['Paper_id_n'].value_counts())

Paper_id_n
Paper_1     301
Paper_2     301
Paper_3     301
Paper_5     301
Paper_4     301
Paper_6     301
Paper_9     301
Paper_8     301
Paper_7     301
Paper_12    301
Paper_11    301
Paper_10    301
Paper_15    301
Paper_13    301
Paper_14    301
Paper_16    301
Paper_17    301
Paper_18    301
Paper_19    301
Paper_21    301
Paper_20    301
Paper_24    301
Paper_23    301
Paper_22    301
Paper_26    301
Paper_25    301
Paper_27    301
Paper_30    301
Paper_28    301
Paper_29    301
Name: count, dtype: int64


In [29]:
# Display the frame again, now including the pub_quality, pub_quality_numeric and Paper_id_n columns.
combined_df

Unnamed: 0,Submission_id,Paper_id,Submission,Original_Publication,Name_Category,Institution,top5_desk_1,top5_desk_2,top5_desk_3,top5_desk_mean,...,citation_impact_1,citation_impact_2,citation_impact_3,citation_impact_mean,Author,gender,gender_name_category,pub_quality,pub_quality_numeric,Paper_id_n
0,S_000001,1,A submission with the following details: Title...,Journal of Political Economy,Top,Massachusetts Institute of Technology;,10.0,9.0,10.0,9.67,...,55.0,150.0,78.0,94.33,Andrei Shleifer,male,top-male-researcher,High,1.0,Paper_1
1,S_000002,1,A submission with the following details: Title...,Journal of Political Economy,Top,Harvard University;,10.0,10.0,9.0,9.67,...,150.0,250.0,265.0,221.67,Andrei Shleifer,male,top-male-researcher,High,1.0,Paper_1
2,S_000003,1,A submission with the following details: Title...,Journal of Political Economy,Top,University of Warwick;,10.0,9.0,9.0,9.33,...,120.0,75.0,125.0,106.67,Andrei Shleifer,male,top-male-researcher,High,1.0,Paper_1
3,S_000004,1,A submission with the following details: Title...,Journal of Political Economy,Top,London School of Economics and Political Science;,9.0,9.0,10.0,9.33,...,85.0,100.0,120.0,101.67,Andrei Shleifer,male,top-male-researcher,High,1.0,Paper_1
4,S_000005,1,A submission with the following details: Title...,Journal of Political Economy,Top,University of Tokyo;,8.0,9.0,8.0,8.33,...,198.0,245.0,120.0,187.67,Andrei Shleifer,male,top-male-researcher,High,1.0,Paper_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9025,S_009026,25,A submission with the following details: Title...,Journal of Applied Economics and Business,Retracted Name,Retracted Institution,5.0,4.0,4.0,4.33,...,75.0,50.0,180.0,101.67,,Retracted Name,Retracted Name,Low,2.0,Paper_25
9026,S_009027,26,A submission with the following details: Title...,Journal of Applied Economics and Business,Retracted Name,Retracted Institution,0.0,1.0,1.0,0.67,...,45.0,75.0,15.0,45.00,,Retracted Name,Retracted Name,Low,2.0,Paper_26
9027,S_009028,30,A submission with the following details: Title...,GPT-o1,Retracted Name,Retracted Institution,8.0,9.0,8.0,8.33,...,75.0,250.0,200.0,175.00,,Retracted Name,Retracted Name,Fake AI Paper,0.0,Paper_30
9028,S_009029,29,A submission with the following details: Title...,GPT-o1,Retracted Name,Retracted Institution,8.0,9.0,9.0,8.67,...,75.0,150.0,85.0,103.33,,Retracted Name,Retracted Name,Fake AI Paper,0.0,Paper_29


# Final export: save the processed dataset

In [35]:
# Write the full processed frame to CSV; index=False leaves the row index out of the file.
# NOTE(review): the output path is absolute, so this cell only runs where /work/final exists.
combined_df.to_csv('/work/final/final_final_combined_data.csv', index=False)

In [37]:
# Build a second frame without the 'Submission' column; combined_df itself is left unchanged.
combined_df_drop = combined_df.drop(columns=['Submission'])

# Write that reduced frame to its own CSV file, again without the row index.
combined_df_drop.to_csv(
    '/work/final/final_final_combined_data_drop_submission.csv',
    index=False,
)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=a25c250f-64bb-477e-a263-2c8cc56f7dca' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>