In [1]:
import pandas as pd
import numpy as np

def read_csv_file(file_path):
    """
    Load the CSV at ``file_path`` into a pandas DataFrame.

    This is a best-effort loader: any exception raised while reading is
    reported on stdout and ``None`` is returned instead of the error being
    propagated, so callers must check the result for ``None``.
    """
    frame = None
    try:
        frame = pd.read_csv(file_path)
    except Exception as err:
        # Report the failure and fall through with frame still None.
        print(f"Error reading file {file_path}: {err}")
    return frame

def get_impact_columns(df):
    """
    Return the names of the score columns in ``df``.

    A score column is any column whose name ends with ``_1``, ``_2``,
    ``_3`` or ``_mean`` (for example ``citation_impact_1`` or
    ``grant_mean``). The match is on the bare suffix, not on the word
    "impact". Names are returned in the order they appear in the frame.

    Column labels that are not strings (e.g. integer labels) cannot carry
    one of these suffixes and are skipped rather than raising.
    """
    # str.endswith accepts a tuple and returns True if any suffix matches.
    suffixes = ('_1', '_2', '_3', '_mean')
    return [
        col for col in df.columns
        if isinstance(col, str) and col.endswith(suffixes)
    ]

def get_citation_stats(df, print_ids=False):
    """
    Report missing values in the score columns of ``df``.

    The score columns are those selected by ``get_impact_columns``. When at
    least one row has a missing score, the number of such rows and the
    per-column missing counts are printed; with ``print_ids=True`` the
    ``Submission_id`` values of those rows are printed as well. Otherwise a
    single "nothing missing" line is printed.

    Returns the sub-frame of rows that have at least one missing score
    (empty when nothing is missing).
    """
    score_cols = get_impact_columns(df)
    null_flags = df[score_cols].isnull()

    # Rows where at least one score column is missing
    incomplete_rows = df[null_flags.any(axis=1)]

    if incomplete_rows.empty:
        print("No empty impact fields found.")
        return incomplete_rows

    print(f"Total rows with missing impact values: {len(incomplete_rows)}")

    # Per-column tally of missing cells
    print("\nMissing values per column:")
    for col_name, n_missing in null_flags.sum().items():
        print(f"{col_name}: {n_missing}")

    # The ID listing is opt-in because it can be long
    if print_ids:
        print("\nSubmission IDs with missing values:")
        print(incomplete_rows['Submission_id'].tolist())

    return incomplete_rows

def combine_dataframes(base_df, supplement_df):
    """
    Fill missing score values in ``base_df`` from ``supplement_df``.

    Rows are matched on ``Submission_id``. For each score column (as
    selected by ``get_impact_columns`` on the base data) that is also
    present in ``supplement_df``, a missing cell in the base data is
    replaced by the supplement's value when that value is itself not
    missing. Cells that already hold a value are never overwritten, even
    when several base rows share the same ``Submission_id``.

    Side effects: prints missing-value statistics before the merge, the
    number of cells filled, and the remaining missing values (with IDs)
    after the merge.

    Returns a new DataFrame; ``base_df`` itself is not modified.
    """
    combined_df = base_df.copy()
    impact_cols = get_impact_columns(combined_df)

    # A column the supplement does not provide cannot be used as a source;
    # leave it untouched (with a notice) instead of failing with a KeyError
    # part-way through the merge.
    unavailable = [col for col in impact_cols if col not in supplement_df.columns]
    if unavailable:
        print(f"\nColumns not present in supplement (left unchanged): {unavailable}")
    fillable_cols = [col for col in impact_cols if col in supplement_df.columns]

    # Print statistics before combining
    print("\nStatistics before combining:")
    get_citation_stats(combined_df, print_ids=False)

    # Combine data
    filled_count = 0
    for _, row in supplement_df.iterrows():
        id_mask = combined_df['Submission_id'] == row['Submission_id']
        if not id_mask.any():
            continue

        for column in fillable_cols:
            value = row[column]
            if pd.isnull(value):
                continue

            # Write only to cells that are actually missing, so existing
            # values in other rows with the same ID are preserved.
            target = id_mask & combined_df[column].isnull()
            cells_to_fill = int(target.sum())
            if cells_to_fill:
                combined_df.loc[target, column] = value
                # Count cells, not assignments, to match the message below.
                filled_count += cells_to_fill

    print(f"\nTotal impact cells filled: {filled_count}")

    # Print only IDs after combining
    print("\nRemaining Submission IDs with missing values after combining:")
    get_citation_stats(combined_df, print_ids=True)

    return combined_df

def process_citation_data(base_file, supplement_file, output_file=None):
    """
    Load a base CSV and a supplement CSV, merge them, and optionally save.

    Both files are read with ``read_csv_file``; if either read fails the
    function returns ``None`` without merging. Otherwise the frames are
    merged with ``combine_dataframes`` and, when ``output_file`` is truthy,
    the result is written there as CSV without the index.

    Returns the combined DataFrame, or ``None`` if a file could not be read.
    """
    # Both reads are always attempted, base first, so each failure is reported.
    base_df, supplement_df = read_csv_file(base_file), read_csv_file(supplement_file)

    if any(frame is None for frame in (base_df, supplement_df)):
        return None

    combined_df = combine_dataframes(base_df, supplement_df)

    if not output_file:
        return combined_df

    combined_df.to_csv(output_file, index=False)
    print(f"\nSaved combined data to {output_file}")
    return combined_df

In [2]:
# Fill gaps in the citation_impact results from the supplement file and
# save the combined CSV.
if __name__ == "__main__":
    base_file, supplement_file, output_file = (
        '/work/output/output_citation_impact.csv',
        '/work/output/output_citation_impact_resume1_final.csv',
        '/work/process/combined_citation_impact.csv',
    )

    combined_df = process_citation_data(base_file, supplement_file, output_file)


Statistics before combining:
Total rows with missing impact values: 3784

Missing values per column:
citation_impact_1: 3783
citation_impact_2: 3783
citation_impact_3: 3784
citation_impact_mean: 3783

Total impact cells filled: 15132

Remaining Submission IDs with missing values after combining:
Total rows with missing impact values: 1

Missing values per column:
citation_impact_1: 0
citation_impact_2: 1
citation_impact_3: 0
citation_impact_mean: 0

Submission IDs with missing values:
['S_008892']

Saved combined data to /work/process/combined_citation_impact.csv


In [3]:
# Fill gaps in the grant results from the supplement file and save the
# combined CSV.
if __name__ == "__main__":
    base_file, supplement_file, output_file = (
        '/work/output/output_grant.csv',
        '/work/output/output_grant_resume1_final.csv',
        '/work/process/combined_grant.csv',
    )

    combined_df = process_citation_data(base_file, supplement_file, output_file)


Statistics before combining:
Total rows with missing impact values: 3921

Missing values per column:
grant_1: 3920
grant_2: 3920
grant_3: 3921
grant_mean: 3920

Total impact cells filled: 15680

Remaining Submission IDs with missing values after combining:
Total rows with missing impact values: 1

Missing values per column:
grant_1: 0
grant_2: 0
grant_3: 1
grant_mean: 0

Submission IDs with missing values:
['S_004716']

Saved combined data to /work/process/combined_grant.csv


In [4]:
# Fill gaps in the nobel_potential results from the supplement file and
# save the combined CSV.
if __name__ == "__main__":
    base_file, supplement_file, output_file = (
        '/work/output/output_nobel_potential.csv',
        '/work/output/output_nobel_potential_resume1_final_2.csv',
        '/work/process/combined_nobel_potential.csv',
    )

    combined_df = process_citation_data(base_file, supplement_file, output_file)


Statistics before combining:
Total rows with missing impact values: 4175

Missing values per column:
nobel_potential_1: 4173
nobel_potential_2: 4173
nobel_potential_3: 4175
nobel_potential_mean: 4173

Total impact cells filled: 16693

Remaining Submission IDs with missing values after combining:
Total rows with missing impact values: 1

Missing values per column:
nobel_potential_1: 0
nobel_potential_2: 0
nobel_potential_3: 1
nobel_potential_mean: 0

Submission IDs with missing values:
['S_002346']

Saved combined data to /work/process/combined_nobel_potential.csv


In [5]:
# Fill gaps in the research_award results from the supplement file and
# save the combined CSV.
if __name__ == "__main__":
    base_file, supplement_file, output_file = (
        '/work/output/output_research_award.csv',
        '/work/output/output_research_award_resume1_final.csv',
        '/work/process/combined_research_award.csv',
    )

    combined_df = process_citation_data(base_file, supplement_file, output_file)


Statistics before combining:
Total rows with missing impact values: 4050

Missing values per column:
research_award_1: 4050
research_award_2: 4050
research_award_3: 4050
research_award_mean: 4050

Total impact cells filled: 16200

Remaining Submission IDs with missing values after combining:
No empty impact fields found.

Saved combined data to /work/process/combined_research_award.csv


In [6]:
# Fill gaps in the tenure_eval results from the supplement file and save
# the combined CSV.
if __name__ == "__main__":
    base_file, supplement_file, output_file = (
        '/work/output/output_tenure_eval.csv',
        '/work/output/output_tenure_eval_resume1_final.csv',
        '/work/process/combined_tenure_eval.csv',
    )

    combined_df = process_citation_data(base_file, supplement_file, output_file)


Statistics before combining:
Total rows with missing impact values: 4397

Missing values per column:
tenure_eval_1: 4396
tenure_eval_2: 4397
tenure_eval_3: 4397
tenure_eval_mean: 4396

Total impact cells filled: 17586

Remaining Submission IDs with missing values after combining:
No empty impact fields found.

Saved combined data to /work/process/combined_tenure_eval.csv


In [7]:
# Fill gaps in the top_conference results from the supplement file and
# save the combined CSV.
if __name__ == "__main__":
    base_file, supplement_file, output_file = (
        '/work/output/output_top_conference.csv',
        '/work/output/output_top_conference_resume1_final.csv',
        '/work/process/combined_top_conference.csv',
    )

    combined_df = process_citation_data(base_file, supplement_file, output_file)


Statistics before combining:
Total rows with missing impact values: 3994

Missing values per column:
top_conference_1: 3993
top_conference_2: 3993
top_conference_3: 3994
top_conference_mean: 3993

Total impact cells filled: 15973

Remaining Submission IDs with missing values after combining:
No empty impact fields found.

Saved combined data to /work/process/combined_top_conference.csv


In [8]:
# Fill gaps in the top5_accept results from the supplement file and save
# the combined CSV.
if __name__ == "__main__":
    base_file, supplement_file, output_file = (
        '/work/output/output_top5_accept.csv',
        '/work/output/output_top5_accept_resume1_final.csv',
        '/work/process/combined_top5_accept.csv',
    )

    combined_df = process_citation_data(base_file, supplement_file, output_file)


Statistics before combining:
Total rows with missing impact values: 3527

Missing values per column:
top5_accept_1: 3526
top5_accept_2: 3527
top5_accept_3: 3527
top5_accept_mean: 3526

Total impact cells filled: 14105

Remaining Submission IDs with missing values after combining:
Total rows with missing impact values: 1

Missing values per column:
top5_accept_1: 0
top5_accept_2: 0
top5_accept_3: 1
top5_accept_mean: 0

Submission IDs with missing values:
['S_008050']

Saved combined data to /work/process/combined_top5_accept.csv


In [9]:
# Fill gaps in the top5_accept_rating results from the supplement file and
# save the combined CSV.
if __name__ == "__main__":
    base_file, supplement_file, output_file = (
        '/work/output/output_top5_accept_rating.csv',
        '/work/output/output_top5_accept_rating_resume1_final.csv',
        '/work/process/combined_top5_accept_rating.csv',
    )

    combined_df = process_citation_data(base_file, supplement_file, output_file)


Statistics before combining:
Total rows with missing impact values: 3310

Missing values per column:
top5_accept_rating_1: 3310
top5_accept_rating_2: 3310
top5_accept_rating_3: 3310
top5_accept_rating_mean: 3310

Total impact cells filled: 13240

Remaining Submission IDs with missing values after combining:
No empty impact fields found.

Saved combined data to /work/process/combined_top5_accept_rating.csv


In [10]:
# Fill gaps in the top5_accept_rating_criteria results from the supplement
# file and save the combined CSV.
if __name__ == "__main__":
    base_file, supplement_file, output_file = (
        '/work/output/output_top5_accept_rating_criteria.csv',
        '/work/output/output_top5_accept_rating_criteria_final.csv',
        '/work/process/combined_top5_accept_rating_criteria.csv',
    )

    combined_df = process_citation_data(base_file, supplement_file, output_file)


Statistics before combining:
Total rows with missing impact values: 4081

Missing values per column:
top5_accept_rating_criteria_1: 4080
top5_accept_rating_criteria_2: 4080
top5_accept_rating_criteria_3: 4081
top5_accept_rating_criteria_mean: 4080

Total impact cells filled: 16320

Remaining Submission IDs with missing values after combining:
Total rows with missing impact values: 1

Missing values per column:
top5_accept_rating_criteria_1: 0
top5_accept_rating_criteria_2: 0
top5_accept_rating_criteria_3: 1
top5_accept_rating_criteria_mean: 0

Submission IDs with missing values:
['S_002577']

Saved combined data to /work/process/combined_top5_accept_rating_criteria.csv


In [11]:
# Fill gaps in the top5_desk results from the supplement file and save the
# combined CSV.
if __name__ == "__main__":
    base_file, supplement_file, output_file = (
        '/work/output/output_top5_desk.csv',
        '/work/output/output_top5_desk_resume1_final.csv',
        '/work/process/combined_top5_desk.csv',
    )

    combined_df = process_citation_data(base_file, supplement_file, output_file)


Statistics before combining:
Total rows with missing impact values: 3971

Missing values per column:
top5_desk_1: 3970
top5_desk_2: 3970
top5_desk_3: 3971
top5_desk_mean: 3970

Total impact cells filled: 15881

Remaining Submission IDs with missing values after combining:
No empty impact fields found.

Saved combined data to /work/process/combined_top5_desk.csv


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=a25c250f-64bb-477e-a263-2c8cc56f7dca' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>