**This notebook cleans the formatting of strings and numbers, removes rows with missing values, and tokenizes and normalizes paper titles and abstracts.**

**Resources consulted online**:
1. https://stackoverflow.com/questions/40476680/how-to-use-tqdm-with-pandas-in-a-jupyter-notebook
2. https://github.com/UChicago-CCA-2021/lucem_illud

# Import libraries

In [1]:
import pandas as pd
import lucem_illud
from tqdm import tqdm
tqdm.pandas()

# Define helper functions for data cleaning and preprocessing of paper abstracts

In [2]:
# Defines a function that cleans the data into the format we want.
def clean_data(load_file_path='../database/content_analysis.csv'):
    '''
    Load the raw content-analysis CSV and return a cleaned DataFrame.

    Cleaning steps, in order:
      1. rename the raw columns to snake_case names and drop the unused ones;
      2. lowercase 'email' and 'institution';
      3. turn the comma-separated 'coauthors' string into a list of lowercase
         names (missing value -> empty list);
      4. fill missing 'journal' values with 'journal not found';
      5. drop rows whose abstract is missing, is the placeholder
         'abstract not found', or has fewer than 20 whitespace-separated words;
      6. drop rows whose 'award_year' or 'publication_year' cannot be parsed
         as a number (previously such rows made the int64 cast raise);
      7. fill missing citation counts with 0 and assign the final dtypes.

    Inputs:
        load_file_path: file path to load original CSV file (the first CSV
            column is used as the index)

    Returns: a cleaned DataFrame with a fresh 0..n-1 index
    '''

    citation_columns = [f'citation_{year}' for year in range(2001, 2025)]
    year_columns = ['award_year', 'publication_year']
    count_columns = ['paper_total_citations'] + citation_columns

    # load the CSV file to a DataFrame
    df = pd.read_csv(load_file_path, index_col=0)

    # rename columns
    df = df.rename(columns={'abstract': 'award_abstract', 'url': 'author_url', 'total_citations': 'author_total_citations',
                            'Title': 'paper_title', 'Year': 'publication_year', 'Cited by': 'paper_total_citations',
                            'Paper URL': 'paper_url', 'Authors': 'coauthors', 'Publication Date': 'publication_date',
                            'Journal': 'journal', 'Abstract': 'paper_abstract', 'Citations': 'paper_yearly_citations'})

    # drop columns (raises KeyError if an expected raw column is absent)
    df = df.drop(columns=['paper_url', 'publication_date', 'paper_yearly_citations', 'author_url', 'author_total_citations',
                          'h_index', 'interests', 'directorate', 'division', 'effective_date', 'expiration_date',
                          'award_title', 'award_abstract'])

    # reorder columns; .copy() gives an independent frame so the column
    # assignments below do not trigger SettingWithCopyWarning
    df = df[['first_name', 'middle_name', 'last_name', 'email', 'institution', 'award_year', 'award_amount', 'paper_title',
             'journal', 'publication_year', 'coauthors', 'paper_abstract', 'paper_total_citations'] + citation_columns].copy()

    # convert email and institution names to lowercase
    df['email'] = df['email'].str.lower()
    df['institution'] = df['institution'].str.lower()

    # convert each row in 'coauthors' to a list of lowercase author names
    # (missing value -> empty list)
    df['coauthors'] = df['coauthors'].apply(
        lambda authors: [] if pd.isna(authors) else [name.lower() for name in authors.split(', ')])

    # replace NaN values with 'journal not found' in 'journal'
    df['journal'] = df['journal'].fillna('journal not found')

    # parse the year columns; anything unparseable becomes NaN and is
    # filtered out below instead of crashing the int64 cast
    for column in year_columns:
        df[column] = pd.to_numeric(df[column], errors='coerce')

    # keep only rows with a real abstract of at least 20 words and valid years
    abstracts = df['paper_abstract'].astype(str)
    has_abstract = abstracts != 'abstract not found'
    long_enough = abstracts.str.split().str.len() >= 20
    has_years = df[year_columns].notna().all(axis=1)
    df = df[has_abstract & long_enough & has_years].copy()

    # replace missing values in total and yearly citation columns with 0
    df[count_columns] = df[count_columns].fillna(0)

    # assign data types: years become integer-valued categories, citation
    # counts and award amount become int64
    dtypes = {'first_name': 'object', 'middle_name': 'object', 'last_name': 'object', 'email': 'object',
              'institution': 'category', 'award_year': 'int64', 'award_amount': 'int64', 'paper_title': 'object',
              'journal': 'category', 'publication_year': 'int64', 'coauthors': 'object', 'paper_abstract': 'object'}
    df = df.astype(dtypes | {column: 'int64' for column in count_columns})
    df = df.astype({column: 'category' for column in year_columns})

    # reset index and return the final cleaned DataFrame
    return df.reset_index(drop=True)

In [3]:
def preprocess_data(save_file_path='../database/preprocessed_content_analysis.csv',
                    load_file_path='../database/content_analysis.csv'):
    '''
    Clean the raw CSV, tokenize and normalize titles and abstracts, and save
    the result as a CSV file.

    Adds four columns to the cleaned DataFrame: 'tokenized_title',
    'normalized_title', 'tokenized_abstract' and 'normalized_abstract'. Each
    holds one inner list per sentence returned by lucem_illud.sent_tokenize.
    Because the output format is CSV, these nested lists are written as text.

    Inputs:
        save_file_path: file path to save the preprocessed CSV file
        load_file_path: file path to load original CSV file (passed on to
            clean_data)

    Returns: None; the result is only written to save_file_path
    '''

    # Get the cleaned DataFrame
    df = clean_data(load_file_path=load_file_path)

    def tokenize(text):
        # split the text into sentences, then each sentence into word tokens
        return [lucem_illud.word_tokenize(sentence) for sentence in lucem_illud.sent_tokenize(text)]

    def normalize(sentences):
        # normalize the tokens of each sentence separately
        return [lucem_illud.normalizeTokens(sentence) for sentence in sentences]

    # same steps for the title and then the abstract; only the tokenization
    # pass shows a tqdm progress bar
    for source_column, suffix in (('paper_title', 'title'), ('paper_abstract', 'abstract')):
        df[f'tokenized_{suffix}'] = df[source_column].progress_apply(tokenize)
        df[f'normalized_{suffix}'] = df[f'tokenized_{suffix}'].apply(normalize)

    # save the dataframe to a csv file
    df.to_csv(save_file_path)

# Run the functions to clean and preprocess the content_analysis DataFrame (to get normalized tokens of titles and abstracts)

In [4]:
# Run the whole pipeline with the default input and output paths. In the
# recorded run below, the two tokenization passes over 160,258 rows took about
# 15 minutes and 1 hour 46 minutes respectively.
preprocess_data()

100%|██████████| 160258/160258 [15:38<00:00, 170.81it/s]
100%|██████████| 160258/160258 [1:45:45<00:00, 25.25it/s]  
