In [11]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')


In [12]:
# Define paths and file names 
target_path = 'Targets/daily_crsp.csv'

firm_charac_path = 'Predictors/CompFirmCharac.csv'
jkp_path = 'Predictors/jkp.csv'
#earnings_path = 'Predictors/earnings_calls.parquet'
#mda_path = 'Predictors/mda_text.parquet'

link_table_path = 'linking_table.csv'

# First date for which we have daily returns
CUTOFF_DATE = '2000-01-03'


In [None]:
def import_sanitize_daily_returns(target_path, nrows=None):
    """
    Import and sanitize the data frame of daily returns.

    Parameters:
     - target_path: the path to the CSV file containing the daily returns 
     - nrows: number of rows to load from the file. None if the whole dataset is to be loaded.
    """
    # Define only the necessary columns to import
    required_columns = ['PERMNO', 'CUSIP', 'PERMCO', 'DlyCalDt', 'SICCD', 'NAICS', 'DlyRet', 'sprtrn']

    # Load only the required columns
    daily_data = pd.read_csv(target_path, usecols=required_columns, nrows=nrows)

    # Check NaN counts per column
    print("NaN counts per column:")
    nan_counts = daily_data[['PERMNO', 'DlyCalDt', 'DlyRet']].isna().sum()
    for col, count in nan_counts.items():
        print(f"{col}: {count:,}")

    print("\n" + "="*50)

    # Analyze NaN patterns across the three columns
    subset_cols = ['PERMNO', 'DlyCalDt', 'DlyRet']

    # Create boolean mask for NaNs
    nan_mask = daily_data[subset_cols].isna()

    # Count different NaN patterns
    print("NaN pattern analysis:")
    print(f"Rows with NaN in PERMNO only: {(nan_mask['PERMNO'] & ~nan_mask['DlyCalDt'] & ~nan_mask['DlyRet']).sum():,}")
    print(f"Rows with NaN in DlyCalDt only: {(~nan_mask['PERMNO'] & nan_mask['DlyCalDt'] & ~nan_mask['DlyRet']).sum():,}")
    print(f"Rows with NaN in DlyRet only: {(~nan_mask['PERMNO'] & ~nan_mask['DlyCalDt'] & nan_mask['DlyRet']).sum():,}")

    nan_by_permno = daily_data[daily_data['DlyRet'].isna()].groupby('PERMNO').size()



    nan_by_date = daily_data[daily_data['DlyRet'].isna()].groupby('DlyCalDt').size()



    # Fast vectorized approach using groupby
    permno_stats = daily_data.groupby('PERMNO').agg({
        'DlyRet': ['count', 'size', lambda x: x.isna().sum()]
    }).round(2)

    # Flatten column names
    permno_stats.columns = ['Valid_Count', 'Total_Observations', 'NaN_Count']

    # Calculate NaN percentage
    permno_stats['NaN_Percentage'] = (permno_stats['NaN_Count'] / permno_stats['Total_Observations']) * 100

    # Filter only PERMNOs that have NaNs
    permnos_with_nans_stats = permno_stats[permno_stats['NaN_Count'] > 0]




    # Overall statistics
    total_observations = len(daily_data)
    total_nans = daily_data['DlyRet'].isna().sum()
    overall_nan_percentage = (total_nans / total_observations) * 100

    print(f"Total observations: {total_observations:,}")
    print(f"Total NaN values: {total_nans:,}")
    print(f"Overall NaN percentage: {overall_nan_percentage:.2f}%")
    
    # Calculate NaN percentage for each PERMNO
    permno_stats = daily_data.groupby('PERMNO').agg({'DlyRet': ['size', lambda x: x.isna().sum()]})
    
    permno_stats.columns = ['Total_Observations', 'NaN_Count']
    permno_stats['NaN_Percentage'] = (permno_stats['NaN_Count'] / permno_stats['Total_Observations']) * 100

    # Find PERMNOs with more than 30% missing data
    permnos_to_remove = permno_stats[permno_stats['NaN_Percentage'] > 30].index

    print(f"PERMNOs with >30% missing data: {len(permnos_to_remove):,}")
    print(f"Total PERMNOs before filtering: {daily_data['PERMNO'].nunique():,}")

    # Remove entire rows for PERMNOs with >30% missing data
    daily_data = daily_data[~daily_data['PERMNO'].isin(permnos_to_remove)] 


    # Replace remaining NaNs with mean of respective PERMNO
    daily_data['DlyRet'] = daily_data.groupby('PERMNO')['DlyRet'].transform(
        lambda x: x.fillna(x.mean()))
    

    # Compute quantile thresholds for winsorization
    lower_quantile = daily_data['DlyRet'].quantile(0.00005)
    upper_quantile = daily_data['DlyRet'].quantile(0.99995)


    # Identify outliers for reporting
    outliers = (daily_data['DlyRet'] < lower_quantile) | (daily_data['DlyRet'] > upper_quantile)

    print(f"Number of daily return outliers: {outliers.sum():,}")

    # Winsorize: cap values at the quantile thresholds
    daily_data['DlyRet'] = daily_data['DlyRet'].clip(lower=lower_quantile, upper=upper_quantile)



    # Handle CUSIP NaN values
    nan_counts_CUSIP = daily_data[['CUSIP']].isna().sum()
    print(f"Number of NaN CUSIPs before fixing: {nan_counts_CUSIP['CUSIP']:,}")
    
    if nan_counts_CUSIP['CUSIP'] > 0:
        cusip_before_fill = daily_data['CUSIP'].isna().sum()
        
        # Fill NaN CUSIPs with the first non-null CUSIP for each PERMNO using forward fill
        daily_data['CUSIP'] = daily_data.groupby('PERMNO')['CUSIP'].transform('first')
        
        cusip_after_fill = daily_data['CUSIP'].isna().sum()
        
        print(f"CUSIP NaNs filled: {cusip_before_fill - cusip_after_fill:,}")
        print(f"Remaining CUSIP NaNs: {cusip_after_fill:,}")
        
        # Remove rows with remaining CUSIP NaNs
        if cusip_after_fill > 0:
            rows_before_removal = len(daily_data)
            daily_data = daily_data.dropna(subset=['CUSIP'])
            rows_after_removal = len(daily_data)
            removed_rows = rows_before_removal - rows_after_removal
            print(f"Rows removed due to missing CUSIP: {removed_rows:,}")

    # Convert dates
    daily_data['DlyCalDt'] = pd.to_datetime(daily_data['DlyCalDt'], errors='coerce')

    # Rename date column
    daily_data = daily_data.rename(columns={'DlyCalDt': 'date'})

    # Sort and deduplicate
    daily_data = daily_data.sort_values(by=['date', 'PERMCO', 'PERMNO', 'CUSIP'])
    daily_data = daily_data.drop_duplicates(subset=['PERMNO', 'date'])

    # Set multi-index
    daily_data = daily_data.set_index(['date', 'PERMCO', 'PERMNO', 'CUSIP'])

    

    return daily_data

In [None]:
daily_data = import_sanitize_daily_returns(target_path, nrows=None)

NaN counts per column:
PERMNO: 0
DlyCalDt: 0
DlyRet: 633,536

NaN pattern analysis:
Rows with NaN in PERMNO only: 0
Rows with NaN in DlyCalDt only: 0
Rows with NaN in DlyRet only: 633,536
Total observations: 47,333,141
Total NaN values: 633,536
Overall NaN percentage: 1.34%
PERMNOs with >30% missing data: 225
Total PERMNOs before filtering: 23,660
Number of daily return outliers: 4,648
Number of NaN CUSIPs before fixing: 14,884
CUSIP NaNs filled: 14,881
Remaining CUSIP NaNs: 3
Rows removed due to missing CUSIP: 3


In [None]:
display(daily_data)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,SICCD,NAICS,DlyRet,sprtrn
date,PERMCO,PERMNO,CUSIP,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-03,5,15580,04820910,6320,0,0.000000,-0.009549
2000-01-03,7,14593,03783310,3573,0,0.088754,-0.009549
2000-01-03,25,62770,03216510,6711,0,-0.061489,-0.009549
2000-01-03,29,59184,03522910,2082,0,-0.012346,-0.009549
2000-01-03,33,59248,21701610,2082,0,-0.034524,-0.009549
...,...,...,...,...,...,...,...
2024-12-31,60118,26023,G8688010,9999,0,-0.000393,-0.004285
2024-12-31,60120,26027,G8924F10,9999,523910,-0.194924,-0.004285
2024-12-31,60121,26034,G9876W10,9999,0,0.108590,-0.004285
2024-12-31,60122,26035,G9877T10,9999,551112,-0.034979,-0.004285


In [None]:
daily_data.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 46468770 entries, (Timestamp('2000-01-03 00:00:00'), np.int64(5), np.int64(15580), '04820910') to (Timestamp('2024-12-31 00:00:00'), np.int64(60123), np.int64(26037), '98980W10')
Data columns (total 4 columns):
 #   Column  Dtype  
---  ------  -----  
 0   SICCD   int64  
 1   NAICS   int64  
 2   DlyRet  float64
 3   sprtrn  float64
dtypes: float64(2), int64(2)
memory usage: 1.7+ GB


In [None]:
daily_data.describe()

Unnamed: 0,SICCD,NAICS,DlyRet,sprtrn
count,46468770.0,46468770.0,46468770.0,46468770.0
mean,5784.461,372212.1,0.0004534359,0.0002927108
std,2215.948,207038.8,0.03895726,0.01218518
min,0.0,0.0,-0.581294,-0.119841
25%,3820.0,311812.0,-0.010796,-0.004849
50%,6410.0,511210.0,0.0,0.000589
75%,6726.0,525990.0,0.010432,0.006002
max,9999.0,999990.0,1.167635,0.1158


In [None]:
import os

def save_sanitized_csv(df, original_path, date_format='%Y-%m-%d'):
    """
    Save the sanitized DataFrame to a new CSV with '_sanitized' appended to the original filename.

    Parameters:
     - df: The sanitized DataFrame (with a datetime index).
     - original_path: The original CSV file path.
     - date_format: Format for datetime values in the output file.
    """
    # Get directory, filename, and extension
    base, ext = os.path.splitext(original_path)
    new_path = f"{base}_sanitized{ext}"

    # Reset index to save multi-index as columns
    df_to_save = df.reset_index()

    # Write to CSV with datetime formatting
    df_to_save.to_csv(new_path, index=False, date_format=date_format)

    print(f"Sanitized data saved to: {new_path}")


In [None]:
save_sanitized_csv(daily_data, target_path)

Sanitized data saved to: Targets/daily_crsp_sanitized.csv


In [14]:
def import_sanitize_firm_charac(
    firm_charac_path, 
    nrows=None, 
    missing_threshold=0.3, 
    required_unique=500, 
    cutoff_date='2000-01-03'
):
    """
    Imports and sanitizes the dataset containing firm characteristics.

    Parameters:
     - firm_charac_path: path to CSV file containing the data
     - nrows: number of rows to load in memory (None if the whole dataset is to be loaded)
     - missing_threshold: if a column contains more than missing_threshold*100% missing values, it is dropped
     - required_unique: if a column has fewer than required_unique unique values, it is dropped
     - cutoff_date: data starts from this date (inclusive)
    """
    # Load data
    comp = pd.read_csv(firm_charac_path, nrows=nrows, low_memory=False)
    
    # Convert datadate to datetime
    comp['datadate'] = pd.to_datetime(comp['datadate'], errors='coerce')

    # Filter by date
    cutoff = pd.to_datetime(cutoff_date)
    comp = comp[comp['datadate'] >= cutoff]

    # Drop rows where gvkey or datadate is missing
    comp = comp.dropna(subset=['gvkey', 'datadate'])

    # Drop duplicate (gvkey, datadate) pairs
    comp = comp.drop_duplicates(subset=['gvkey', 'datadate'])

    # Rename datadate to date for consistency
    comp = comp.rename(columns={'datadate': 'date'})

    # Handle CUSIP NaN values
    if 'cusip' in comp.columns:
        nan_counts_CUSIP = comp[['cusip']].isna().sum()
        print(f"Number of NaN CUSIPs before fixing: {nan_counts_CUSIP['cusip']:,}")
        
        if nan_counts_CUSIP['cusip'] > 0:
            cusip_before_fill = comp['cusip'].isna().sum()
            
            # Fill NaN CUSIPs with the first non-null CUSIP for each gvkey
            comp['cusip'] = comp.groupby('gvkey')['cusip'].transform('first')
            
            cusip_after_fill = comp['cusip'].isna().sum()
            
            print(f"CUSIP NaNs filled: {cusip_before_fill - cusip_after_fill:,}")
            print(f"Remaining CUSIP NaNs: {cusip_after_fill:,}")
            
            # Remove rows with remaining CUSIP NaNs
            if cusip_after_fill > 0:
                rows_before_removal = len(comp)
                comp = comp.dropna(subset=['cusip'])
                rows_after_removal = len(comp)
                removed_rows = rows_before_removal - rows_after_removal
                print(f"Rows removed due to missing CUSIP: {removed_rows:,}")

    # Sort data frame and set multi-index
    comp = comp.sort_values(by=['date', 'gvkey', 'cusip'])
    comp = comp.set_index(['date', 'gvkey', 'cusip'])

    # Drop identifier columns
    identifiers = ['tic', 'conm', 'exchg', 'cik', 'costat', 'fic']
    comp = comp.drop(columns=[col for col in identifiers if col in comp.columns])


    # Drop columns with too many missing values

    valid_cols = comp.columns[comp.isna().mean() < missing_threshold]
    comp = comp[valid_cols]


    # Drop columns with too few unique values
    comp = comp.loc[:, comp.nunique(dropna=True) >= required_unique]

    # Calculate missing percentage per firm (gvkey)
    firm_missing_pct = comp.groupby(level='gvkey').apply(
        lambda firm: firm.isna().sum().sum() / (len(firm) * len(firm.columns))
    )


    # Identify firms to keep (those with <= 30% missing data)
    firms_to_keep = firm_missing_pct[firm_missing_pct <= missing_threshold].index
    print(f"Firms with ≤{missing_threshold*100}% missing data: {len(firms_to_keep)}")

    comp_filtered = comp[comp.index.get_level_values('gvkey').isin(firms_to_keep)]






    # Store original missing pattern for diagnostics
    missing_before = comp_filtered.isna().sum()
        
    # Group by firm (gvkey) and forward fill, then backward fill
    comp_filtered = comp_filtered.groupby(level='gvkey').fillna(method='ffill')
    comp_filtered = comp_filtered.groupby(level='gvkey').fillna(method='bfill')

    # For each date, fill NaNs with median of that date
    comp_filtered = comp_filtered.groupby(level='date').apply(lambda group: group.fillna(group.median())).droplevel(0)

   
    # =============================================
    # IMPUTATION DIAGNOSTICS
    # =============================================
    missing_after = comp_filtered.isna().sum()
    reduction = missing_before - missing_after
    reduction_pct = ((missing_before - missing_after) / missing_before * 100).fillna(0)

    print(f"Remaining missing values: {comp_filtered.isna().sum().sum()}")
    print(f"Overall reduction in missing values: {reduction.sum()} ({reduction_pct.mean():.1f}% average)")


    summary_df = pd.DataFrame({
        'Before': missing_before,
        'After': missing_after,
        'Reduction': reduction,
        'Reduction %': reduction_pct.round(1)
    })

    # Sort by reduction percentage and display
    summary_df = summary_df.sort_values('Reduction %', ascending=False)
    print(summary_df)

    # Print diagnostics (your existing code)
    print(f"\nAll remaining column names ({len(comp_filtered.columns)}):")
    print(comp_filtered.columns.tolist())



    

    return comp_filtered

In [None]:
firms = import_sanitize_firm_charac(firm_charac_path, cutoff_date=CUTOFF_DATE)


Number of NaN CUSIPs before fixing: 612
CUSIP NaNs filled: 0
Remaining CUSIP NaNs: 612
Rows removed due to missing CUSIP: 612
Firms with ≤30.0% missing data: 21418
Remaining missing values: 0
Overall reduction in missing values: 1828890 (100.0% average)
        Before  After  Reduction  Reduction %
acchgy   78229      0      78229        100.0
capxy    81475      0      81475        100.0
chechy   75796      0      75796        100.0
cogsy    32051      0      32051        100.0
cshfdy   38019      0      38019        100.0
cshpry   32617      0      32617        100.0
cstkey   20907      0      20907        100.0
dilady   47468      0      47468        100.0
dilavy   47639      0      47639        100.0
doy      27414      0      27414        100.0
dvpy     21125      0      21125        100.0
dvy      84453      0      84453        100.0
epsfiy   47609      0      47609        100.0
epsfxy   47715      0      47715        100.0
epspiy   47462      0      47462        100.0
epspxy   4

In [None]:
display(firms)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,acchgy,capxy,chechy,cogsy,cshfdy,cshpry,cstkey,dilady,dilavy,doy,...,oiadpy,opepsy,piy,revty,saley,spiy,txty,xidoy,xiy,xopry
date,gvkey,cusip,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2000-01-31,1013,000886309,0.0,46.785,21.138,291.300,182.525,174.925,0.0,0.0,53.100,0.000,...,67.400,0.31,68.400,593.900,593.900,0.000,15.300,0.000,0.0,496.900
2000-01-31,1082,81765M106,0.0,9.597,-2.956,117.073,2.936,2.936,0.0,0.0,0.913,-2.563,...,5.287,0.31,1.503,131.177,131.177,0.000,0.590,-2.563,0.0,123.541
2000-01-31,1173,008015307,0.0,0.558,-0.754,14.390,3.937,3.937,0.0,0.0,0.260,0.000,...,0.894,0.07,0.482,23.271,23.271,0.000,0.222,0.000,0.0,21.847
2000-01-31,1183,45169P106,0.0,1.091,22.224,9.732,28.169,28.169,0.0,0.0,-14.056,0.741,...,-3.652,-0.13,-17.342,7.476,7.476,-15.974,-3.286,0.741,0.0,9.732
2000-01-31,1189,008489502,0.0,17.181,-12.630,724.161,31.445,30.435,0.0,0.0,17.295,-0.007,...,39.178,0.57,31.030,779.820,779.820,0.000,12.136,-0.007,0.0,724.161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-31,34132,06652N107,0.0,4.197,53.154,9.600,9.552,9.422,0.0,0.0,10.336,0.000,...,16.473,1.10,13.713,9658.000,32.195,0.000,3.377,0.000,0.0,15.722
2025-03-31,62243,913290102,0.0,0.693,-14.338,14.867,10.247,10.054,0.0,0.0,11.598,0.000,...,18.507,1.15,15.424,9658.000,42.902,0.000,3.826,0.000,0.0,24.395
2025-03-31,124434,09290D101,0.0,255.000,4026.000,3443.000,156.600,155.000,0.0,0.0,1510.000,0.000,...,1833.000,10.39,1763.000,5276.000,5276.000,-135.000,248.000,0.000,0.0,3443.000
2025-03-31,185243,12618F105,0.0,2.395,-46.068,23.666,2.077,2.077,0.0,0.0,2.700,0.000,...,24.537,1.30,14.407,9658.000,75.407,0.000,2.823,0.000,0.0,50.870


In [None]:
firms.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 873369 entries, (Timestamp('2000-01-31 00:00:00'), np.int64(1013), '000886309') to (Timestamp('2025-04-30 00:00:00'), np.int64(35223), '75644T100')
Data columns (total 39 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   acchgy  873369 non-null  float64
 1   capxy   873369 non-null  float64
 2   chechy  873369 non-null  float64
 3   cogsy   873369 non-null  float64
 4   cshfdy  873369 non-null  float64
 5   cshpry  873369 non-null  float64
 6   cstkey  873369 non-null  float64
 7   dilady  873369 non-null  float64
 8   dilavy  873369 non-null  float64
 9   doy     873369 non-null  float64
 10  dvpy    873369 non-null  float64
 11  dvy     873369 non-null  float64
 12  epsfiy  873369 non-null  float64
 13  epsfxy  873369 non-null  float64
 14  epspiy  873369 non-null  float64
 15  epspxy  873369 non-null  float64
 16  exrey   873369 non-null  float64
 17  fiaoy   873369 non-null  float64
 18  finc

In [None]:
save_sanitized_csv(firms, firm_charac_path)

Sanitized data saved to: Predictors/CompFirmCharac_sanitized.csv


In [9]:
def import_sanitize_jkp(jkp_path, nrows=None, cutoff_date='1925-05-30'):
    """
    Import and sanitize the factors. This function returns a data frame indexed by date
    containing only the returns of the factors. Columns location, frequency, weighting, 
    direction, n_stocks and n_stocks_min are dropped in the process because they are 
    uninformative. 
    Parameters: 
     - jkp_path: path to csv file containing the jkp factors. 
     - nrows: number of rows to load in memory. All of the data is loaded when nrows=None.
     - cutoff_date: the data starts from this date
    """
    jkp = pd.read_csv(jkp_path, nrows = 100000)
    
    jkp['date'] = pd.to_datetime(jkp['date'])    # Parse date column
    cutoff = pd.to_datetime(cutoff_date)
    jkp = jkp[jkp['date'] >= cutoff]        # Remove observations before 2000 as this is irrelevant for us

    # Pivot to wide format: one column per factor
    jkp_wide = jkp.pivot(index='date', columns='name', values='ret')

    return jkp_wide


In [5]:
jkp = import_sanitize_jkp(jkp_path, cutoff_date = CUTOFF_DATE)

In [6]:
jkp

name,age,aliq_at,aliq_mat,ami_126d,at_be,at_gr1,at_me,at_turnover,be_gr1a,be_me,...,opex_at,pi_nix,ppeinv_gr1a,prc,prc_highprc_252d,qmj,qmj_growth,qmj_prof,qmj_safety,rd5_at
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2000-01-31,0.030254,-0.015875,0.018536,0.058705,0.048212,0.008562,-0.046928,-0.040382,0.011158,-0.037480,...,-0.025649,-0.037852,-0.008723,0.029981,0.004210,-0.040257,-0.035010,-0.057493,-0.000004,0.139700
2000-02-29,0.259166,-0.222644,0.138305,0.126145,0.302350,-0.107640,-0.347456,-0.003528,-0.120481,-0.292134,...,0.038776,-0.058175,-0.108115,-0.078144,0.142095,-0.009524,-0.036718,-0.052376,0.089605,0.399325
2000-03-31,-0.211559,0.113484,-0.059384,-0.097385,-0.161972,0.060418,0.241058,0.068863,0.021303,0.204914,...,-0.006270,0.034848,0.044282,0.088231,-0.158170,0.031144,0.013444,0.126156,-0.039629,-0.217580
2000-04-30,-0.225892,0.130952,-0.035645,-0.057125,-0.124413,0.092468,0.199574,0.070121,0.057028,0.170982,...,-0.001731,0.007230,0.073980,0.034441,0.053232,0.013586,0.014560,0.121457,-0.041360,-0.073432
2000-05-31,-0.136842,0.138348,-0.028966,-0.031415,-0.114599,0.114336,0.151526,-0.020165,0.102710,0.124502,...,-0.029297,0.014042,0.080366,0.017054,0.069763,-0.001198,0.005664,0.060433,-0.032958,-0.093934
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-08-31,-0.009159,0.010798,0.018197,-0.035742,-0.027581,-0.009432,-0.012035,-0.008847,0.004118,-0.020158,...,-0.009840,-0.005098,0.009275,-0.027720,0.035217,0.010192,-0.005674,0.014741,0.012970,0.009868
2024-09-30,0.015388,0.005373,0.007651,-0.009591,-0.007449,0.003045,-0.009871,0.003660,0.008696,-0.019773,...,0.005259,0.006355,0.008296,-0.006779,0.003206,-0.006422,-0.002243,0.002634,-0.018560,-0.017818
2024-10-31,0.030176,-0.017033,-0.009476,0.002699,-0.020468,-0.015150,0.006831,-0.030677,0.002992,-0.004121,...,-0.029799,0.003082,0.002214,0.012406,0.021771,-0.006040,0.002241,-0.015539,0.002140,0.024123
2024-11-30,0.055421,-0.058225,-0.021419,0.017654,-0.017568,-0.029310,0.003207,-0.008216,-0.014642,-0.015056,...,0.004748,0.001879,0.006882,0.007938,0.016876,-0.005569,0.011715,-0.017477,-0.023964,0.019916


In [7]:
save_sanitized_csv(jkp, jkp_path)

Sanitized data saved to: Predictors/jkp_sanitized.csv


In [21]:
def import_sanitize_linking_table(link_table_path):
    ccm = pd.read_csv(link_table_path)

    # Convert start date
    ccm['LINKDT'] = pd.to_datetime(ccm['LINKDT'])

    # Replace 'E' with a placeholder date, then convert and fill missing values
    ccm['LINKENDDT'] = ccm['LINKENDDT'].replace('E', '2099-12-31')
    ccm['LINKENDDT'] = pd.to_datetime(ccm['LINKENDDT'], errors='coerce')
    ccm['LINKENDDT'] = ccm['LINKENDDT'].fillna(pd.to_datetime('2099-12-31'))

    # Rename columns for merge compatibility
    ccm.rename(columns={'GVKEY': 'gvkey', 'LPERMNO': 'PERMNO', 'LPERMCO': 'PERMCO'}, inplace=True)

    ccm = ccm.set_index('gvkey')

    return ccm


In [22]:
links = import_sanitize_linking_table(link_table_path)

In [10]:
links

Unnamed: 0_level_0,LINKTYPE,PERMNO,PERMCO,LINKDT,LINKENDDT
gvkey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1000,LU,25881,23369,1970-11-13,1978-06-30
1001,LU,10015,6398,1983-09-20,1986-07-31
1002,LC,10023,22159,1972-12-14,1973-06-05
1003,LU,10031,6672,1983-12-07,1989-08-16
1004,LU,54594,20000,1972-04-24,2099-12-31
...,...,...,...,...,...
352262,LC,23773,59507,2023-03-17,2099-12-31
353444,LC,23209,59330,2022-07-22,2099-12-31
355398,LC,25134,59886,2024-05-17,2099-12-31
356128,LC,24704,59765,2024-01-19,2099-12-31


In [23]:
save_sanitized_csv(links, link_table_path)

Sanitized data saved to: linking_table_sanitized.csv


In [15]:
def clean_by_common_cusips(daily_data, firms):
    # Clean and extract CUSIPs from indexes
    daily_cusips = daily_data.index.get_level_values('CUSIP').str.strip().str.upper().unique()
    firms_cusips_full = firms.index.get_level_values('cusip').str.strip().str.upper().unique()

    # Truncate firms cusips to 8 chars
    firms_cusips = firms_cusips_full.str[:8]

    # Find intersection
    common_cusips = daily_cusips.intersection(firms_cusips)

    # Rows before filtering
    daily_rows_before = len(daily_data)
    firms_rows_before = len(firms)

    # Filter datasets
    daily_data_filtered = daily_data[daily_data.index.get_level_values('CUSIP').str.strip().str.upper().isin(common_cusips)]
    mask_firms = firms.index.get_level_values('cusip').str.strip().str.upper().str[:8].isin(common_cusips)
    firms_filtered = firms[mask_firms].copy()

    # Update firms_filtered index to keep truncated cusip (first 8 chars)
    # Assuming 'cusip' is a level in a MultiIndex, rebuild the index with truncated cusip
    idx_names = firms_filtered.index.names
    # Get the levels as a list of arrays
    idx_levels = list(firms_filtered.index.levels)
    idx_codes = list(firms_filtered.index.codes)

    # Find the position of 'cusip' level
    cusip_level_pos = idx_names.index('cusip')

    # Replace the 'cusip' level with truncated values for filtered data
    # First get current cusip values (full 9 char)
    current_cusips = firms_filtered.index.get_level_values('cusip').str[:8]
    
    # Create new index tuples replacing cusip with truncated values
    # Build a new MultiIndex with the truncated cusip level
    new_tuples = []
    for tup in firms_filtered.index:
        tup_as_list = list(tup)
        tup_as_list[cusip_level_pos] = tup_as_list[cusip_level_pos][:8]
        new_tuples.append(tuple(tup_as_list))

    firms_filtered.index = pd.MultiIndex.from_tuples(new_tuples, names=idx_names)

    # Rows after filtering
    daily_rows_after = len(daily_data_filtered)
    firms_rows_after = len(firms_filtered)

    print(f"Rows lost in daily_data: {daily_rows_before - daily_rows_after}")
    print(f"Rows lost in firms: {firms_rows_before - firms_rows_after}")

    return daily_data_filtered, firms_filtered


In [16]:
daily_data_final, firms_final = clean_by_common_cusips(daily_data, firms)

Rows lost in daily_data: 26045036
Rows lost in firms: 485824


In [17]:
display(daily_data_final)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,SICCD,NAICS,DlyRet,sprtrn
date,PERMCO,PERMNO,CUSIP,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2000-01-03,5,15580,04820910,6320,0,0.000000,-0.009549
2000-01-03,7,14593,03783310,3573,0,0.088754,-0.009549
2000-01-03,25,62770,03216510,6711,0,-0.061489,-0.009549
2000-01-03,29,59184,03522910,2082,0,-0.012346,-0.009549
2000-01-03,36,12627,03037B10,8740,0,0.068182,-0.009549
...,...,...,...,...,...,...,...
2024-12-31,60051,25759,87165D20,9999,325411,0.055156,-0.004285
2024-12-31,60058,25773,74162310,2086,312111,0.006542,-0.004285
2024-12-31,60069,25805,09076J20,9999,424590,0.045685,-0.004285
2024-12-31,60090,25886,81764X10,9999,541511,-0.010104,-0.004285


In [18]:
display(firms_final)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,acchgy,capxy,chechy,cogsy,cshfdy,cshpry,cstkey,dilady,dilavy,doy,...,oiadpy,opepsy,piy,revty,saley,spiy,txty,xidoy,xiy,xopry
date,gvkey,cusip,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2000-01-31,1173,00801530,0.0,0.558,-0.754,14.390,3.937,3.937,0.0,0.0,0.260,0.00,...,0.894,0.07,0.482,23.271,23.271,0.000,0.222,0.000,0.000,21.847
2000-01-31,1240,01310410,0.0,1837.000,114.776,26252.071,423.245,422.320,0.0,0.0,427.389,0.00,...,1673.320,1.68,899.810,37478.079,37478.079,-432.881,472.421,-23.272,-23.272,34892.973
2000-01-31,1410,00095710,0.0,3.546,0.059,370.160,23.209,22.261,0.0,0.0,7.399,0.00,...,13.398,0.33,12.757,428.581,428.581,0.000,5.230,0.000,0.000,409.645
2000-01-31,1444,03037B10,0.0,0.057,-5.026,0.618,4.887,4.887,0.0,0.0,-1.028,-0.59,...,-1.762,-0.21,-1.496,0.000,0.000,0.000,-0.468,-0.590,0.000,1.747
2000-01-31,1562,02968310,0.0,1.462,-4.370,39.833,22.764,21.559,0.0,0.0,1.489,0.00,...,0.234,0.07,1.980,82.220,82.220,0.000,0.150,0.000,0.000,76.112
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-03-31,14225,31190010,0.0,55.700,-24.000,1030.800,574.800,573.500,0.0,0.0,298.700,0.00,...,393.900,0.52,393.100,1959.400,1959.400,0.000,94.400,0.000,0.000,1520.800
2025-03-31,29466,31868710,0.0,0.642,-40.575,2.488,2.426,2.426,0.0,0.0,1.692,0.00,...,9.089,0.71,2.253,9658.000,25.595,0.000,0.561,0.000,0.000,11.256
2025-03-31,34132,06652N10,0.0,4.197,53.154,9.600,9.552,9.422,0.0,0.0,10.336,0.00,...,16.473,1.10,13.713,9658.000,32.195,0.000,3.377,0.000,0.000,15.722
2025-03-31,62243,91329010,0.0,0.693,-14.338,14.867,10.247,10.054,0.0,0.0,11.598,0.00,...,18.507,1.15,15.424,9658.000,42.902,0.000,3.826,0.000,0.000,24.395


In [20]:
save_sanitized_csv(daily_data_final, 'Targets/daily_crsp_cusip.csv')

Sanitized data saved to: Targets/daily_crsp_cusip_sanitized.csv


In [21]:
save_sanitized_csv(firms_final, 'Predictors/CompFirmCharac_cusip.csv')

Sanitized data saved to: Predictors/CompFirmCharac_cusip_sanitized.csv
