In [None]:
import pandas as pd
import os
import glob
from pathlib import Path

In [None]:
# Directory containing the PDB report CSV exports.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# configurable so the notebook runs on other machines.
pdb_reports_dir = '/home/markus/MPI_local/data/PDB_reports/2'

# The save step at the end of this notebook writes its combined CSV into this
# same directory. Exclude that file from the inputs; otherwise a second
# "Restart & Run All" would ingest the previous output as if it were a raw
# report and duplicate every row.
processed_output_name = 'combined_pdb_reports_processed.csv'

# Collect the input CSV files, sorted so the concatenation order is stable.
csv_files = sorted(
    path
    for path in glob.glob(os.path.join(pdb_reports_dir, '*.csv'))
    if os.path.basename(path) != processed_output_name
)

print(f"Found {len(csv_files)} CSV files in {pdb_reports_dir}")
for file in csv_files:
    print(f"  - {os.path.basename(file)}")

In [None]:
# Read every report CSV and combine them into a single dataframe.
# Frames are collected in a list and concatenated once: calling pd.concat
# inside the loop re-copies all previously read rows on every iteration.
frames = []

for file_path in csv_files:
    print(f"Reading {os.path.basename(file_path)}...")

    # Skip the first line of the file; the second line supplies the column names.
    df = pd.read_csv(file_path, skiprows=1, header=0)
    frames.append(df)

# pd.concat raises ValueError on an empty list, so fall back to an empty
# dataframe when no input files were found.
combined_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

print(f"\nCombined dataframe shape: {combined_df.shape}")
print(f"Columns: {list(combined_df.columns)}")

# Report how many values are missing in each key column (or that the column
# is absent altogether).
key_columns = ['Entry ID', 'Deposition Date', 'Release Date', 'PDB ID']
print("\nMissing values in key columns:")
for col in key_columns:
    if col in combined_df.columns:
        missing_count = combined_df[col].isna().sum()
        print(f"{col}: {missing_count} missing values")
    else:
        print(f"{col}: Column not found")

In [None]:
# Fill missing values in the key columns using forward fill: each missing
# value is replaced by the last valid value that appeared above it.
print("Filling missing values...")

# Work on a copy so combined_df stays untouched and this cell can be re-run.
processed_df = combined_df.copy()

# Columns that need forward filling.
columns_to_fill = ['Entry ID', 'Deposition Date', 'Release Date', 'PDB ID']

# Only columns that actually exist can be filled or displayed; selecting an
# absent column below would raise KeyError.
present_fill_columns = [name for name in columns_to_fill if name in processed_df.columns]

# NOTE(review): a plain forward fill carries a value across entry boundaries
# when the first row of an entry is itself blank. Presumably the first row of
# every entry is always populated; verify against the raw reports.
for col in present_fill_columns:
    processed_df[col] = processed_df[col].ffill()

print("Missing values after filling:")
for col in columns_to_fill:
    if col in processed_df.columns:
        missing_count = processed_df[col].isna().sum()
        print(f"{col}: {missing_count} missing values")
    else:
        print(f"{col}: Column not found")

# Display a sample of the processed data (existing key columns only).
print("\nSample of processed data:")
print(processed_df[present_fill_columns].head(10))

In [None]:
# Persist the forward-filled table as a CSV inside the reports directory.
output_file = os.path.join(pdb_reports_dir, 'combined_pdb_reports_processed.csv')
print(f"Saving processed data to: {output_file}")

# index=False keeps the dataframe's row index out of the written file.
processed_df.to_csv(path_or_buf=output_file, index=False)

# Confirmation lines: row count and final shape.
print(
    f"Successfully saved {len(processed_df)} rows to {output_file}",
    f"Final dataframe shape: {processed_df.shape}",
    sep="\n",
)