In [None]:
import os
import pandas as pd

# Summarise every matched-tracking CSV: one output row per file.
# Each file name (minus the ".csv" suffix) is used as the row's 'date' label.
matched_dir = '../data-clean/tracking/matched/'

# Keep only real ".csv" files; sort so the processing order does not depend on
# the arbitrary order returned by os.listdir.
files = sorted(f for f in os.listdir(matched_dir) if f.endswith('.csv'))

# Collect plain dict records and build the DataFrame once after the loop.
# Growing a DataFrame with pd.concat per iteration is quadratic and leaves the
# numeric columns with object dtype.
records = []

for file in files:
    # Strip only the trailing extension to get the date label
    date = os.path.splitext(file)[0]

    # Read the data into a DataFrame
    df = pd.read_csv(os.path.join(matched_dir, file))

    # Compute the number of rows
    no_rows = len(df)

    # Number of NaNs in the 'track_id' and 'clinic_id' columns (as plain ints)
    no_na_track_id = int(df['track_id'].isna().sum())
    no_na_clinic_id = int(df['clinic_id'].isna().sum())

    # Signed missing count:
    #   negative -> number of rows with a missing track_id
    #   positive -> number of rows with a missing clinic_id
    #   zero     -> nothing missing in either column
    # A missing track_id takes precedence: when both columns have NaNs, only
    # the track_id count is reported.
    if no_na_track_id > 0:
        count_missings = -no_na_track_id
    else:
        count_missings = no_na_clinic_id

    records.append({
        'date': date,
        'no_rows': no_rows,
        # Rows that DO have an id, i.e. total minus missing
        'no_track_id': no_rows - no_na_track_id,
        'no_clinic_id': no_rows - no_na_clinic_id,
        'count_missings': count_missings,
    })

# Explicit columns keep the expected schema even when no CSV files were found.
results_df = pd.DataFrame(
    records,
    columns=['date', 'no_rows', 'no_track_id', 'no_clinic_id', 'count_missings'],
)

results_df.sort_values(by='date')
#results_df.sort_values(by='date').to_csv("../data-check/matching.csv", index=False)


In [None]:
import matplotlib.pyplot as plt

# Distribution of the signed per-file missing counts held in results_df.
# Work on the current Axes directly (the same Axes the pyplot helpers target).
ax = plt.gca()
ax.hist(results_df['count_missings'], bins=10, edgecolor='black')

# Title and axis labels
ax.set_title('Histogram of count_missings')
ax.set_xlabel('count_missings')
ax.set_ylabel('Frequency')

# Render the figure
plt.show()