In [None]:
import os
import pandas as pd

# Directory holding the matched tracking CSVs; each file name (without the
# '.csv' extension) is used as the value of the 'date' column.
matched_dir = '../data-clean/tracking/matched/'

# List the CSV files. Sorted because os.listdir returns entries in arbitrary
# order, which would make the row order of results_df vary between runs.
files = sorted(f for f in os.listdir(matched_dir) if f.endswith('.csv'))

# Collect one summary record per file and build the DataFrame once at the end.
# Concatenating onto an empty, column-only DataFrame inside the loop is
# quadratic and leaves the numeric columns with object dtype.
records = []

for file in files:
    # The file name without its extension is taken as the date
    date = os.path.splitext(file)[0]

    # Read the data into a DataFrame
    df = pd.read_csv(os.path.join(matched_dir, file))

    # Number of rows and number of missing values in the two id columns
    no_rows = len(df)
    no_na_track_id = int(df['track_id'].isna().sum())
    no_na_clinic_id = int(df['clinic_id'].isna().sum())

    # Signed count of missings:
    #   negative -> number of rows without a track_id
    #   positive -> number of rows without a clinic_id
    #   zero     -> nothing missing
    # When both columns have missings only the track_id count is reported.
    count_missings = -no_na_track_id if no_na_track_id > 0 else no_na_clinic_id

    records.append({
        'date': date,
        'no_rows': no_rows,
        'no_track_id': no_rows - no_na_track_id,
        'no_clinic_id': no_rows - no_na_clinic_id,
        'count_missings': count_missings,
    })

# Passing the column list keeps the expected columns even when no file was found
results_df = pd.DataFrame(
    records,
    columns=['date', 'no_rows', 'no_track_id', 'no_clinic_id', 'count_missings'],
)

results_df.sort_values(by='date')
#results_df.sort_values(by='date').to_csv("../data-check/matching.csv", index=False)


In [None]:
import matplotlib.pyplot as plt

# Distribution of the 'count_missings' column of results_df (one value per
# row), drawn through the explicit Axes interface.
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(results_df['count_missings'], bins=10, edgecolor='black')

# Title and axis labels
hist_ax.set(
    title='Histogram of count_missings',
    xlabel='count_missings',
    ylabel='Frequency',
)

# Render the figure
plt.show()

In [None]:
import os
import pandas as pd

def _longest_tb_period_start(track):
    """Return the start time of the longest uninterrupted ``in_tb_pat`` run.

    Parameters
    ----------
    track : pandas.DataFrame
        Rows of a single track with the columns ``time`` and ``in_tb_pat``.

    Returns
    -------
    The smallest ``time`` of the run of consecutive ``in_tb_pat == True`` rows
    (in time order) with the largest ``max(time) - min(time)``; the earliest
    such run on ties. ``None`` when no row has ``in_tb_pat`` equal to True.
    """
    track = track.sort_values(by='time')

    # eq(True) gives a plain boolean mask even if the column holds missing values
    in_tb = track['in_tb_pat'].eq(True)
    if not in_tb.any():
        return None

    # A new run id starts whenever the flag differs from the previous row
    # (the first row always starts a run because its diff is NaN).
    run_id = in_tb.astype(int).diff().ne(0).cumsum()

    # First and last time of every run in which the flag is True
    spans = track.loc[in_tb, 'time'].groupby(run_id[in_tb].to_numpy()).agg(['min', 'max'])
    longest = (spans['max'] - spans['min']).idxmax()
    return spans.loc[longest, 'min']


def get_tb_tracks(date, data_dir='../data-clean'):
    """Pair the sure-TB tracks of one day with that day's clinical TB cases.

    Reads ``tracking/unlinked/<date>.csv``, ``tracking/linked-tb/<date>.csv``
    and ``clinical/tb_cases.csv`` below ``data_dir``. The two tracking tables
    are left-joined on ``track_id``; the joined table must contain the columns
    ``sure_tb``, ``new_track_id``, ``in_tb_pat`` and ``time``. The clinical
    table must contain ``date`` and ``clinic_id``.

    Parameters
    ----------
    date : str
        Day to process; used as the file name stem and compared for equality
        with the ``date`` column of the clinical table.
    data_dir : str, optional
        Root directory of the cleaned data. Defaults to ``'../data-clean'``.

    Returns
    -------
    combined_df : pandas.DataFrame
        One row per sure-TB ``new_track_id`` with its ``start_time`` (start of
        the longest uninterrupted ``in_tb_pat`` period, converted from
        milliseconds since the Unix epoch) next to the columns of the clinical
        row at the same position.
    tracks_df : pandas.DataFrame
        The joined tracking rows that received a ``clinic_id``.

    Raises
    ------
    ValueError
        If a sure-TB track has no row with ``in_tb_pat`` equal to True.
    AssertionError
        If the number of sure-TB tracks differs from the number of clinical
        rows for ``date``.
    """
    # Load the CSV files into pandas DataFrames
    tracks_df = pd.read_csv(os.path.join(data_dir, 'tracking', 'unlinked', f"{date}.csv"))
    links_df = pd.read_csv(os.path.join(data_dir, 'tracking', 'linked-tb', f"{date}.csv"))
    clinic_df = pd.read_csv(os.path.join(data_dir, 'clinical', 'tb_cases.csv'))

    # Keep only the clinical rows of the requested day
    clinic_df = clinic_df[clinic_df['date'] == date]

    # Left join so that tracks without a link are kept (their link columns are NaN)
    tracks_df = tracks_df.merge(links_df, on='track_id', how='left')

    # Rows whose 'sure_tb' flag is True (missing flags compare as False)
    tb_tracks = tracks_df[tracks_df['sure_tb'].eq(True)]

    # One record per new_track_id. Iterating over the groups instead of using
    # groupby.apply keeps the result well-formed when there are no sure-TB
    # tracks and avoids applying a function to the grouping column.
    records = []
    for new_track_id, track in tb_tracks.groupby('new_track_id'):
        start_time = _longest_tb_period_start(track)
        if start_time is None:
            raise ValueError(
                f"Track {new_track_id!r} has sure_tb set but no row with in_tb_pat == True"
            )
        records.append({'new_track_id': new_track_id, 'start_time': start_time})

    tb_tracks = pd.DataFrame(records, columns=['new_track_id', 'start_time'])
    # Keep the key dtype identical to tracks_df so the merge below also works
    # when tb_tracks is empty (an empty frame would otherwise have object dtype).
    tb_tracks = tb_tracks.astype({'new_track_id': tracks_df['new_track_id'].dtype})

    # Convert the Unix timestamp (milliseconds) to a proper timestamp
    tb_tracks['start_time'] = pd.to_datetime(tb_tracks['start_time'], unit='ms')

    # Ensure the number of rows in tb_tracks equals the number of rows in clinic_df
    assert len(tb_tracks) == len(clinic_df), "Number of rows in tb_tracks does not match clinic_df"

    # Column bind tb_tracks and clinic_df by position.
    # NOTE(review): tb_tracks is ordered by new_track_id and clinic_df by file
    # order; with more than one case per day this pairing presumably relies on
    # both orders corresponding -- confirm.
    combined_df = pd.concat([tb_tracks, clinic_df.reset_index(drop=True)], axis=1)

    # Attach the clinic_id to every tracking row of a paired new_track_id
    tracks_df = tracks_df.merge(combined_df[['new_track_id', 'clinic_id']], on='new_track_id', how='left')

    # Remove tracks without a clinic_id
    tracks_df = tracks_df.dropna(subset=['clinic_id'])

    return combined_df, tracks_df

# Example usage: run the matching for a single day. The resulting
# module-level `tracks_df` is the frame passed to plot_tracks further down.
date = '2024-06-20'
combined_df, tracks_df = get_tb_tracks(date)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

def plot_tracks(tracks_df):
    """Plot, per track, the times at which ``in_tb_pat`` is True.

    Each distinct ``new_track_id`` gets its own horizontal level (in order of
    first appearance) and every row with ``in_tb_pat`` equal to True is drawn
    as a blue ``|`` marker at its time. Rows with a missing ``in_tb_pat`` are
    not drawn.

    Parameters
    ----------
    tracks_df : pandas.DataFrame
        Must contain the columns ``time``, ``new_track_id`` and ``in_tb_pat``.
        ``time`` is interpreted as milliseconds since the Unix epoch unless it
        already has a datetime dtype. The DataFrame is not modified.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Convert on a local Series instead of assigning back into tracks_df, so
    # the caller's frame is left untouched and the function can be re-run on
    # the same frame.
    times = tracks_df['time']
    if not pd.api.types.is_datetime64_any_dtype(times):
        times = pd.to_datetime(times, unit='ms')

    # Boolean mask of the rows to draw; missing flags compare as False
    in_tb = tracks_df['in_tb_pat'].eq(True)

    # Get unique new_track_ids
    track_ids = tracks_df['new_track_id']
    unique_ids = track_ids.unique()

    # Create a figure and axis
    fig, ax = plt.subplots(figsize=(12, 8))

    # One plot call per track (markers only) instead of one call per row
    for i, track_id in enumerate(unique_ids):
        track_times = times[(track_ids == track_id) & in_tb]
        if track_times.empty:
            continue
        ax.plot(
            track_times,
            [i] * len(track_times),
            linestyle='none',
            marker='|',
            markersize=10,
            color='blue',
        )

    # Set y-axis ticks and labels
    ax.set_yticks(range(len(unique_ids)))
    ax.set_yticklabels(unique_ids)

    # Set labels and title
    ax.set_xlabel('Time')
    ax.set_ylabel('Track ID')
    ax.set_title('Tracks in TB Patient')

    # Show grid
    ax.grid(True)

    # Show the plot
    plt.show()

# Example usage: plot the module-level `tracks_df`, which must already be
# defined when this cell runs.
plot_tracks(tracks_df)