# Data Cleaning
We need to clean up these parquet files before we can use them for training the neural network. On a high level, we need to:
1. Figure out the linking strategy between impressions and conversions, i.e., which impressions led to which conversions.
2. Ingest the data into a torch dataset.
   1. Remove unused or underused columns.
   2. Handle missing values
   3. Rename columns to be more descriptive.
   4. Parse any columns that need to be parsed (e.g., user-agent strings).
   5. Finally, think about what feature engineering needs to be done.

In [19]:
import pandas as pd

# Configure pandas display options for better readability
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)  # Adjust width for better table display if needed

impressions_path = './data/test_dataset/impressions_test.pqt/'
conversions_path = './data/test_dataset/conversions_test.pqt/'
device_types_path = './data/data_dictionary/device_types.csv'  # defined here, not read in this cell


def _drop_prefixed_columns(df, prefix='aip'):
    """Return `df` without the columns whose name starts with `prefix`.

    Args:
        df: DataFrame whose column labels are strings.
        prefix: Column-name prefix to remove (defaults to 'aip').

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    return df.drop(columns=[col for col in df.columns if col.startswith(prefix)])


# Load both datasets and strip the 'aip*' columns straight away.
# Note: Parquet datasets can be stored as directories. Pandas reads them correctly.
df_impressions = _drop_prefixed_columns(pd.read_parquet(impressions_path))
df_conversions = _drop_prefixed_columns(pd.read_parquet(conversions_path))

# Preview the two conversion timestamps. Jupyter only renders the LAST expression of a
# cell, so this has to come at the end (it was previously mid-cell and never shown).
df_conversions[['imp_click_dttm_utc', 'conv_dttm_utc']].head()



## 1. Identify the linking strategy between impressions and conversions.
I think the best way to do this is to use the `imp_click_dttm_utc` and `conv_dttm_utc` columns together with the ID columns: `campaign_id` and `placement_id` on the impressions side, and `imp_click_campaign_id` and `imp_click_placement_id` on the conversions side. I think this combination should be unique, but I'm not sure at the moment.

In [28]:
# Deal with timestamp precision: round both timestamps to whole seconds so that the
# join below does not miss rows purely because of differing precision.
df_impressions['dttm_utc_sec'] = df_impressions['dttm_utc'].dt.round('s')
df_conversions['imp_click_dttm_utc_sec'] = df_conversions['imp_click_dttm_utc'].dt.round('s')

# Inner join of impressions and conversions: keep only rows where the rounded timestamp
# AND the campaign/placement IDs agree. The join previously used the unrounded
# timestamp columns, which left the rounded columns computed above unused.
df_merged_correct = pd.merge(
    df_impressions,
    df_conversions,
    left_on=['dttm_utc_sec', 'campaign_id', 'placement_id'],
    right_on=['imp_click_dttm_utc_sec', 'imp_click_campaign_id', 'imp_click_placement_id'],
    how='inner',
)

# Zero rows here means no conversion matches an impression to the second.
df_merged_correct.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 0 entries
Data columns (total 26 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   dte_x                    0 non-null      object        
 1   campaign_id              0 non-null      int64         
 2   placement_id             0 non-null      int64         
 3   dttm_utc                 0 non-null      datetime64[ns]
 4   cnxn_type                0 non-null      object        
 5   user_agent               0 non-null      object        
 6   dma                      0 non-null      int32         
 7   country                  0 non-null      object        
 8   os                       0 non-null      object        
 9   prizm_premier_code       0 non-null      object        
 10  device_type              0 non-null      object        
 11  dttm_utc_sec             0 non-null      datetime64[ns]
 12  dte_y                    0 non-null      object 

In [38]:
# Columns that identify a (campaign, placement) pair on each side of the link.
imp_key_cols = ['campaign_id', 'placement_id']
conv_key_cols = ['imp_click_campaign_id', 'imp_click_placement_id']

# Distinct ID pairs seen in the impressions data
imp_ids = df_impressions[imp_key_cols].drop_duplicates()
print(f"Unique (Campaign, Placement) pairs in Impressions: {len(imp_ids)}")
print(imp_ids.head())

# Distinct ID pairs that conversions were attributed to
conv_ids = df_conversions[conv_key_cols].drop_duplicates()
print(f"\nUnique (Campaign, Placement) pairs attributed in Conversions: {len(conv_ids)}")
print(conv_ids.head())

# Inner-join the two sets of pairs on the IDs alone (no timestamps) to check
# whether the datasets share any campaign/placement combination at all.
df_id_overlap = imp_ids.merge(
    conv_ids,
    how='inner',
    left_on=imp_key_cols,
    right_on=conv_key_cols,
)

print(f"\nNumber of overlapping ID pairs found: {len(df_id_overlap)}")
display(df_id_overlap)


Unique (Campaign, Placement) pairs in Impressions: 2
    campaign_id  placement_id
0          9317        596772
31         9317        596771

Unique (Campaign, Placement) pairs attributed in Conversions: 2
    imp_click_campaign_id  imp_click_placement_id
0                    9317                  596772
12                   9317                  596771

Number of overlapping ID pairs found: 2


Unnamed: 0,campaign_id,placement_id,imp_click_campaign_id,imp_click_placement_id
0,9317,596772,9317,596772
1,9317,596771,9317,596771


In [42]:
# merge_asof needs both frames sorted by the timestamp they are matched on.
df_impressions_sorted = df_impressions.sort_values('dttm_utc')
# Use the original, unrounded timestamp for conversions here for precision
df_conversions_sorted = df_conversions.sort_values('imp_click_dttm_utc')

# Rename the conversion-side attribution IDs so both frames share the `by` column names.
# The guard keeps the cell safe if the columns were already renamed upstream.
if 'imp_click_campaign_id' in df_conversions_sorted.columns:
    df_conversions_sorted = df_conversions_sorted.rename(columns={
        'imp_click_campaign_id': 'campaign_id',
        'imp_click_placement_id': 'placement_id'
    })


# --- Regenerate the 70 matches ---
try:
    # For every conversion, take the impression with the same campaign/placement whose
    # timestamp is nearest (before or after) to the attributed impression/click time.
    df_merged_asof_large_tolerance = pd.merge_asof(
        df_conversions_sorted,  # Left dataframe (conversions)
        df_impressions_sorted,  # Right dataframe (impressions)
        left_on='imp_click_dttm_utc',  # Time column from left
        right_on='dttm_utc',           # Time column from right
        by=['campaign_id', 'placement_id'],  # Exact match columns
        direction='nearest',           # Find nearest time match
        tolerance=pd.Timedelta('6000 minutes')  # Large tolerance
    )

    # Filter out rows where merge_asof didn't find a match within tolerance.
    # .copy() gives an independent frame, so adding a column below no longer
    # triggers pandas' SettingWithCopyWarning.
    df_linked_6000min = df_merged_asof_large_tolerance.dropna(subset=['dttm_utc']).copy()

    if not df_linked_6000min.empty:
        # Absolute gap between the attributed timestamp and the matched impression
        df_linked_6000min['timestamp_diff'] = (
            df_linked_6000min['imp_click_dttm_utc'] - df_linked_6000min['dttm_utc']
        ).abs()

        # Calculate and display statistics of the difference
        print("\n--- Timestamp Difference Stats (6000 min tolerance matches) ---")
        print(f"Number of linked events: {len(df_linked_6000min)}")
        print("Timestamp Difference (imp_click_dttm_utc vs dttm_utc):")
        display(df_linked_6000min['timestamp_diff'].describe())

        # Display a few rows showing the difference
        display(df_linked_6000min[['campaign_id', 'placement_id', 'imp_click_dttm_utc', 'dttm_utc', 'timestamp_diff']].head())
    else:
        print("\nNo rows linked even with 6000 min tolerance.")

except KeyError as e:
    print(f"\nMerge_asof failed. Check column names carefully: {e}")
except Exception as e:
    print(f"\nAn error occurred during merge_asof: {e}")



--- Timestamp Difference Stats (6000 min tolerance matches) ---
Number of linked events: 70
Timestamp Difference (imp_click_dttm_utc vs dttm_utc):


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_linked_6000min['timestamp_diff'] = (df_linked_6000min['imp_click_dttm_utc'] - df_linked_6000min['dttm_utc']).abs()


count                           70
mean     1 days 00:04:23.114285714
std      1 days 04:15:04.476171219
min                0 days 00:00:14
25%                0 days 00:03:11
50%                0 days 09:08:03
75%                1 days 15:02:31
max                4 days 02:45:33
Name: timestamp_diff, dtype: object

Unnamed: 0,campaign_id,placement_id,imp_click_dttm_utc,dttm_utc,timestamp_diff
30,9317,596772,2025-04-04 21:42:19,2025-04-09 00:27:52,4 days 02:45:33
31,9317,596772,2025-04-04 22:23:34,2025-04-09 00:27:52,4 days 02:04:18
32,9317,596772,2025-04-05 00:31:43,2025-04-09 00:27:52,3 days 23:56:09
33,9317,596772,2025-04-05 21:17:21,2025-04-09 00:27:52,3 days 03:10:31
34,9317,596772,2025-04-05 21:34:48,2025-04-09 00:27:52,3 days 02:53:04


In [43]:
# --- Regenerate the 6 matches ---
# Needs df_impressions_sorted and df_conversions_sorted (with the ID columns
# already renamed) to be available in the kernel.

try:
    # Same nearest-timestamp link as before, but only accept an impression that
    # lies within one minute of the attributed impression/click time.
    df_merged_asof_small_tolerance = pd.merge_asof(
        left=df_conversions_sorted,
        right=df_impressions_sorted,
        left_on='imp_click_dttm_utc',
        right_on='dttm_utc',
        by=['campaign_id', 'placement_id'],
        tolerance=pd.Timedelta('1 minute'),
        direction='nearest',
    )

    # Conversions without a match inside the tolerance have a missing dttm_utc; drop them.
    df_linked_1min = df_merged_asof_small_tolerance.dropna(subset=['dttm_utc'])

    if df_linked_1min.empty:
        print("\nNo rows linked with 1 min tolerance.")
    else:
        print("\n--- Inspecting User Columns (1 min tolerance matches) ---")
        print(f"Number of linked events: {len(df_linked_1min)}")

        # Conversion-side / impression-side column pairs to compare side by side.
        # Confirm the exact names against df_linked_1min.columns (merge suffixes may differ).
        cols_to_display = [
            'imp_click_dttm_utc', 'dttm_utc',
            'conv_user_agent', 'user_agent',
            'conv_dma', 'dma',
            'conv_prizm_premier_code', 'prizm_premier_code',
            'conv_cnxn_type', 'cnxn_type',
            'conv_device_type', 'device_type',
        ]

        # Keep only the columns that are actually present in the merged frame
        cols_exist = [name for name in cols_to_display if name in df_linked_1min.columns]

        display(df_linked_1min[cols_exist])

except KeyError as e:
    print(f"\nMerge_asof failed. Check column names carefully: {e}")
except Exception as e:
    print(f"\nAn error occurred during merge_asof: {e}")


--- Inspecting User Columns (1 min tolerance matches) ---
Number of linked events: 6


Unnamed: 0,imp_click_dttm_utc,dttm_utc,conv_user_agent,user_agent,conv_dma,dma,conv_prizm_premier_code,prizm_premier_code,conv_cnxn_type,cnxn_type,conv_device_type,device_type
76,2025-04-09 00:33:15,2025-04-09 00:33:29,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...,Mozilla/5.0 (iPhone; CPU iPhone OS 18_3_2 like...,602,602.0,12,25,Cable/DSL,Cable/DSL,d,p
77,2025-04-09 00:33:15,2025-04-09 00:33:29,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...,Mozilla/5.0 (iPhone; CPU iPhone OS 18_3_2 like...,602,602.0,12,25,Cable/DSL,Cable/DSL,d,p
78,2025-04-09 00:33:15,2025-04-09 00:33:29,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...,Mozilla/5.0 (iPhone; CPU iPhone OS 18_3_2 like...,602,602.0,12,25,Cable/DSL,Cable/DSL,d,p
79,2025-04-09 00:33:15,2025-04-09 00:33:29,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...,Mozilla/5.0 (iPhone; CPU iPhone OS 18_3_2 like...,602,602.0,12,25,Cable/DSL,Cable/DSL,d,p
80,2025-04-09 00:33:15,2025-04-09 00:33:29,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...,Mozilla/5.0 (iPhone; CPU iPhone OS 18_3_2 like...,602,602.0,12,25,Cable/DSL,Cable/DSL,d,p
81,2025-04-09 00:33:15,2025-04-09 00:33:29,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7...,Mozilla/5.0 (iPhone; CPU iPhone OS 18_3_2 like...,602,602.0,12,25,Cable/DSL,Cable/DSL,d,p
