# Data Cleaning
We need to clean up these parquet files before we can use them to train the neural network. At a high level, we need to:
1. Figure out the linking strategy between impressions and conversions, i.e., which impressions led to which conversions.
2. Ingest the data into a torch dataset.
   1. Remove unused or underused columns.
   2. Handle missing values.
   3. Rename columns to be more descriptive.
   4. Parse any columns that need parsing (e.g., user-agent strings).
   5. Finally, think about what feature engineering needs to be done.

In [3]:
import pandas as pd

# Pandas display settings: show every column, cap printed rows at 100,
# and widen the console so wide tables are not wrapped.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 1000)

# Dataset locations (relative to the current working directory)
impressions_path = './data/test_dataset/impressions_test/'
conversions_path = './data/test_dataset/conversions_test/'
device_types_path = './data/data_dictionary/device_types.csv'

# Read both parquet datasets. The paths point at directories; pandas reads a
# directory-style parquet dataset through the same call as a single file.
df_impressions = pd.read_parquet(impressions_path)
df_conversions = pd.read_parquet(conversions_path)

# Quick peek at the conversion timestamps (left disabled):
# df_conversions[['imp_click_dttm_utc', 'conv_dttm_utc']].head()

# Discard every column whose name starts with 'aip' from both frames.
aip_impression_cols = [name for name in df_impressions.columns if name.startswith('aip')]
aip_conversion_cols = [name for name in df_conversions.columns if name.startswith('aip')]
df_impressions = df_impressions.drop(columns=aip_impression_cols)
df_conversions = df_conversions.drop(columns=aip_conversion_cols)



## 1. Identify the linking strategy between impressions and conversions.
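Claritas has updated the dataset to include a `unique_id` column on both the impressions and conversions datasets. This should make the linking strategy a lot easier: the join below matches `unique_id` on the impressions side to `imp_click_unique_id` on the conversions side.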

DEPRECATED (superseded by the `unique_id` approach): The earlier idea was to link on the `imp_click_dttm_utc` and `conv_dttm_utc` columns in conjunction with the `campaign_id`, `placement_id` and `imp_click_campaign_id`, `imp_click_placement_id` columns. That combination should be unique, but this was not confirmed.

In [4]:
import pandas as pd

# Assumes df_impressions and df_conversions were loaded by an earlier cell.

# --- Linking using Unique IDs ---
# Left join: every impression row is kept; the conversion columns are NaN/NaT
# wherever no conversion references the impression.
# NOTE: this is a one-to-many join. An impression referenced by several
# conversion rows appears once per conversion, so df_merged_final can contain
# more rows than df_impressions, and the same unique_id can repeat.
df_merged_final = pd.merge(
    df_impressions,
    # Only the join key plus the conversion fields needed downstream
    df_conversions[['imp_click_unique_id', 'conv_dttm_utc', 'goal_id', 'goal_name']],
    left_on='unique_id',
    right_on='imp_click_unique_id',
    how='left'
)

# Flag rows that picked up a conversion: 'conv_dttm_utc' comes from the right
# (conversions) frame, so it is non-null only where the join found a match.
df_merged_final['conversion_flag'] = df_merged_final['conv_dttm_utc'].notna().astype(int)

# Optional: Drop the redundant ID column from the conversions table if desired
# df_merged_final = df_merged_final.drop(columns=['imp_click_unique_id'])

# --- Verification ---
print("--- Merged Data Info ---")
df_merged_final.info()

# Summing conversion_flag counts impression/conversion *pairs*, which overstates
# the number of impressions whenever one impression has several conversions.
# Count distinct unique_ids instead (assumes unique_id identifies an impression).
linked_rows = df_merged_final['conversion_flag'] == 1
num_linked_pairs = int(linked_rows.sum())
num_linked_impressions = df_merged_final.loc[linked_rows, 'unique_id'].nunique()
num_rows_added_by_join = len(df_merged_final) - len(df_impressions)

print(f"\nNumber of impressions successfully linked to a conversion: {num_linked_impressions}")
print(f"Number of merged rows carrying a conversion (impression/conversion pairs): {num_linked_pairs}")
print(f"Extra rows created by the one-to-many join: {num_rows_added_by_join}")

# Columns that make it easy to eyeball whether the merge behaved as expected
check_cols = ['unique_id', 'dttm_utc', 'conv_dttm_utc', 'goal_name', 'conversion_flag']

print("\n--- Sample of Merged Data ---")
display(df_merged_final[check_cols].head())

# Check a few linked rows
print("\n--- Sample of Linked Rows (conversion_flag == 1) ---")
display(df_merged_final.loc[linked_rows, check_cols].head())

--- Merged Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3845798 entries, 0 to 3845797
Data columns (total 17 columns):
 #   Column               Dtype         
---  ------               -----         
 0   placement_id         int64         
 1   dttm_utc             datetime64[ns]
 2   cnxn_type            object        
 3   user_agent           object        
 4   dma                  int32         
 5   country              object        
 6   os                   object        
 7   prizm_premier_code   object        
 8   device_type          object        
 9   unique_id            object        
 10  campaign_id          category      
 11  dte                  category      
 12  imp_click_unique_id  object        
 13  conv_dttm_utc        datetime64[ns]
 14  goal_id              float64       
 15  goal_name            object        
 16  conversion_flag      int64         
dtypes: category(2), datetime64[ns](2), float64(1), int32(1), int64(2), object(9)
m

Unnamed: 0,unique_id,dttm_utc,conv_dttm_utc,goal_name,conversion_flag
0,b0eea5e2-0f98-4609-b6c1-141118980e82,2025-01-02 14:49:33,NaT,,0
1,b5352d12-7098-4378-929f-51db0bf40d8f,2025-01-02 18:48:43,NaT,,0
2,75d06ce4-d3a2-4ec8-94ea-42c0ffaf6ed2,2025-01-02 11:14:21,NaT,,0
3,b715b1c9-b150-4fbb-a5df-c02b4e252c9e,2025-01-02 03:12:29,NaT,,0
4,43f8dc0d-3aee-4b7d-8b11-38bb9c778671,2025-01-02 18:13:09,NaT,,0



--- Sample of Linked Rows (conversion_flag == 1) ---


Unnamed: 0,unique_id,dttm_utc,conv_dttm_utc,goal_name,conversion_flag
92,f9f72230-c174-40fe-b04d-a726297952d5,2025-01-02 22:38:19,2025-01-14 20:50:01,lead,1
93,f9f72230-c174-40fe-b04d-a726297952d5,2025-01-02 22:38:19,2025-01-14 15:09:29,lead,1
1248,164dc6fb-9af1-4bf0-8995-5947581c5e97,2025-01-02 20:42:42,2025-01-09 19:33:51,lead,1
1249,164dc6fb-9af1-4bf0-8995-5947581c5e97,2025-01-02 20:42:42,2025-01-09 19:32:10,lead,1
1250,164dc6fb-9af1-4bf0-8995-5947581c5e97,2025-01-02 20:42:42,2025-01-09 19:32:25,lead,1


In [5]:
import pandas as pd

# Relies on df_impressions and df_conversions from the loading cell.

# --- Sanity Check: ID Existence ---

# Distinct impression references carried by the conversions table
conversion_ids = df_conversions['imp_click_unique_id'].unique()
print(f"Number of unique imp_click_unique_id in conversions: {len(conversion_ids)}")
# Print the first few values to eyeball their format
print("Sample conversion IDs:", conversion_ids[:5])


# Boolean mask over impression rows whose unique_id is referenced by a conversion;
# its sum is the number of such impression rows.
matching_mask = df_impressions['unique_id'].isin(conversion_ids)
num_matches_found = matching_mask.sum()

print(f"\nNumber of impression unique_ids found within conversion imp_click_unique_ids: {num_matches_found}")

# Report the no-match case first; otherwise list a handful of the matched IDs.
if num_matches_found <= 0:
    print("\nNo unique_ids from the impressions dataset were found in the conversions dataset's imp_click_unique_id column.")
else:
    matching_impression_ids = df_impressions['unique_id'][matching_mask].unique()
    print("\nSample of matching unique_ids found in impressions:")
    print(matching_impression_ids[:10])  # at most 10 matching IDs



Number of unique imp_click_unique_id in conversions: 7827
Sample conversion IDs: ['127f8d07-ca50-4e34-af4f-1e10628d8f99'
 '61c40d8a-8540-433a-951e-e295c90957a4'
 '77eb1ab1-354e-440a-aa5e-ecdde7f11fac'
 'de677643-3e11-41a9-b332-f2e3efd825f6'
 'dd0fd289-8e3a-4508-b3f8-839cf2e322e3']

Number of impression unique_ids found within conversion imp_click_unique_ids: 7775

Sample of matching unique_ids found in impressions:
['f9f72230-c174-40fe-b04d-a726297952d5'
 '164dc6fb-9af1-4bf0-8995-5947581c5e97'
 '54196480-920d-4aba-9121-f8d965b222b7'
 '6dd7c964-fc02-4ac1-a103-051831c4d473'
 '3b64a814-ddc7-4263-bffd-fc17bdc9c3a4'
 'b0f44e31-9af3-4ca3-b5df-82f8b0eed663'
 'dfc7af41-18a3-46eb-a46e-9792edb780dc'
 '0faf1ba0-8717-4c76-8676-6cfacb2d8e64'
 '6cb54575-6ad6-4fb7-b785-1ea70a9dce7a'
 'a317be69-c986-4263-9f68-6bcc622f6bac']
