In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/pumpfun-30s-september-2025/september_2025_first30s_chunk_012.csv
/kaggle/input/pumpfun-30s-september-2025/september_2025_first30s_chunk_010.csv
/kaggle/input/pumpfun-30s-september-2025/september_2025_first30s_chunk_001.csv
/kaggle/input/pumpfun-30s-september-2025/september_2025_first30s_chunk_008.csv
/kaggle/input/pumpfun-30s-september-2025/september_2025_first30s_chunk_005.csv
/kaggle/input/pumpfun-30s-september-2025/september_2025_first30s_chunk_011.csv
/kaggle/input/pumpfun-30s-september-2025/september_2025_first30s_chunk_007.csv
/kaggle/input/pumpfun-30s-september-2025/september_2025_first30s_chunk_009.csv
/kaggle/input/pumpfun-30s-september-2025/september_2025_first30s_chunk_014.csv
/kaggle/input/pumpfun-30s-september-2025/september_2025_first30s_chunk_003.csv
/kaggle/input/pumpfun-30s-september-2025/september_2025_first30s_chunk_002.csv
/kaggle/input/pumpfun-30s-september-2025/september_2025_first30s_chunk_004.csv
/kaggle/input/pumpfun-30s-september-2025/september_2

# Phase 1: Data Integration and Target Identification
This phase focuses on preparing the master transaction dataset (df_master) for modeling by integrating the separate target token list (target_df) and creating the essential target variable.

**Step 1**: Data Preparation (df_master.drop(columns=['index'], inplace=True))
Action: I removed the column named 'index' from the main transaction DataFrame, df_master.

Purpose: The 'index' column in this context likely represents a redundant row ID or sequence number from the original chunk files. It is removed to clean the dataset and ensure it doesn't accidentally interfere with the modeling process (e.g., being treated as a meaningful feature) or take up unnecessary memory.

**Step 2** : Extracting Target Token Addresses (target_ids = set(target_df['Target Token Addresses']))
Action: I extracted all unique token addresses from the 'Target Token Addresses' column of the smaller target_df (the list of tokens bought by the target wallet). These addresses were stored as a Python set named target_ids.

Purpose: Using a set is crucial here because it provides highly optimized performance for checking membership. This ensures that the next step—checking if a token exists in this list—is executed as quickly and efficiently as possible across the millions of rows in the df_master DataFrame.

**Step 3**: Creating the Binary Target Variable (df_master['is_target'] = df_master['mint_token_id'].isin(target_ids).astype(int))
Action: I created a new binary column, is_target, in the main transaction DataFrame (df_master). This column is the prediction target for the classification task.

Mechanism:

It checks every token's unique ID (mint_token_id) in df_master.

The .isin(target_ids) method returns a Boolean series (True/False) indicating whether the mint_token_id is present in the target_ids set (i.e., whether the token is a "target" token).

The .astype(int) method then converts this Boolean series into an integer series, where:

1 (True) means the token is a target token.

0 (False) means the token is not a target token.

Result: The df_master DataFrame is now enriched with the is_target column, transforming the raw transaction data into a supervised learning dataset ready for feature engineering and model training.

In [2]:
import pandas as pd
import os

# --- Configuration (Assuming 'DATA_DIR' is defined in the surrounding script) ---

# WARNING: The provided code snippet uses 'DATA_DIR' in the loop, but defined 'DATA_DIR_test'.
# Assuming 'DATA_DIR' (the path to the files) is the correct variable to use for the files.
# If DATA_DIR_test is the intended path, the file_paths_test list creation needs fixing.
DATA_DIR_test = "/kaggle/input/alpha-radar-solana-sprint/"
DATA_DIR = DATA_DIR_test # Use a single, consistent path variable for the file list creation

# List of file names to load (5 chunks of the evaluation set)
file_names = [
    "evaluation_set_30s_chunk_001.csv",
    "evaluation_set_30s_chunk_002.csv",
    "evaluation_set_30s_chunk_003.csv",
    "evaluation_set_30s_chunk_004.csv",
    "evaluation_set_30s_chunk_005.csv"
]

file_paths_eval = [os.path.join(DATA_DIR, f) for f in file_names]

# List to hold individual DataFrames with test-specific names
dfs_list_eval = []
df_names_eval = ['eval_chunk1_df', 'eval_chunk2_df', 'eval_chunk3_df', 'eval_chunk4_df', 'eval_chunk5_df']

print("Starting data loading and naming individual evaluation DataFrames...")

# --- Step 1: Load and Store Individual Chunks ---

# Loop through the file paths and load them, assigning easy names
for path, name in zip(file_paths_eval, df_names_eval):
    try:
        df_chunk = pd.read_csv(path)
        
        # Create variables with dynamic names (e.g., eval_chunk1_df)
        # We explicitly name them as requested, though a dictionary is generally safer.
        exec(f"{name} = df_chunk")
        
        dfs_list_eval.append(df_chunk)
        print(f"Loaded {path.split('/')[-1]} and stored in variable '{name}' with {len(df_chunk)} rows.")
        
    except FileNotFoundError:
        print(f"ERROR: File not found at {path}. Please check the path.")

# --- Step 2: Concatenate all chunks for a unified Evaluation dataset ---

# Concatenate all evaluation chunks into a single DataFrame, named 'df_eval'
df_test = pd.concat(dfs_list_eval, ignore_index=True)
print(f"\nAll chunks concatenated into the **evaluation master DataFrame ('df_eval')**. Total rows: **{len(df_test)}**.")

Starting data loading and naming individual evaluation DataFrames...
Loaded evaluation_set_30s_chunk_001.csv and stored in variable 'eval_chunk1_df' with 290810 rows.
Loaded evaluation_set_30s_chunk_002.csv and stored in variable 'eval_chunk2_df' with 290810 rows.
Loaded evaluation_set_30s_chunk_003.csv and stored in variable 'eval_chunk3_df' with 290810 rows.
Loaded evaluation_set_30s_chunk_004.csv and stored in variable 'eval_chunk4_df' with 290810 rows.
Loaded evaluation_set_30s_chunk_005.csv and stored in variable 'eval_chunk5_df' with 252345 rows.

All chunks concatenated into the **evaluation master DataFrame ('df_eval')**. Total rows: **1415585**.


In [3]:
import pandas as pd
import os 

# Define the file path for the target dataset on Kaggle
file_path = '/kaggle/input/alpha-radar-tokens/Alpha Radar Target Tokens.csv'

# Read the CSV file and store the DataFrame under the name 'target_df'
try:
    target_df = pd.read_csv(file_path)

    # Display the first few rows to confirm successful loading
    print("Target dataset successfully read into 'target_df'.")
    print("\nFirst 5 rows:")
    print(target_df.head())

    # Display the dimensions of the dataset (rows, columns)
    print("\nDataset shape (rows, columns):", target_df.shape)

except FileNotFoundError:
    print(f"ERROR: File path not found. Please ensure the path '{file_path}' is correct and you are running in a Kaggle environment.")
except Exception as e:
    print(f"An error occurred while reading the file: {e}")

Target dataset successfully read into 'target_df'.

First 5 rows:
                         Target Token Addresses
0  12ervyBwQpyhnpfNX9zUxpz66A2JbgZ7396uAxHMpump
1  12exjzy4GStSzfpS8pKwXCkPL4TGo5Jf8zPykc5spump
2  12iraS1RkG58SDb3upY3VsCPXfm7VJ4MZ7inDrsvpump
3  12s4BD7z6bFEiz1JLjXKcK6QpG9ZfajoBAQqD66Jpump
4  12T4SMPYZ2j3ge5JUh9P1huVMJAzZhWZCeKUrtwcpump

Dataset shape (rows, columns): (9189, 1)


In [4]:
import pandas as pd
import os

DATA_DIR = "/kaggle/input/pumpfun-30s-september-2025"

# List of file names to load (only the 5 chunks you provided)
file_paths = [
    os.path.join(DATA_DIR, "september_2025_first30s_chunk_001.csv"),
    os.path.join(DATA_DIR, "september_2025_first30s_chunk_002.csv"),
    os.path.join(DATA_DIR, "september_2025_first30s_chunk_003.csv"),
    os.path.join(DATA_DIR, "september_2025_first30s_chunk_004.csv"),
    os.path.join(DATA_DIR, "september_2025_first30s_chunk_005.csv"),
    os.path.join(DATA_DIR, "september_2025_first30s_chunk_006.csv"),
    os.path.join(DATA_DIR, "september_2025_first30s_chunk_007.csv"),
    os.path.join(DATA_DIR, "september_2025_first30s_chunk_008.csv"),
    os.path.join(DATA_DIR, "september_2025_first30s_chunk_009.csv"),
    os.path.join(DATA_DIR, "september_2025_first30s_chunk_010.csv"),
    os.path.join(DATA_DIR, "september_2025_first30s_chunk_011.csv"),
    os.path.join(DATA_DIR, "september_2025_first30s_chunk_012.csv"),
    os.path.join(DATA_DIR, "september_2025_first30s_chunk_013.csv"),
    os.path.join(DATA_DIR, "september_2025_first30s_chunk_014.csv"),
    os.path.join(DATA_DIR, "september_2025_first30s_chunk_015.csv"),
]

# List to hold individual DataFrames with easy variable names
dfs_list = []
df_names = ['sf1_df', 'sf2_df', 'sf3_df', 'sf4_df', 'sf5_df', 'sf6_df', 'sf7_df', 'sf8_df', 'sf9_df', 'sf10_df', 'sf11_df', 'sf12_df', 'sf13_df', 'sf14_df', 'sf15_df']

print("Starting data loading and naming individual DataFrames...")

# Loop through the file paths and load them, assigning easy names
for path, name in zip(file_paths, df_names):
    try:
        df_chunk = pd.read_csv(path)
        # Use exec() to create variables with dynamic names (e.g., es1_df = pd.read_csv(...))
        # Note: In production code, using a dictionary is often cleaner than exec, 
        # but for specific naming requests, this works.
        exec(f"{name} = df_chunk")
        dfs_list.append(df_chunk)
        print(f"Loaded {path.split('/')[-1]} and stored in variable '{name}' with {len(df_chunk)} rows.")
    except FileNotFoundError:
        print(f"ERROR: File not found at {path}. Please check the path.")

# --- Step 2: Concatenate all chunks for a unified dataset ---

# Concatenate all transaction chunks into a single master DataFrame
df_master = pd.concat(dfs_list, ignore_index=True)
print(f"\nAll chunks concatenated into 'df_master'. Total rows: {len(df_master)}.")

# --- Step 3: Initial Check (Optional but recommended) ---

# Verify one of the easy-named DFs and the master DF
print(f"\nFirst 5 rows of {df_names[0]}:")
print(eval(df_names[0]).head()) # Using eval to access the dynamically created variable

print("\nFirst 5 rows of the master DataFrame (df_master):")
print(df_master.head())

Starting data loading and naming individual DataFrames...
Loaded september_2025_first30s_chunk_001.csv and stored in variable 'sf1_df' with 719026 rows.
Loaded september_2025_first30s_chunk_002.csv and stored in variable 'sf2_df' with 800000 rows.
Loaded september_2025_first30s_chunk_003.csv and stored in variable 'sf3_df' with 640000 rows.
Loaded september_2025_first30s_chunk_004.csv and stored in variable 'sf4_df' with 751145 rows.
Loaded september_2025_first30s_chunk_005.csv and stored in variable 'sf5_df' with 640000 rows.
Loaded september_2025_first30s_chunk_006.csv and stored in variable 'sf6_df' with 640000 rows.
Loaded september_2025_first30s_chunk_007.csv and stored in variable 'sf7_df' with 640000 rows.
Loaded september_2025_first30s_chunk_008.csv and stored in variable 'sf8_df' with 544354 rows.
Loaded september_2025_first30s_chunk_009.csv and stored in variable 'sf9_df' with 620691 rows.
Loaded september_2025_first30s_chunk_010.csv and stored in variable 'sf10_df' with 6400

In [5]:
df_master.drop(columns=['index'], inplace=True)
target_ids = set(target_df['Target Token Addresses'])
df_master['is_target'] = df_master['mint_token_id'].isin(target_ids).astype(int)
print(df_master)

        timestamp                                 mint_token_id  \
0         26:28.7  4shdnB4wA5TA5wphgD1gQFmbaAACtbUyiepLDUoopump   
1         26:28.7  25moJ9zLHE2Q9oisqSMKjkS4XjaGL5J6pP1yftDYpump   
2         26:28.7  DHgTSAVz1bGxqzGGcEX9Zw46siX5kYCMtFcy42mXpump   
3         26:28.7  CHc1G8amTYHTphEm3zWewWi1oULrDHRx3tiAbbRVpump   
4         26:28.8  FKQKMMJAJpMQb93ghC6vXby6ebwimyNDALpjuRX7pump   
...           ...                                           ...   
9478701   52:30.7  J1Ffx7CSdpGt2DWfgV1s7Yutp5i4My1fPX5MyPfuusFM   
9478702   52:30.7  J1Ffx7CSdpGt2DWfgV1s7Yutp5i4My1fPX5MyPfuusFM   
9478703   52:32.0  J1Ffx7CSdpGt2DWfgV1s7Yutp5i4My1fPX5MyPfuusFM   
9478704   52:33.7  J1Ffx7CSdpGt2DWfgV1s7Yutp5i4My1fPX5MyPfuusFM   
9478705   52:33.8  J1Ffx7CSdpGt2DWfgV1s7Yutp5i4My1fPX5MyPfuusFM   

                                               holder trade_mode  \
0        86iX4v3uSvAMWsYHLVzrmKYYstf1ZxevMoe3mMDNVZYE       sell   
1        HD2N5YvYZQiqqocCFj9MgD8WnRM8uCqWxA1UF3FaDzy2      

In [6]:
df_test.head()

Unnamed: 0,timestamp,mint_token_id,holder,trade_mode,token_quantity,creator,creator_fee,creator_fee_pump,market_cap_usd,token_delta,...,volume_oscillator,rate_of_change,money_flow_index,total_holders,current_holders,top10_percent_total,creator_balance,creator_sold,holder_ratio,buy_sell_ratio
0,2025-10-05 00:01:10.846000+00:00,2yswWAzwhgmEzahL3hbFRtniP66t4FL1nJ6byGzGpump,GAQKGRHz82JHoR3VbSHG4a52EMKwsntKTwmZDzEWaGt1,buy,2264273.0,7moqFjvm2MwAiMtCZoqYoTAPzRBxxMRT2ddyHThQuWjr,444375.0,1407188.0,15300.513435,2264273.0,...,0.0,0.0,50.0,1,1,0.226427,0.0,False,1.0,1.75
1,2025-10-05 00:01:10.951000+00:00,2yswWAzwhgmEzahL3hbFRtniP66t4FL1nJ6byGzGpump,HnDfmx7aWcF5Wc1KVYxcE3Edwysv6hWqAQnRVCdJmyGt,sell,-1079758.0,7moqFjvm2MwAiMtCZoqYoTAPzRBxxMRT2ddyHThQuWjr,212266.0,672176.0,15253.442878,-1079758.0,...,0.0,0.0,50.0,2,1,0.226427,0.0,False,2.0,1.4
2,2025-10-05 00:01:11.078000+00:00,FSLykFLox7kbrZqTqV9uRJWstLv31Leqv4acX2crpump,Bp92FxsKUmKTcAVhWtTRnhL7SLy7FH2D7gtoS4Hx9LKM,sell,-1145310.0,HAaeVqkChkaAh68q3LUGeuA5uFF2xvmoQbnvMxFAtU4H,110712.0,350585.0,7503.317913,-1145310.0,...,0.0,0.0,50.0,1,0,0.0,0.0,False,0.0,0.0
3,2025-10-05 00:01:11.339000+00:00,EGi81NSf9QQfWAew73jh6D8YfLuFFkSmf2G2ooGypump,HGGVo1rsELahsumCmieRAg6ojSeuuUwMtenq3cm2uf96,buy,4299319.0,HZe31qur7xdWWZ3Di7wDxDUgxTfHeJ5TTCCsWg237tXp,739885.0,2342968.0,13450.552389,4299319.0,...,0.0,0.0,50.0,1,1,0.429932,0.0,False,1.0,3.0
4,2025-10-05 00:01:11.566000+00:00,843a3JnskkJFFdv3HMwfthvaGdMG61yg54V7THuYpump,HV1KXxWFaSeriyFvXyx48FqG9BoFbfinB8njCJonqP7K,buy,2194562.0,RLz7Uj9QXAsz1ASCP1e6d1GLtPDkksAtorNSEx1XzZr,1630504.0,5163261.0,58089.914546,2194562.0,...,0.0,0.0,50.0,1,1,0.219456,0.0,False,1.0,1.0


# Phase 2: Time Transformation Explanation

The goal of this phase is to convert the non-standard, relative time format in the original timestamp column (e.g., '26:28.7') into an absolute, globally recognizable datetime object by referencing a fixed starting point.

1. Defining the Reference Point (start_date)
This line establishes the "epoch" or initial reference date for the time series. All subsequent relative time measurements will be calculated as an offset from this fixed point in time (September 1, 2025, 00:00:00).

2. Custom Conversion Function (custom_time_to_timedelta)
* Parsing: It splits the input string (e.g., '26:28.7') at the colon (:) into two floating-point numbers: minutes and seconds.
* Calculation: It calculates the total elapsed time in seconds using the formula: $TotalSeconds = (Minutes \times 60) + Seconds$.
* Conversion: It uses pd.to_timedelta() to convert this total elapsed time into a pandas timedelta object (pd.Timedelta). This object represents a duration (e.g., 26 minutes and 28.7 seconds).
* Error Handling: The try...except block ensures that if any row contains a non-parsable format, it returns pd.NaT (Not a Time) instead of crashing the process.

3. Applying the Transformation

* The first line applies the custom function to every entry in the original timestamp column, creating a temporary time_delta column which stores the duration relative to the start.

* The second line performs the core transformation: it adds the calculated duration (time_delta) to the fixed start_date. This results in the final, absolute datetime value in the new_timestamp column.

**The final two lines clean up the DataFrame by:**

Dropping the two temporary columns: the original non-standard timestamp and the intermediate time_delta.

Renaming the newly created, correctly formatted new_timestamp column back to timestamp, ensuring consistency for future feature engineering steps.

In [7]:
start_date = pd.to_datetime('2025-09-01 00:00:00')
def custom_time_to_timedelta(time_str):
    try:

        minutes, seconds = map(float, time_str.split(':'))
  
        total_seconds = (minutes * 60) + seconds
     
        return pd.to_timedelta(total_seconds, unit='S')
    except:
   
        return pd.NaT

df_master['time_delta'] = df_master['timestamp'].apply(custom_time_to_timedelta)

df_master['new_timestamp'] = start_date + df_master['time_delta']

  return pd.to_timedelta(total_seconds, unit='S')


In [8]:
df_master.head()

Unnamed: 0,timestamp,mint_token_id,holder,trade_mode,token_quantity,creator,creator_fee,creator_fee_pump,market_cap_usd,token_delta,...,total_holders,current_holders,top10_percent_total,creator_balance,creator_sold,holder_ratio,buy_sell_ratio,is_target,time_delta,new_timestamp
0,26:28.7,4shdnB4wA5TA5wphgD1gQFmbaAACtbUyiepLDUoopump,86iX4v3uSvAMWsYHLVzrmKYYstf1ZxevMoe3mMDNVZYE,sell,-101489.9,8NJ7Ujpji8uMF2675mqaTSEm2DCbfJA7fiRKtiaqkaLN,5836.0,110875.0,16671.28835,-101489.9,...,1,0,0.0,0.0,False,0.0,1.526786,0,0 days 00:26:28.700000,2025-09-01 00:26:28.700
1,26:28.7,25moJ9zLHE2Q9oisqSMKjkS4XjaGL5J6pP1yftDYpump,HD2N5YvYZQiqqocCFj9MgD8WnRM8uCqWxA1UF3FaDzy2,sell,-2354414.0,EC6F58bJae7rLn1MJomScWe9pj7p94cfHKxtirN1hBrW,70844.0,1346020.0,8697.908552,-2354414.0,...,1,0,0.0,0.0,False,0.0,1.395833,0,0 days 00:26:28.700000,2025-09-01 00:26:28.700
2,26:28.7,DHgTSAVz1bGxqzGGcEX9Zw46siX5kYCMtFcy42mXpump,AL5KUzW7dNEig3MDE8XgPkbhbAS466vfbbQAJE4ymCa9,sell,-204940.3,EsHu56bkASf1d38jt4dLN6QxCCBk5CF4VKifANrFowKn,5900.0,112100.0,8346.483698,-204940.3,...,1,0,0.0,0.0,False,0.0,1.515152,0,0 days 00:26:28.700000,2025-09-01 00:26:28.700
3,26:28.7,CHc1G8amTYHTphEm3zWewWi1oULrDHRx3tiAbbRVpump,DGGu9TY35fFvTZKq34wE6mKMVbM2trXQp8Hg7TnrNhjq,buy,41067020.0,H927REVfidMopK91pZp8AWy26k2vp1HHuv1Zx7Erukhd,750001.0,14250001.0,5533.028652,41067020.0,...,1,1,4e-06,0.0,False,1.0,1.210526,0,0 days 00:26:28.700000,2025-09-01 00:26:28.700
4,26:28.8,FKQKMMJAJpMQb93ghC6vXby6ebwimyNDALpjuRX7pump,BwfQncZ114sFkcdecRELVRyDjc6BrHY9MoK338wLm4qT,sell,-6508975.0,Hu655uJALykrnReosqnGHKWdFGECsRAzHdA5obGhZ24i,100965.0,1918329.0,4469.713497,-6508975.0,...,1,0,0.0,0.0,False,0.0,1.195122,0,0 days 00:26:28.800000,2025-09-01 00:26:28.800


In [9]:
df_master = df_master.drop(columns=['time_delta', 'timestamp'])

In [10]:
df_master = df_master.rename(columns={'new_timestamp': 'timestamp'})

In [11]:
df_master.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9478706 entries, 0 to 9478705
Data columns (total 34 columns):
 #   Column                       Dtype         
---  ------                       -----         
 0   mint_token_id                object        
 1   holder                       object        
 2   trade_mode                   object        
 3   token_quantity               float64       
 4   creator                      object        
 5   creator_fee                  float64       
 6   creator_fee_pump             float64       
 7   market_cap_usd               float64       
 8   token_delta                  float64       
 9   sol_delta                    float64       
 10  buy_count                    int64         
 11  sell_count                   int64         
 12  total_count                  int64         
 13  token_volume                 float64       
 14  sol_volume                   float64       
 15  liquidity_ratio              float64       
 16  

# Phase 3: Core Setup and Global Variables
The goal of this phase is to convert the non-standard, relative time format in the original timestamp column (e.g., '26:28.7') into an absolute, globally recognizable datetime object by referencing a fixed starting point.

**Imports**: Imports necessary libraries: pandas for data handling, numpy for numerical operations, sklearn.model_selection for robust training/validation splits, CatBoostClassifier for modeling, scipy.stats.mstats.winsorize for outlier handling, and tqdm for tracking progress.

**Settings**: Sets Pandas to display all columns ('display.max_columns', None).

**GROUP_KEY**: Defines the central identifier ('mint_token_id') used to group transactions, which is crucial because the prediction target (is_target) is at the token level, not the transaction level.

**1. Base Feature Aggregation (add_base_features)**
This function transforms the transactional data into token-level summary statistics. Since the target is defined per token, this aggregation step is essential.

Feature Creation: It calculates a wide range of aggregate statistics (sum, mean, max, min, std, median) for every numeric column for each unique mint_token_id.

Example: Instead of having thousands of token_quantity entries, you get features like token_quantity__sum, token_quantity__mean, etc., which summarize the token's activity.

Special Handling: consumed_gas gets a restricted set of aggregates (sum, mean, max), likely due to domain knowledge or data distribution.

df.groupby(GROUP_KEY)[col].transform(agg): The use of .transform() is key here. It applies the aggregation to the grouped data but returns a series of the same size as the original DataFrame, allowing the aggregated values to be joined back easily.

Output: df_agg is created by dropping duplicates on GROUP_KEY, resulting in a DataFrame where each row represents a single token with its newly created summary features.

**2. Windowing and Derived Features (add_window_and_post_features)**
This function adds features that capture the time-varying dynamics and derived trading metrics of each token.

Rolling Window Features: This is the most complex part, leveraging the corrected timestamp column.

It first sorts the data by GROUP_KEY and timestamp.

It sets the timestamp as the index, which is required for Pandas' time-based rolling functionality.

It uses WINDOW_SIZES ('5s', '10s', '20s') with groupby(GROUP_KEY)[col].rolling(window=window, closed='left') to calculate metrics (like sum and mean) of the past 5, 10, and 20 seconds before the current transaction. This prevents data leakage.

Finally, since the target is token-level, it merges the last observed window values (df_last_row.drop_duplicates(..., keep='last')) onto the aggregated df_agg.

Post-Aggregation Features: These are calculated after the base aggregation, combining existing summary features to create new, informative ratios and rates:

Volume per Transaction: sol_volume__per_tx (Average SOL volume traded in one transaction).

Activity Rate: total_count__per_sec (Transaction speed).

Logarithmic Transformations: np.log1p() is used on large, skewed variables (like sol_volume__sum) to normalize their distribution, improving model stability and performance.

**3. Cleaning and Transformation (clean_and_transform)**
This function standardizes and prepares the features for the model.

Boolean Conversion: Explicitly converts bool columns to int (0 or 1), which is generally safer, although CatBoost can often handle Booleans directly.

Winsorization: This is a method for outlier trimming.

The function uses winsorize(df[col], limits=[0.01, 0.01]) to cap the values of a column at the 1st percentile (low end) and the 99th percentile (high end). This prevents extreme outliers from disproportionately influencing the model's training process.

Important count/integer columns (buy_count, total_count, etc.) and the target are deliberately excluded from trimming.

**4. CatBoost Training (train_and_predict_catboost)**
This function sets up and executes the machine learning model training process.

Stratified K-Fold: Uses StratifiedKFold to split the training data (df_master_clean) into 5 folds. Stratified means the split maintains the same ratio of the target class (is_target) in each training and validation fold, which is critical for imbalanced data.

Categorical Feature Handling: It automatically detects columns with object or category data types and passes their indices (categorical_features_indices) directly to CatBoost. CatBoost handles these features internally using methods like Ordered Target Encoding, which is highly effective and eliminates the need for manual one-hot encoding.

Model Initialization: Sets up the CatBoostClassifier with standard competition parameters:

iterations=1000, learning_rate=0.03, depth=6.

loss_function='Logloss' (for binary classification) and eval_metric='AUC'.

early_stopping_rounds=100 stops training if performance on the validation set doesn't improve after 100 iterations, saving time.

task_type='GPU' enables GPU acceleration, which is vital for large datasets and complex models.

Prediction:

OOF (Out-Of-Fold) Predictions: Collects predictions for the validation set in each fold. These are used later for robust threshold optimization.

Test Predictions: Collects predictions for the unseen test data (X_test) and averages them across all 5 folds to get a final, more stable prediction (test_preds).

In [12]:
# ==============================================================================
# 0. IMPORTS AND GLOBAL VARIABLES
# ==============================================================================
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from scipy.stats.mstats import winsorize
from tqdm.notebook import tqdm
from sklearn.metrics import jaccard_score, recall_score

pd.set_option('display.max_columns', None)
GROUP_KEY = 'mint_token_id'

# ==============================================================================
# 0. MEMORY OPTIMIZATION FUNCTION (Aynı kalır)
# ==============================================================================
def downcast_dtypes(df):
    """Sütunları en küçük uygun sayısal tipe dönüştürerek bellek kullanımını azaltır."""
    print("Starting memory optimization...")
    start_mem = df.memory_usage(deep=True).sum() / 1024**2
    
    for col in tqdm(df.columns, desc="Downcasting"):
        col_type = df[col].dtype
        
        if col_type == object:
            if len(df[col].unique()) / len(df[col]) < 0.5: # Yüksek kardinalite yoksa category yap
                df[col] = df[col].astype('category')
            continue
        
        if str(col_type).startswith('datetime'):
            continue 

        if str(col_type)[:3] == 'int':
            # Daha agresif downcasting
            if df[col].max() < 128 and df[col].min() > -128:
                df[col] = df[col].astype(np.int8)
            elif df[col].max() < 32768 and df[col].min() > -32768:
                df[col] = df[col].astype(np.int16)
            elif df[col].max() < 2147483648 and df[col].min() > -2147483648:
                df[col] = df[col].astype(np.int32)
        elif str(col_type)[:5] == 'float':
            df[col] = df[col].astype(np.float32)
                
    end_mem = df.memory_usage(deep=True).sum() / 1024**2
    print(f"Memory decreased from {start_mem:.2f} MB to {end_mem:.2f} MB ({100 * (start_mem - end_mem) / start_mem:.1f}% reduction)")
    return df

# ==============================================================================
# 1. Base Feature Aggregation (FİYAT HAREKETİ EKLENDİ)
# ==============================================================================

def add_base_features(df):
    """Adds basic statistical summaries and price movement features for each mint_token_id."""
    
    # Exclude 'is_target', 'fee', and 'timestamp' (if numeric)
    numeric_cols = df.select_dtypes(include=np.number).columns.drop(['is_target', 'fee'], errors='ignore')
    if 'timestamp' in numeric_cols:
        numeric_cols = numeric_cols.drop('timestamp')
            
    agg_funcs = ['sum', 'mean', 'max', 'min', 'std', 'median', 'first', 'last']
    agg_dict = {}
    
    for col in numeric_cols:
        if col in ['consumed_gas', 'buy_count', 'sell_count']:
            agg_dict[col] = ['sum', 'mean', 'max']
        else:
            agg_dict[col] = agg_funcs
            
    # YENİ EKLENTİ: timestamp'in ilk ve son değerini al (yaşam süresini hesaplamak için)
    agg_dict['timestamp'] = ['min', 'max'] 

    print("Establishing Basic Aggregation Properties...")
    
    df_agg = df.groupby(GROUP_KEY).agg(agg_dict)
    
    # Sütun adlarını düzeltme
    df_agg.columns = [f'{i}__{j}' for i, j in df_agg.columns]
    df_agg = df_agg.reset_index()

    # Fiyat Hareketi (price_usd sütununun mevcut olduğu varsayılır)
    if 'price_usd__last' in df_agg.columns and 'price_usd__first' in df_agg.columns:
        df_agg['price_usd__last_to_first'] = (df_agg['price_usd__last'] - df_agg['price_usd__first']) / (df_agg['price_usd__first'] + 1e-6)
        df_agg['price_usd__max_to_min_ratio'] = df_agg['price_usd__max'] / (df_agg['price_usd__min'] + 1e-6)
        df_agg['price_usd__last_to_first'] = df_agg['price_usd__last_to_first'].astype(np.float32)
        df_agg['price_usd__max_to_min_ratio'] = df_agg['price_usd__max_to_min_ratio'].astype(np.float32)
    
    # Sadece yeni oluşturulan sütunları alma (Fiyat hareketini de içerir)
    new_features = df_agg.columns.drop(GROUP_KEY).tolist()
    
    # Gerekli non-numeric sütunlar listesi
    # Buraya 'is_target' sütununu da dahil etmeliyiz.
    cols_to_merge = [col for col in df.columns if col not in numeric_cols.tolist()]
    
    # Eğer is_target listede yoksa (ki olmaması lazım), elle ekle
    if 'is_target' not in cols_to_merge:
        cols_to_merge.append('is_target')
        
    # Sadece token başına tek satır alıyoruz (keep='last' ile güncel durum)
    df_master_unique = df.drop_duplicates(subset=[GROUP_KEY], keep='last')[cols_to_merge]
    df_agg = df_agg.merge(df_master_unique, on=GROUP_KEY, how='left')

    return df_agg, new_features

# ==============================================================================
# 2. Windowing and Derived Features (BELLEK DOSTU VE HATA GİDERİLMİŞ)
# Rolling window kısmı **TAMAMEN KALDIRILMIŞTIR**!
# ==============================================================================

def add_time_and_rate_features(df_agg):
    """Adds time-based and derived rate/speed features (No Rolling Window)."""
    
    print("Creating Time-Based and Rates Features...")
    
    # Hata Düzeltme: Datetime kontrolünü uyumlu hale getir
    if 'timestamp__min' not in df_agg.columns or 'timestamp__max' not in df_agg.columns:
        raise ValueError("Error: 'timestamp__min' and 'timestamp__max' must be generated in add_base_features.")
        
    post_agg_features = []
    
    # 1. Token'ın Yaşam Süresi (Saniye Cinsinden)
    # df_agg['timestamp__max'] zaten en son işlem zamanı (keep='last' merge'den).
    # timestamp__min'i add_base_features'da oluşturduğumuzdan emin olmalıyız.
    
    df_agg['time_diff_sec'] = (df_agg['timestamp__max'] - df_agg['timestamp__min']).dt.total_seconds().astype(np.float32)
    
    # Minimum yaşam süresini 1 saniye olarak ayarla
    df_agg['time_diff_sec'] = df_agg['time_diff_sec'].apply(lambda x: x if x > 0 else 1.0).astype(np.float32)
    post_agg_features.append('time_diff_sec')
    
    # 2. İşlem Hızları (Rates)
    
    # Toplam işlem sayısı / Yaşam Süresi
    df_agg['total_count__per_sec'] = (df_agg['total_count__sum'] / df_agg['time_diff_sec']).astype(np.float32)
    
    # SOL hacmi / Yaşam Süresi
    df_agg['sol_volume__per_sec'] = (df_agg['sol_volume__sum'] / df_agg['time_diff_sec']).astype(np.float32)
    
    # İşlem Başına Hacim ve Alış Oranı (Mevcut kodunuzdan)
    df_agg['sol_volume__per_tx'] = (df_agg['sol_volume__sum'] / (df_agg['total_count__sum'] + 1e-6)).astype(np.float32)
    df_agg['buy_ratio'] = (df_agg['buy_count__sum'] / (df_agg['total_count__sum'] + 1e-6)).astype(np.float32)

    post_agg_features.extend(['total_count__per_sec', 'sol_volume__per_sec', 'sol_volume__per_tx', 'buy_ratio'])

    # 3. Logaritmik Dönüşümler (Mevcut kodunuzdan)
    for col in ['market_cap_usd__max', 'sol_volume__sum', 'token_volume__sum', 'total_count__sum']:
        if col in df_agg.columns:
            new_col = f'log1p_{col}'
            df_agg[new_col] = np.log1p(df_agg[col]).astype(np.float32)
            post_agg_features.append(new_col)
            
    return df_agg, post_agg_features

# ==============================================================================
# 2b. Categorical Feature Engineering (YENİ EKLENDİ)
# ==============================================================================

def add_frequency_features(df_agg):
    """Adds Frequency Encoding for categorical columns."""
    
    categorical_cols = df_agg.select_dtypes(include=['object', 'category']).columns.tolist()
    # timestamp, is_target, GROUP_KEY hariç
    cols_to_encode = [col for col in categorical_cols if col not in [GROUP_KEY, 'is_target', 'timestamp']]

    print("Creating Frequency Encoding Features...")
    new_freq_features = []
    
    for col in tqdm(cols_to_encode, desc="Frequency Encode"):
        new_col = f'{col}__freq'
        # Frekans haritasını oluştur (global frekans)
        freq_map = df_agg[col].value_counts(normalize=True).to_dict()
        df_agg[new_col] = df_agg[col].map(freq_map).astype(np.float32)
        new_freq_features.append(new_col)
        
    return df_agg, new_freq_features

# ==============================================================================
# 3. Cleaning and Transformation (Aynı kalır)
# ==============================================================================
def clean_and_transform(df):
    """Trims outliers (winsorize) and handles categorical/boolean data."""
    
    # Booleanları int8'e çevir
    for col in df.select_dtypes(include='bool').columns:
        df[col] = df[col].astype(np.int8)

    # Aykırı değerleri kırpma (Winsorization)
    numeric_cols = df.select_dtypes(include=np.number).columns.drop([
        'is_target', 'buy_count', 'sell_count', 'total_count', 'consumed_gas', 'fee'
    ], errors='ignore')
    
    print("Outliers are Truncated (Winsorization)...")
    for col in tqdm(numeric_cols, desc="Winsorize"):
        # NaN değerlerini atlamak için maske kullan
        mask = df[col].notna()
        df.loc[mask, col] = winsorize(df.loc[mask, col], limits=[0.01, 0.01])
        df[col] = df[col].astype(np.float32) # Winsorize float'a çevirir, float32'ye zorla

    return df

# ==============================================================================
# 4. CatBoost Training (Aynı kalır)
# ==============================================================================
def train_and_predict_catboost(df_master, df_test, feature_cols, target_col='is_target'):
    """CatBoost trains the model and returns OOF/Test predictions."""
    
    N_SPLITS = 5 
    RANDOM_SEED = 42
    
    X = df_master[feature_cols]
    y = df_master[target_col].astype(int)
    X_test = df_test[feature_cols]

    oof_preds = np.zeros(X.shape[0])
    test_preds = np.zeros(X_test.shape[0])
    
    # Categorical columns detection
    categorical_cols = [col for col in feature_cols if X[col].dtype.name in ['object', 'category']]
    categorical_features_indices = [i for i, col in enumerate(feature_cols) if col in categorical_cols]

    skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)

    print("\nCatBoost Training Begins (5-Fold Stratified CV)...")
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"--- Fold {fold+1}/{N_SPLITS} ---")
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = CatBoostClassifier(
            iterations=1000,
            learning_rate=0.03,
            depth=6,
            loss_function='Logloss',
            eval_metric='AUC',
            random_seed=RANDOM_SEED,
            l2_leaf_reg=3,
            border_count=64, 
            verbose=100,
            early_stopping_rounds=100,
            task_type='GPU'
        )        
        
        model.fit(
            X_train, y_train,
            eval_set=(X_val, y_val),
            cat_features=categorical_features_indices,
            use_best_model=True
        )

        oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / N_SPLITS
        
    return oof_preds, test_preds, model 

# ==============================================================================
# 5. Jaccard Optimization Function (Aynı kalır)
# ==============================================================================
def optimize_threshold(y_true, y_proba, recall_min=0.75):
    "Finds the best Jaccard threshold that satisfies the Minimum Recall condition."
    best_jaccard = 0
    best_thresh = 0.5
    
    for thresh in np.arange(0.01, 0.99, 0.01):
        y_pred = (y_proba > thresh).astype(int)
        
        #Calculate Jaccard only in positive class
        jaccard = jaccard_score(y_true, y_pred, average='binary')
        recall = recall_score(y_true, y_pred, average='binary')
        
        # Find the best Jaccard that meets the Minimum Recall requirement
        if recall >= recall_min and jaccard > best_jaccard:
            best_jaccard = jaccard
            best_thresh = thresh
            
    print(f"\nOptimal Threshold (Recall > {recall_min}): {best_thresh:.4f} (Jaccard: {best_jaccard:.4f})")
    return best_thresh 

# ==============================================================================
# MAIN EXECUTION BLOCK
# ==============================================================================

# 1. Prepare and Combine DataFrames
print("--- 1. Preparing and Combining DataFrames ---")
df_test['is_target'] = np.nan
df_full = pd.concat([df_master, df_test], ignore_index=True)
print(f"Full DataFrame size: {df_full.shape}")

# BELLEK OPTİMİZASYONU: Downcasting'i Feature Engineering'den hemen önce yap
df_full = downcast_dtypes(df_full)

# ZAMAN DAMGASI DÖNÜŞÜMÜ VE STANDARDIZASYONU (Önceki hatayı çözmek için)
print("Standardizing 'timestamp'...")
df_full['timestamp'] = df_full['timestamp'].astype(str) # String'e zorla
df_full['timestamp'] = pd.to_datetime(df_full['timestamp'], errors='coerce') # Dönüştür

try:
    if df_full['timestamp'].dt.tz is not None:
        df_full['timestamp'] = df_full['timestamp'].dt.tz_localize(None) # Tz bilgisini kaldır
except AttributeError:
    pass

# 2. Feature Engineering
print("\n--- 2. Feature Engineering Pipeline ---")

# a. Base Aggregation Features (TOKEN BAŞINA 1 SATIR OLUŞTURUR)
df_full_agg, base_features = add_base_features(df_full.copy())

# b. Frequency Encoding
df_full_agg, freq_features = add_frequency_features(df_full_agg)

# c. Time-Based and Post-Aggregation Features (NO ROLLING WINDOW)
# df_full_agg'de 'timestamp__min' ve 'timestamp__max' kullanır.
df_full_agg, post_features = add_time_and_rate_features(df_full_agg)

# d. Cleaning and Winsorization
df_full_clean = clean_and_transform(df_full_agg.copy())

# 3. Separate Train and Test Sets
print("\n--- 3. Separating Train and Test Sets ---")
df_master_clean = df_full_clean[~df_full_clean['is_target'].isna()].reset_index(drop=True)
df_test_clean = df_full_clean[df_full_clean['is_target'].isna()].reset_index(drop=True)

# 4. Define Feature Columns
all_new_features = base_features + freq_features + post_features

# 'timestamp' artık tekil satırda sadece en son değeri temsil ettiğinden, feature olarak atılabilir.
feature_cols = [col for col in df_master_clean.columns 
                if col in all_new_features and col not in [
                    GROUP_KEY, 'is_target', 'timestamp', 
                    'timestamp__min', 'timestamp__max', 'trade_mode', 'holder', 'creator'
                ]]

print(f"\nTotal Features for Training: {len(feature_cols)}")

# 5. Model Training and Prediction
oof_preds, test_preds, final_model = train_and_predict_catboost(
    df_master_clean, 
    df_test_clean, 
    feature_cols
)

# Adding Results to Test DataFrame
df_test_clean['prediction_proba'] = test_preds

# --- 6. JACCARD Optimization and Threshold Determination ---
print("\n--- 6. Jaccard Optimization and Final Threshold ---")

# Find the optimal threshold on OOF estimates
optimal_threshold = optimize_threshold(df_master_clean['is_target'], oof_preds)

# Quantile Threshold Based on Target Rate (Considers Imbalance)
target_ratio = df_master_clean['is_target'].mean()
quantile_threshold = np.quantile(oof_preds, 1 - target_ratio) 
print(f"Quantile Threshold Based on Target Rate: {quantile_threshold:.4f}")

# Final Threshold: Making a more conservative estimate by taking the minimum of the two thresholds
final_threshold = min(optimal_threshold, quantile_threshold)
print(f"Final Prediction Threshold: {final_threshold:.4f}")

# --- 7. Final Prediction and Submission ---

df_test_clean['is_target_predicted'] = (df_test_clean['prediction_proba'] > final_threshold).astype(int)
submission_df = df_test_clean[[GROUP_KEY, 'is_target_predicted']]
submission_df = submission_df.rename(columns={'is_target_predicted': 'is_target'})

submission_df.to_csv('submission.csv', index=False)
print("\n✅ 'submission.csv' file created successfully.")

--- 1. Preparing and Combining DataFrames ---
Full DataFrame size: (10894291, 34)
Starting memory optimization...


Downcasting:   0%|          | 0/34 [00:00<?, ?it/s]

Memory decreased from 7459.73 MB to 1589.28 MB (78.7% reduction)
Standardizing 'timestamp'...

--- 2. Feature Engineering Pipeline ---
Establishing Basic Aggregation Properties...


  df_agg = df.groupby(GROUP_KEY).agg(agg_dict)


Creating Frequency Encoding Features...


Frequency Encode:   0%|          | 0/3 [00:00<?, ?it/s]

Creating Time-Based and Rates Features...
Outliers are Truncated (Winsorization)...


Winsorize:   0%|          | 0/206 [00:00<?, ?it/s]


--- 3. Separating Train and Test Sets ---

Total Features for Training: 205

CatBoost Training Begins (5-Fold Stratified CV)...
--- Fold 1/5 ---


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8250050	best: 0.8250050 (0)	total: 555ms	remaining: 9m 14s
100:	test: 0.8784342	best: 0.8784342 (100)	total: 1.84s	remaining: 16.4s
200:	test: 0.8853037	best: 0.8853037 (200)	total: 3.03s	remaining: 12s
300:	test: 0.8885681	best: 0.8885681 (300)	total: 4.21s	remaining: 9.79s
400:	test: 0.8901360	best: 0.8901360 (400)	total: 5.4s	remaining: 8.07s
500:	test: 0.8913718	best: 0.8913754 (499)	total: 6.59s	remaining: 6.56s
600:	test: 0.8923409	best: 0.8923409 (600)	total: 7.78s	remaining: 5.16s
700:	test: 0.8932343	best: 0.8932343 (700)	total: 8.85s	remaining: 3.78s
800:	test: 0.8937609	best: 0.8937609 (800)	total: 9.93s	remaining: 2.47s
900:	test: 0.8945205	best: 0.8945205 (900)	total: 11s	remaining: 1.21s
999:	test: 0.8952876	best: 0.8952876 (999)	total: 12.1s	remaining: 0us
bestTest = 0.8952876031
bestIteration = 999
--- Fold 2/5 ---


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8186315	best: 0.8186315 (0)	total: 21ms	remaining: 21s
100:	test: 0.8712338	best: 0.8712338 (100)	total: 1.31s	remaining: 11.7s
200:	test: 0.8800588	best: 0.8800588 (200)	total: 2.43s	remaining: 9.66s
300:	test: 0.8833047	best: 0.8833047 (300)	total: 3.65s	remaining: 8.48s
400:	test: 0.8851784	best: 0.8851784 (400)	total: 4.88s	remaining: 7.29s
500:	test: 0.8865805	best: 0.8865805 (500)	total: 6s	remaining: 5.98s
600:	test: 0.8877379	best: 0.8877379 (600)	total: 7.11s	remaining: 4.72s
700:	test: 0.8884699	best: 0.8884699 (700)	total: 8.23s	remaining: 3.51s
800:	test: 0.8892524	best: 0.8892524 (800)	total: 9.35s	remaining: 2.32s
900:	test: 0.8898055	best: 0.8898055 (900)	total: 10.5s	remaining: 1.15s
999:	test: 0.8903516	best: 0.8903537 (998)	total: 11.6s	remaining: 0us
bestTest = 0.8903536797
bestIteration = 998
Shrink model to first 999 iterations.
--- Fold 3/5 ---


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8245605	best: 0.8245605 (0)	total: 20.8ms	remaining: 20.8s
100:	test: 0.8796856	best: 0.8796856 (100)	total: 1.29s	remaining: 11.5s
200:	test: 0.8877431	best: 0.8877431 (200)	total: 2.43s	remaining: 9.67s
300:	test: 0.8909135	best: 0.8909135 (300)	total: 3.58s	remaining: 8.32s
400:	test: 0.8927854	best: 0.8927854 (400)	total: 4.66s	remaining: 6.96s
500:	test: 0.8940177	best: 0.8940177 (500)	total: 5.78s	remaining: 5.75s
600:	test: 0.8950681	best: 0.8950681 (600)	total: 6.89s	remaining: 4.58s
700:	test: 0.8959619	best: 0.8959619 (700)	total: 8.01s	remaining: 3.42s
800:	test: 0.8965975	best: 0.8965975 (800)	total: 9.18s	remaining: 2.28s
900:	test: 0.8972948	best: 0.8972948 (900)	total: 10.3s	remaining: 1.14s
999:	test: 0.8978233	best: 0.8978271 (998)	total: 11.5s	remaining: 0us
bestTest = 0.8978270888
bestIteration = 998
Shrink model to first 999 iterations.
--- Fold 4/5 ---


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8268153	best: 0.8268153 (0)	total: 21.4ms	remaining: 21.4s
100:	test: 0.8789154	best: 0.8789154 (100)	total: 1.3s	remaining: 11.6s
200:	test: 0.8859886	best: 0.8859886 (200)	total: 2.45s	remaining: 9.74s
300:	test: 0.8891370	best: 0.8891370 (300)	total: 3.6s	remaining: 8.35s
400:	test: 0.8907958	best: 0.8907958 (400)	total: 4.76s	remaining: 7.11s
500:	test: 0.8920306	best: 0.8920318 (499)	total: 5.89s	remaining: 5.87s
600:	test: 0.8927861	best: 0.8927874 (597)	total: 7.02s	remaining: 4.66s
700:	test: 0.8935653	best: 0.8935653 (700)	total: 8.15s	remaining: 3.48s
800:	test: 0.8942979	best: 0.8943014 (799)	total: 9.28s	remaining: 2.31s
900:	test: 0.8947037	best: 0.8947037 (900)	total: 10.4s	remaining: 1.15s
999:	test: 0.8953074	best: 0.8953074 (999)	total: 11.5s	remaining: 0us
bestTest = 0.8953074217
bestIteration = 999
--- Fold 5/5 ---


Default metric period is 5 because AUC is/are not implemented for GPU


0:	test: 0.8222705	best: 0.8222705 (0)	total: 20.6ms	remaining: 20.6s
100:	test: 0.8720842	best: 0.8720842 (100)	total: 1.26s	remaining: 11.2s
200:	test: 0.8806419	best: 0.8806419 (200)	total: 2.36s	remaining: 9.38s
300:	test: 0.8840509	best: 0.8840509 (300)	total: 3.45s	remaining: 8.01s
400:	test: 0.8858316	best: 0.8858316 (400)	total: 4.53s	remaining: 6.76s
500:	test: 0.8871842	best: 0.8871842 (500)	total: 5.64s	remaining: 5.62s
600:	test: 0.8884124	best: 0.8884124 (600)	total: 6.79s	remaining: 4.51s
700:	test: 0.8894350	best: 0.8894350 (700)	total: 7.91s	remaining: 3.38s
800:	test: 0.8901449	best: 0.8901449 (800)	total: 9.05s	remaining: 2.25s
900:	test: 0.8908071	best: 0.8908071 (900)	total: 10.2s	remaining: 1.12s
999:	test: 0.8914367	best: 0.8914367 (999)	total: 11.3s	remaining: 0us
bestTest = 0.8914366961
bestIteration = 999

--- 6. Jaccard Optimization and Final Threshold ---

Optimal Threshold (Recall > 0.75): 0.0200 (Jaccard: 0.0801)
Quantile Threshold Based on Target Rate: 0.1

In [13]:
df_report = df_test_clean[[GROUP_KEY, 'prediction_proba', 'is_target_predicted']].copy()

df_report.rename(columns={
    GROUP_KEY: 'token',
    'prediction_proba': 'prediction value',
    'is_target_predicted': 'isTargetToken'
}, inplace=True)

df_report['threshold'] = final_threshold

df_report = df_report[['token', 'threshold', 'prediction value', 'isTargetToken']]
df_report.to_csv('detailed_report.csv', index=False)

print("\n✅ 'detailed_report.csv' (Extended Submission) file created.")



✅ 'detailed_report.csv' (Extended Submission) file created.
