## Load training transaction data

In [1]:
import pandas as pd

# Read the training transactions from disk into a DataFrame
df = pd.read_csv("data/train_transaction.csv")

# Report the row count (one row per loaded transaction record)
print("Number of transactions in the training dataset : ", df.shape[0])


Number of transactions in the training dataset :  590540


## Check uniqueness of (card2, card3) per card1

In [9]:
# For each card1, count how many distinct card2 values and how many distinct
# card3 values appear. The two columns are counted independently, and
# nunique() skips missing values by default, so NaN never adds to a count.
combo_counts = (
    df.groupby("card1")[["card2", "card3"]]
      .nunique()
)

# A card1 violates the check when either column takes more than one value
violations = combo_counts[(combo_counts["card2"] > 1) | (combo_counts["card3"] > 1)]

if violations.empty:
    print("Each card1 value corresponds to a unique combination of (card2, card3).")
else:
    print("Some card1 values map to multiple card2/card3 combinations:")
    print(violations)

# Take the maxima over every card1 rather than over `violations` only:
# .max() on an empty frame is NaN, so the summary would be meaningless
# whenever the check passes.
print("Max of number of card2 per card1 : ", combo_counts["card2"].max())
print("Max of number of card3 per card1 : ", combo_counts["card3"].max())

Some card1 values map to multiple card2/card3 combinations:
       card2  card3
card1              
1208       2      1
1214       2      1
1302       2      1
1341       1      2
1363       2      1
...      ...    ...
18054      2      1
18123      2      1
18270      2      1
18375      2      1
18376      2      1

[265 rows x 2 columns]
Max of number of card2 per card1 :  3
Max of number of card3 per card1 :  2


## Detect card1 values with multiple transactions in the same second

In [16]:
# For every (card1, TransactionDT) pair, count the distinct TransactionIDs
counts_df = (
    df.groupby(["card1", "TransactionDT"])['TransactionID']
      .nunique()
      .to_frame('count')
)

# Keep only the pairs with more than one distinct TransactionID.
# The mask is built from counts_df itself so the cell depends on nothing but
# `df` and runs cleanly on a fresh kernel.
duplicates_same_second = counts_df[counts_df["count"] > 1]

if duplicates_same_second.empty:
    print("No card1 has more than one distinct TransactionID in the same second.")
else:
    print("Some card1 values have multiple distinct TransactionIDs within the same second:")
    # Largest collisions first
    print(duplicates_same_second.sort_values("count", ascending=False))


Some card1 values have multiple distinct TransactionIDs within the same second:
                     count
card1 TransactionDT       
9288  9474817            8
13780 7236588            5
10023 8218708            4
      8218707            4
2744  8468062            4
...                    ...
17188 11305705           2
      11832962           2
      14525480           2
17335 7414661            2
17517 8739176            2

[146 rows x 1 columns]


## Detect card1 transactions repeated within 1 minute

In [17]:
# Order rows by card1 then TransactionDT, and keep a single row per
# (card1, TransactionID) so a transaction is never compared with itself
df = (
    df.sort_values(["card1", "TransactionDT"])
      .drop_duplicates(subset=["card1", "TransactionID"])
)

# Gap in TransactionDT between each row and the previous row of the same
# card1 (NaN for the first row of every card1)
df["diff_sec"] = df.groupby("card1")["TransactionDT"].diff()

# Flag rows whose gap to the previous one is at most 60; NaN gaps compare
# as False, so the first row of each card1 is never flagged
df["repeated_within_1min"] = df["diff_sec"].le(60)

# Keep only the flagged rows
repeated = df[df["repeated_within_1min"]]

print("Number of repeated card1 events within 1 minute:", len(repeated))
print(
    repeated[["card1", "TransactionDT", "diff_sec"]]
    .set_index("card1")
    .sort_values("diff_sec")
)


Number of repeated card1 events within 1 minute: 19828
       TransactionDT  diff_sec
card1                         
15497       15187923       0.0
7919         7310775       0.0
15548        8695812       0.0
3570         7429783       0.0
7919         7937752       0.0
...              ...       ...
7919        13174656      60.0
12260        1902332      60.0
2631         2038734      60.0
15582        3020192      60.0
10486        6386533      60.0

[19828 rows x 2 columns]
