In [3]:
import pandas as pd

In [5]:
# Load the IBM transactions CSV into memory and preview the first rows.
transactions_csv = "ibm_transactions.csv"
df = pd.read_csv(transactions_csv)
df.head()

Unnamed: 0,Timestamp,From Bank,Account,To Bank,Account.1,Amount Received,Receiving Currency,Amount Paid,Payment Currency,Payment Format,Is Laundering
0,2022/09/01 00:20,10,8000EBD30,10,8000EBD30,3697.34,US Dollar,3697.34,US Dollar,Reinvestment,0
1,2022/09/01 00:20,3208,8000F4580,1,8000F5340,0.01,US Dollar,0.01,US Dollar,Cheque,0
2,2022/09/01 00:00,3209,8000F4670,3209,8000F4670,14675.57,US Dollar,14675.57,US Dollar,Reinvestment,0
3,2022/09/01 00:02,12,8000F5030,12,8000F5030,2806.97,US Dollar,2806.97,US Dollar,Reinvestment,0
4,2022/09/01 00:06,10,8000F5200,10,8000F5200,36682.97,US Dollar,36682.97,US Dollar,Reinvestment,0


In [6]:
# Tally how many transactions carry each 'Is Laundering' label.
counts = df['Is Laundering'].value_counts()
print(counts)

# .get(..., 0) keeps the report working even if one of the classes is absent.
not_laundering_total = counts.get(0, 0)
laundering_total = counts.get(1, 0)

print("Not Laundering: ", not_laundering_total)
print("Laundering:", laundering_total)

Is Laundering
0    5073168
1       5177
Name: count, dtype: int64
Not Laundering:  5073168
Laundering: 5177


In [7]:
def sample_laundering_df(df, total_rows, laundering_ratio, random_state=42):
    """
    Return a shuffled sample of `df` with `total_rows` rows and a fixed share of laundering rows.

    Parameters:
        df (pd.DataFrame): Original DataFrame with an 'Is Laundering' column. Leading/trailing
            whitespace in column names is tolerated. The input frame is not modified.
        total_rows (int): Total number of rows in the returned DataFrame (must be >= 0).
        laundering_ratio (float): Proportion of rows that should be laundering (between 0 and 1).
            The laundering row count is `total_rows * laundering_ratio` truncated to an integer.
        random_state (int): Seed used for both per-class samples and the final shuffle.
            Defaults to 42, the value that was previously hard-coded.

    Returns:
        pd.DataFrame: Sampled rows in shuffled order, with a fresh 0..total_rows-1 index and
            whitespace-stripped column names.

    Raises:
        ValueError: If the 'Is Laundering' column is missing, if `total_rows` is negative,
            if `laundering_ratio` is outside [0, 1], or if either class has too few rows.
    """
    # Strip names *before* looking the column up, so a padded header such as
    # ' Is Laundering ' is accepted instead of being rejected by the existence check.
    stripped_names = [name.strip() if isinstance(name, str) else name for name in df.columns]
    if 'Is Laundering' not in stripped_names:
        raise ValueError("'Is Laundering' column not found in the DataFrame.")

    if total_rows < 0:
        raise ValueError(f"total_rows must be non-negative, got {total_rows}.")
    if not 0 <= laundering_ratio <= 1:
        raise ValueError(f"laundering_ratio must be between 0 and 1, got {laundering_ratio}.")

    # Address the label column by its original (possibly padded) name; the caller's
    # frame is only read, never renamed in place.
    label_column = df.columns[stripped_names.index('Is Laundering')]
    labels = df[label_column]

    # Calculate counts. Rounding to 9 decimals before truncating absorbs float
    # representation error (100 * 0.29 == 28.999999999999996 would otherwise give 28).
    num_laundering = int(round(total_rows * laundering_ratio, 9))
    num_not_laundering = total_rows - num_laundering

    laundering_df = df[labels == 1]
    not_laundering_df = df[labels == 0]

    if len(laundering_df) < num_laundering:
        raise ValueError(f"Not enough laundering rows available. Requested {num_laundering}, but only {len(laundering_df)} available.")
    if len(not_laundering_df) < num_not_laundering:
        raise ValueError(f"Not enough non-laundering rows available. Requested {num_not_laundering}, but only {len(not_laundering_df)} available.")

    sampled_laundering = laundering_df.sample(n=num_laundering, random_state=random_state)
    sampled_not_laundering = not_laundering_df.sample(n=num_not_laundering, random_state=random_state)

    # Shuffle so the two classes are interleaved, then renumber the rows.
    new_df = (
        pd.concat([sampled_laundering, sampled_not_laundering])
        .sample(frac=1, random_state=random_state)
        .reset_index(drop=True)
    )
    # new_df is a fresh frame, so normalising its column names cannot affect the input.
    new_df.columns = stripped_names

    return new_df

In [13]:
# Build a 10-row demo subset in which 20% of the rows (2 of 10) are laundering.
subset_df = sample_laundering_df(df, total_rows=10, laundering_ratio=0.2)
print(subset_df['Is Laundering'].value_counts())

# index=False: the frame already carries the source file's "Unnamed: 0" column, and the
# sampler resets the index, so writing that fresh index too would add a second unnamed
# column when the file is read back.
subset_df.to_csv("./10_transactions.csv", index=False)

Is Laundering
0    8
1    2
Name: count, dtype: int64
