# Load the datasets

 This notebook splits `test.csv` and `test_y.csv` sequentially (65% / 15% / 20%, in file order) into 6 dataframes, namely the 3 feature dataframes
 
 1. train_df
 2. val_df
 3. test_df 


and their 3 corresponding label dataframes, which are

4. train_y_df
5. val_y_df
6. test_y_df.

In [1]:
import pandas as pd

# Read CSV file, test.csv is generated by the SAINT repo's training notebook
df = pd.read_csv("../2_dataset/test.csv")

# Split fractions; whatever is left after train + validation becomes the test split
TRAIN_FRACTION = 0.65
VAL_FRACTION = 0.15

# Calculate sizes (int() truncates, so any rounding leftovers land in the test split)
total_samples = len(df)
train_size = int(TRAIN_FRACTION * total_samples)  # 65%
val_size = int(VAL_FRACTION * total_samples)      # 15%
test_size = total_samples - (train_size + val_size)  # Remaining 20%

# Perform positional slicing: rows are taken in file order, without shuffling
train_df = df.iloc[:train_size]
val_df = df.iloc[train_size:train_size + val_size]
test_df = df.iloc[train_size + val_size:]

# Sanity check: every split has exactly the size computed above, so the three
# splits together cover each row of `df` once
assert len(train_df) == train_size, "train split has an unexpected size"
assert len(val_df) == val_size, "validation split has an unexpected size"
assert len(test_df) == test_size, "test split has an unexpected size"

# Print sizes
print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")

# Verify split correctness by eyeballing the first rows of each split
print("Train DataFrame:\n", train_df.head())
print("Validation DataFrame:\n", val_df.head())
print("Test DataFrame:\n", test_df.head())


Train size: 76770
Validation size: 17716
Test size: 23622
Train DataFrame:
    cls  ProductCD  card4  card6  P_emaildomain  Unnamed: 0  TransactionDT  \
0    0          4      2      1              2   -0.291883      -0.329939   
1    0          4      3      2             16    0.892993       0.871243   
2    0          4      2      2              1   -1.594876      -1.467121   
3    0          4      3      2             19   -0.123148      -0.156138   
4    0          4      3      2             16    1.611964       1.677853   

   TransactionAmt     card1     card2  ...      V312      V313      V314  \
0        0.108390 -0.145421 -0.399322  ... -0.227583 -0.222385 -0.249222   
1       -0.359702  0.680504 -0.412094  ... -0.030054 -0.222385 -0.249222   
2        8.134522 -0.109308  0.711822  ... -0.227583 -0.222385 -0.249222   
3       -0.422421  1.487250 -0.265218  ...  0.341765 -0.222385 -0.249222   
4       -0.317889 -0.081355 -0.265218  ... -0.227583 -0.222385 -0.249222   

    

In [2]:
# Saving the divided datasets
from pathlib import Path

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (a no-op when it is already there)
FINAL_DIR = Path("../2_dataset/final")
FINAL_DIR.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the CSV files
train_df.to_csv(FINAL_DIR / "train_df.csv", index=False)
val_df.to_csv(FINAL_DIR / "val_df.csv", index=False)
test_df.to_csv(FINAL_DIR / "test_df.csv", index=False)


In [3]:
# Show the full feature frame read from test.csv; the notebook renders only the
# first and last rows, with an ellipsis row in between
df


Unnamed: 0.1,cls,ProductCD,card4,card6,P_emaildomain,Unnamed: 0,TransactionDT,TransactionAmt,card1,card2,...,V312,V313,V314,V315,V316,V317,V318,V319,V320,V321
0,0,4,2,1,2,-0.291883,-0.329939,0.108390,-0.145421,-0.399322,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
1,0,4,3,2,16,0.892993,0.871243,-0.359702,0.680504,-0.412094,...,-0.030054,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
2,0,4,2,2,1,-1.594876,-1.467121,8.134522,-0.109308,0.711822,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
3,0,4,3,2,19,-0.123148,-0.156138,-0.422421,1.487250,-0.265218,...,0.341765,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
4,0,4,3,2,16,1.611964,1.677853,-0.317889,-0.081355,-0.265218,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118103,0,4,2,1,16,-1.457436,-1.355259,-0.364092,1.054292,-1.229487,...,0.004804,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
118104,0,0,2,2,16,0.297232,0.252414,-0.476668,-1.376148,0.290354,...,-0.070326,0.060149,-0.093159,0.002731,-0.020589,-0.046363,-0.035467,-0.055287,-0.088855,-0.074142
118105,0,4,3,2,53,0.146207,0.125764,0.555994,0.570530,0.813996,...,1.329410,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142
118106,0,4,3,2,23,0.531930,0.497868,-0.280467,-0.081355,-0.265218,...,-0.227583,-0.222385,-0.249222,-0.229148,-0.048377,-0.062211,-0.058049,-0.055287,-0.088855,-0.074142


In [4]:
import pandas as pd

# Read CSV file, test_y.csv is generated by the SAINT repo's training notebook.
# The full label frame gets its own name so that `test_y_df` only ever refers to
# the held-out test slice (it used to be bound to the full frame and then
# overwritten by its own slice).
labels_df = pd.read_csv("../2_dataset/test_y.csv")

# Split fractions; whatever is left after train + validation becomes the test split
TRAIN_FRACTION = 0.65
VAL_FRACTION = 0.15

# Calculate sizes (int() truncates, so any rounding leftovers land in the test split)
total_samples = len(labels_df)
train_size = int(TRAIN_FRACTION * total_samples)  # 65%
val_size = int(VAL_FRACTION * total_samples)      # 15%
test_size = total_samples - (train_size + val_size)  # Remaining 20%

# Perform positional slicing: rows are taken in file order, without shuffling.
# NOTE(review): these boundaries only line up with the feature splits if
# test_y.csv has the same number of rows, in the same order, as test.csv — confirm
train_y_df = labels_df.iloc[:train_size]
val_y_df = labels_df.iloc[train_size:train_size + val_size]
test_y_df = labels_df.iloc[train_size + val_size:]

# Sanity check: every split has exactly the size computed above, so the three
# splits together cover each row of `labels_df` once
assert len(train_y_df) == train_size, "train label split has an unexpected size"
assert len(val_y_df) == val_size, "validation label split has an unexpected size"
assert len(test_y_df) == test_size, "test label split has an unexpected size"

# Print sizes
print(f"Train size: {len(train_y_df)}")
print(f"Validation size: {len(val_y_df)}")
print(f"Test size: {len(test_y_df)}")

# Verify split correctness by eyeballing the first rows of each split
print("Train DataFrame:\n", train_y_df.head())
print("Validation DataFrame:\n", val_y_df.head())
print("Test DataFrame:\n", test_y_df.head())


Train size: 76770
Validation size: 17716
Test size: 23622
Train DataFrame:
    isFraud
0      0.0
1      0.0
2      0.0
3      0.0
4      0.0
Validation DataFrame:
        isFraud
76770      0.0
76771      0.0
76772      0.0
76773      0.0
76774      0.0
Test DataFrame:
        isFraud
94486      0.0
94487      0.0
94488      0.0
94489      0.0
94490      0.0


In [5]:
# Saving the divided datasets
from pathlib import Path

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (a no-op when it is already there)
FINAL_DIR = Path("../2_dataset/final")
FINAL_DIR.mkdir(parents=True, exist_ok=True)

# index=False: the row index is not written to the CSV files
train_y_df.to_csv(FINAL_DIR / "train_y_df.csv", index=False)
val_y_df.to_csv(FINAL_DIR / "val_y_df.csv", index=False)
test_y_df.to_csv(FINAL_DIR / "test_y_df.csv", index=False)
