In [5]:
# Fraud Detection Analysis - Understanding Dataset and Fraud Rules

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load dataset
df = pd.read_csv('transactions.csv', parse_dates=['TX_DATETIME'])

# Preview data
print("\nHead of dataset:")
print(df.head())
print("\nDataset info:")
print(df.info())

# Summary statistics
print("\nSummary statistics:")
print(df.describe())

# Fraud ratio
fraud_ratio = df['TX_FRAUD'].mean()
print(f"\nFraud ratio: {fraud_ratio:.4f}")

# Count of frauds
print("\nFraud counts:")
print(df['TX_FRAUD'].value_counts())

# Rule 1: High transaction amount frauds
print("\nRule 1: Transactions with amount > 220")
df['is_high_amount_fraud'] = df['TX_AMOUNT'] > 220
high_amount_frauds = df[df['is_high_amount_fraud'] == True]
print(f"Number of high-amount transactions: {high_amount_frauds.shape[0]}")

# Rule 2: Terminal-level fraud pattern
print("\nRule 2: Terminal-level fraud counts")
fraud_terminals = df[df['TX_FRAUD'] == 1]['TERMINAL_ID'].value_counts()
print(fraud_terminals.head())

# Rule 3: Customer-level inflated frauds
print("\nRule 3: Inflated customer transaction frauds")
customer_avg = df.groupby('CUSTOMER_ID')['TX_AMOUNT'].mean().rename('AVG_TX_AMOUNT')
df = df.merge(customer_avg, on='CUSTOMER_ID')
df['is_inflated'] = df['TX_AMOUNT'] > (df['AVG_TX_AMOUNT'] * 4.5)
inflated_frauds = df[df['is_inflated'] & (df['TX_FRAUD'] == 1)]
print(f"Number of suspected inflated frauds: {inflated_frauds.shape[0]}")
print(inflated_frauds.head())

# Visualization: Transaction amount distribution
plt.figure(figsize=(10, 6))
sns.histplot(data=df, x='TX_AMOUNT', hue='TX_FRAUD', bins=50, log_scale=(True, False))
plt.title('Transaction Amount Distribution by Fraud Label')
plt.xlabel('Transaction Amount')
plt.ylabel('Count')
plt.legend(title='Fraud')
plt.tight_layout()
plt.show()


FileNotFoundError: [Errno 2] No such file or directory: 'transactions.csv'

In [4]:
import os
import pandas as pd

# Set your data folder path correctly (use raw string or double backslashes)
data_folder_path = r'D:\COLLEGE\intership unifid\fraud_detection\dataset\data'

# List and sort all .pkl files
data_files = sorted([f for f in os.listdir(data_folder_path) if f.endswith('.pkl')])

# Load and combine all .pkl files into a single DataFrame
all_dataframes = []
for file in data_files:
    file_path = os.path.join(data_folder_path, file)
    df_day = pd.read_pickle(file_path)
    all_dataframes.append(df_day)

# Combine into one DataFrame
full_df = pd.concat(all_dataframes, ignore_index=True)

# Save to CSV
output_path = os.path.join(data_folder_path, 'transactions.csv')
full_df.to_csv(output_path, index=False)

print(f"CSV file saved to: {output_path}")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754155 entries, 0 to 1754154
Data columns (total 9 columns):
 #   Column             Dtype         
---  ------             -----         
 0   TRANSACTION_ID     int64         
 1   TX_DATETIME        datetime64[ns]
 2   CUSTOMER_ID        object        
 3   TERMINAL_ID        object        
 4   TX_AMOUNT          float64       
 5   TX_TIME_SECONDS    object        
 6   TX_TIME_DAYS       object        
 7   TX_FRAUD           int64         
 8   TX_FRAUD_SCENARIO  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(4)
memory usage: 120.4+ MB
None
   TRANSACTION_ID         TX_DATETIME CUSTOMER_ID TERMINAL_ID  TX_AMOUNT  \
0               0 2018-04-01 00:00:31         596        3156      57.16   
1               1 2018-04-01 00:02:10        4961        3412      81.51   
2               2 2018-04-01 00:07:56           2        1365     146.00   
3               3 2018-04-01 00:09:29        4128        8737      