In [None]:
import os
import shutil
from collections import Counter

import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [None]:
# Fetch the latest version of the dataset through kagglehub and keep the
# path that the download call returns.
dataset_handle = "kartik2112/fraud-detection"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

In [None]:
# Destination folder for the raw dataset files.
# The TCC_RAW_DATA_DIR environment variable may override the location so the
# notebook can run on machines without this home directory; when the variable
# is unset or empty, the original path is used.
file_path = (
    os.environ.get('TCC_RAW_DATA_DIR')
    or '/home/guilherme-coelho/Documentos/tcc/data/raw'
)

# Ensure the destination folder exists
os.makedirs(file_path, exist_ok=True)

# Copy files from the Kaggle dataset path to the local folder.
# dirs_exist_ok=True allows copying into the folder that already exists.
shutil.copytree(path, file_path, dirs_exist_ok=True)

print(f"Dataset copied to {file_path}")

In [None]:
# Load the training split of the dataset into a DataFrame.
# pandas is already imported in the notebook's first cell, so it is not
# re-imported here.
dataset_path = os.path.join(file_path, 'fraudTrain.csv')
df = pd.read_csv(dataset_path)

# Preview the first few rows; leaving the expression last in the cell makes
# Jupyter render it as a table, as the later preview cells do.
df.head()

In [None]:
# Number of null entries per column of the training data
null_counts = df.isnull().sum()
print("\nMissing values in the training dataset:")
print(null_counts)

# Descriptive statistics as computed by DataFrame.describe()
numeric_summary = df.describe()
print("\nSummary statistics for numeric columns in the training dataset:")
print(numeric_summary)

# Bar chart of how many rows fall into each value of the 'is_fraud' target
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='is_fraud', data=df, ax=ax)
ax.set_title('Distribution of Fraudulent Transactions (Training Dataset)')
ax.set_xlabel('Is Fraud (0 = No, 1 = Yes)')
ax.set_ylabel('Count')
plt.show()

In [None]:
# List of columns to drop
columns_to_drop = ['Unnamed: 0', 'cc_num', 'trans_date_trans_time', 'first', 'last', 'dob', 'street', 'trans_num', 'unix_time']

# Drop the columns from the DataFrame.
# errors='ignore' keeps the cell idempotent: because `df` is reassigned here,
# running the cell a second time would otherwise raise KeyError for columns
# that were already removed by the first run.
df = df.drop(columns=columns_to_drop, errors='ignore')

# Preview the first few rows of the updated DataFrame
df.head()

In [None]:
# Report the dtype of every remaining column, then preview the first rows.
dtype_summary = df.dtypes
print(f"👉 Data Types: {dtype_summary}\n")
df.head()

In [None]:
# Preview only the object-dtype columns (the ones with categorical values)
object_columns = df.select_dtypes(include='object')
object_columns


In [None]:
# Columns whose categorical values are replaced by integer codes.
categorical_columns = ["merchant", "category", "gender", "job", "city", "state"]

# Keep one fitted LabelEncoder per column so every mapping can later be
# inverted (inverse_transform) or re-applied to other data (transform).
# Refitting a single shared encoder would retain only the last column's mapping.
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    df[column] = encoder.fit_transform(df[column])
    label_encoders[column] = encoder
# After the loop, `encoder` is the instance fitted on "state".

df.head()


In [None]:
# Separate features (X) and target variable (y)
X = df.drop(columns=['is_fraud'])
y = df['is_fraud']

# Columns that hold integer category codes rather than quantities.
# Plain SMOTE interpolates every feature between neighbouring samples, which
# yields arbitrary codes for these nominal columns. SMOTENC instead gives each
# synthetic sample the most frequent category among its nearest neighbours.
categorical_columns = ['merchant', 'category', 'gender', 'job', 'city', 'state']
# Positional indices are used because they are accepted for categorical_features
# regardless of whether X is passed as a DataFrame or an array.
categorical_indices = [X.columns.get_loc(column) for column in categorical_columns]

# Apply SMOTENC to balance the dataset
smote = SMOTENC(categorical_features=categorical_indices, random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Combine the resampled features and target into a new DataFrame
df_resampled = pd.concat(
    [
        pd.DataFrame(X_resampled, columns=X.columns),
        pd.DataFrame(y_resampled, columns=['is_fraud']),
    ],
    axis=1,
)

# Display the class distribution after oversampling
print("Class distribution after SMOTE:", Counter(df_resampled['is_fraud']))



In [None]:
# Output file for the filtered and balanced dataset.
# The TCC_PROCESSED_DATASET_PATH environment variable may override the location
# so the notebook can run on machines without this home directory; when the
# variable is unset or empty, the original path is used.
output_file_path = (
    os.environ.get('TCC_PROCESSED_DATASET_PATH')
    or '/home/guilherme-coelho/Documentos/tcc/data/processed/filtered_balanced_dataset.csv'
)

# Ensure the destination folder exists. A bare file name has an empty
# directory component, and os.makedirs('') would raise, so it is skipped then.
output_dir = os.path.dirname(output_file_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Write the filtered and balanced dataset to a CSV file (without the index)
df_resampled.to_csv(output_file_path, index=False)

print(f"Filtered and balanced dataset written to {output_file_path}")

In [None]:
# Show the distribution of the target variable in the resampled dataset.
# The title names the resampled data explicitly so this figure cannot be
# confused with the earlier plot of the original training data.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='is_fraud', data=df_resampled, ax=ax)
ax.set_title('Distribution of Fraudulent Transactions (Resampled Training Dataset)')
ax.set_xlabel('Is Fraud (0 = No, 1 = Yes)')
ax.set_ylabel('Count')
plt.show()