# Data Preperation Proccess

## Configuration

In [1]:
import pandas as pd
import numpy as np

# --- Configuration ---
# IMPORTANT: Adjust this path if your CSV file is in a different directory
DATA_PATH = './' # Assuming credit_card_fraud.csv is in the same directory as your notebook
FILE_NAME = 'credit_card_fraud.csv' # The specific file you are using

# --- 1. Data Preparation ---

print("--- Starting Data Preparation (for single file: credit_card_fraud.csv) ---")

# Load the dataset
try:
    df = pd.read_csv(f'{DATA_PATH}{FILE_NAME}')
    print(f"Dataset '{FILE_NAME}' loaded successfully.")
except FileNotFoundError as e:
    print(f"Error loading file: {e}. Please ensure '{FILE_NAME}' is in the '{DATA_PATH}' directory.")
    exit()


--- Starting Data Preparation (for single file: credit_card_fraud.csv) ---
Dataset 'credit_card_fraud.csv' loaded successfully.


In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34636378 entries, 0 to 34636377
Data columns (total 27 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Unnamed: 0  int64  
 1   ssn         object 
 2   cc_num      int64  
 3   first       object 
 4   last        object 
 5   gender      object 
 6   street      object 
 7   city        object 
 8   state       object 
 9   zip         int64  
 10  lat         float64
 11  long        float64
 12  city_pop    int64  
 13  job         object 
 14  dob         object 
 15  acct_num    int64  
 16  profile     object 
 17  trans_num   object 
 18  trans_date  object 
 19  trans_time  object 
 20  unix_time   int64  
 21  category    object 
 22  amt         float64
 23  is_fraud    int64  
 24  merchant    object 
 25  merch_lat   float64
 26  merch_long  float64
dtypes: float64(5), int64(7), object(15)
memory usage: 7.0+ GB


### Extracting only california relevant data

In [3]:
# Filter rows where State == "CA"
df_ca = df[df["state"] == "CA"]

# Save directly to a Pickle file
df_ca.to_pickle("credit_card_fraud_CA.pkl")

print("Filtered dataframe saved as 'credit_card_fraud_CA.pkl'")

Filtered dataframe saved as 'credit_card_fraud_CA.pkl'


### Saving to PKL (Once)

### Loading DF from PKL File

In [4]:
df = pd.read_pickle("credit_card_fraud_CA.pkl")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4033120 entries, 0 to 34636377
Data columns (total 27 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Unnamed: 0  int64  
 1   ssn         object 
 2   cc_num      int64  
 3   first       object 
 4   last        object 
 5   gender      object 
 6   street      object 
 7   city        object 
 8   state       object 
 9   zip         int64  
 10  lat         float64
 11  long        float64
 12  city_pop    int64  
 13  job         object 
 14  dob         object 
 15  acct_num    int64  
 16  profile     object 
 17  trans_num   object 
 18  trans_date  object 
 19  trans_time  object 
 20  unix_time   int64  
 21  category    object 
 22  amt         float64
 23  is_fraud    int64  
 24  merchant    object 
 25  merch_lat   float64
 26  merch_long  float64
dtypes: float64(5), int64(7), object(15)
memory usage: 861.6+ MB


## Information Gathering

In [5]:
# Display initial info
print("\n--- Initial Data Info ---")
print("DataFrame Head:\n", df.head())
print("\nDataFrame Info:")
df.info()
print("\nDataFrame Description (numerical features):\n", df.describe())


--- Initial Data Info ---
DataFrame Head:
    Unnamed: 0          ssn            cc_num   first    last gender  \
0           0  750-09-7342  6011237648640631  Amanda  Morris      F   
1           1  750-09-7342  6011237648640631  Amanda  Morris      F   
2           2  750-09-7342  6011237648640631  Amanda  Morris      F   
3           3  750-09-7342  6011237648640631  Amanda  Morris      F   
4           4  750-09-7342  6011237648640631  Amanda  Morris      F   

             street   city state    zip  ...  \
0  144 Bowers Route  Tracy    CA  95376  ...   
1  144 Bowers Route  Tracy    CA  95376  ...   
2  144 Bowers Route  Tracy    CA  95376  ...   
3  144 Bowers Route  Tracy    CA  95376  ...   
4  144 Bowers Route  Tracy    CA  95376  ...   

                          trans_num  trans_date  trans_time   unix_time  \
0  52226c099d060def61a7e0d3de214e4b  2020-02-15    02:48:17  1581715097   
1  6495d95c66d7d504d7352f2b8c105d3c  2020-02-15    01:29:36  1581710376   
2  bc0bfcd9e013

In [6]:
# Check for the target variable (assuming 'Class' for fraud)
if 'Class' not in df.columns:
    print("\nWARNING: 'Class' column (target variable for fraud) not found.")
    print("Please ensure your fraud target column is named 'Class' or adjust the code.")
else:
    print("\nFraud Class Distribution:")
    print(df['Class'].value_counts())
    print(f"Fraud Percentage: {df['Class'].value_counts(normalize=True)[1]*100:.4f}%")


# --- Uniting Tables (N/A for single file) ---
print("\n--- Uniting Tables: Not applicable for a single CSV file. ---")


# --- Initial Data Cleaning and Transformation ---
print("\n--- Initial Data Cleaning and Transformation ---")

# The 'Time' column usually represents the seconds elapsed between the first transaction in the dataset and the current transaction.
# It's not a datetime object directly, but we can transform it or use it as is.
# For now, we'll keep it as is, or you might consider scaling it later.
# If you wanted to create 'real' time features, you'd need a reference start_datetime.
# For simplicity in this dataset, 'Time' is usually treated as a numerical feature.

# There are typically no text fields in this dataset, so text cleaning is not needed.
print("No specific text cleaning required for this dataset's columns.")

# No large categories to reduce as most features are numerical (V1-V28, Amount, Time).
print("No large categorical features to reduce in this dataset.")

# The dataset is often pre-processed with anonymized features (V1-V28).
# These are usually already scaled. The 'Amount' feature, however, typically is not.
# We will typically scale 'Amount' later in the Feature Engineering/Scaling stage.

# --- Check for Missing Values (initial check) ---
print("\n--- Initial Missing Value Check ---")
missing_values_count = df.isnull().sum()
missing_values_percentage = (df.isnull().sum() / len(df)) * 100
missing_info = pd.DataFrame({
    'Missing Count': missing_values_count,
    'Missing %': missing_values_percentage
})
missing_info = missing_info[missing_info['Missing Count'] > 0].sort_values(by='Missing %', ascending=False)

if not missing_info.empty:
    print("Columns with Missing Values:\n", missing_info)
else:
    print("No missing values found in the dataset at this stage.")


print("\n--- Data Preparation Complete ---")
print("Shape of final prepared DataFrame:", df.shape)
print("Columns in final DataFrame:\n", df.columns.tolist())
print("First 5 rows of the prepared DataFrame:\n", df.head())

# The 'df' DataFrame is now ready for EDA and further processing.


Please ensure your fraud target column is named 'Class' or adjust the code.

--- Uniting Tables: Not applicable for a single CSV file. ---

--- Initial Data Cleaning and Transformation ---
No specific text cleaning required for this dataset's columns.
No large categorical features to reduce in this dataset.

--- Initial Missing Value Check ---
No missing values found in the dataset at this stage.

--- Data Preparation Complete ---
Shape of final prepared DataFrame: (4033120, 27)
Columns in final DataFrame:
 ['Unnamed: 0', 'ssn', 'cc_num', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'acct_num', 'profile', 'trans_num', 'trans_date', 'trans_time', 'unix_time', 'category', 'amt', 'is_fraud', 'merchant', 'merch_lat', 'merch_long']
First 5 rows of the prepared DataFrame:
    Unnamed: 0          ssn            cc_num   first    last gender  \
0           0  750-09-7342  6011237648640631  Amanda  Morris      F   
1           1  750-09-

In [7]:
df.to_pickle("cleansing_df.pkl")

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4033120 entries, 0 to 34636377
Data columns (total 27 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Unnamed: 0  int64  
 1   ssn         object 
 2   cc_num      int64  
 3   first       object 
 4   last        object 
 5   gender      object 
 6   street      object 
 7   city        object 
 8   state       object 
 9   zip         int64  
 10  lat         float64
 11  long        float64
 12  city_pop    int64  
 13  job         object 
 14  dob         object 
 15  acct_num    int64  
 16  profile     object 
 17  trans_num   object 
 18  trans_date  object 
 19  trans_time  object 
 20  unix_time   int64  
 21  category    object 
 22  amt         float64
 23  is_fraud    int64  
 24  merchant    object 
 25  merch_lat   float64
 26  merch_long  float64
dtypes: float64(5), int64(7), object(15)
memory usage: 861.6+ MB
