## Data Exploration

In this notebook, we use Python to explore the available data. The results are also compared against SQL query results for validation.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Resolve a data-set key to the CSV file it refers to
def set_csv(input_type):
    """Return the CSV path registered for ``input_type``.

    Args:
        input_type: Data-set key; one of "data-set-1" or "data-set-2".

    Returns:
        The relative path string mapped to the given key.

    Raises:
        ValueError: If ``input_type`` is not a registered key.
    """
    csv_paths = {
        "data-set-1": "../raw-data/fraud-data-1.csv",
        "data-set-2": "../raw-data/fraud-data-2.csv",
    }

    selected_path = csv_paths.get(input_type)
    if selected_path is None:
        # No registered path is None, so None here means the key is unknown
        raise ValueError(f"Invalid type '{input_type}'. Expected one of {list(csv_paths)}")

    return selected_path

In [None]:
# DO THIS: Set csv_name to "data-set-1" or "data-set-2"
csv_name = "data-set-1"

try:
    # set_csv raises ValueError for an unknown key; the message is printed
    # below instead of stopping the notebook with a traceback
    df = pd.read_csv(set_csv(csv_name))
except ValueError as e:
    print(e)
else:
    # Confirm which file was loaded
    print(f"Successfully read {csv_name.upper()} CSV file. Continuing with analysis...")

In [None]:
# Show a summary of the dataframe (columns, non-null counts, dtypes, memory use)
df.info()

In [None]:
# Display the first 10 rows of the dataframe as a quick sanity check of the raw values
df.head(10)

In [None]:
# Count the distinct values in each column
# (pandas' nunique excludes missing values by default)
df.nunique()

In [None]:
# Normalise every column label: trim surrounding whitespace, lower-case it,
# and replace spaces with underscores (e.g. "Order Date" -> "order_date")
df.columns = [label.strip().lower().replace(' ', '_') for label in df.columns]

# Parse the order date strings into datetime values
df['order_date'] = pd.to_datetime(df['order_date'])

In [None]:
if csv_name == "data-set-1":
    # Boolean masks reused by the checks below
    was_flagged = df['flagged_as_fraud'] == 'Yes'
    not_flagged = df['flagged_as_fraud'] == 'No'
    was_approved = df['result'] == 'Approved'
    had_chargeback = df['chargeback_(yes/no)'] == 'Yes'
    no_chargeback = df['chargeback_(yes/no)'] == 'No'

    # False positive: flagged as fraud, yet approved and never charged back
    df['false_positive'] = was_flagged & was_approved & no_chargeback

    # False negative: not flagged as fraud, but ended in a chargeback
    df['false_negative'] = not_flagged & had_chargeback

    print("False Positives:", df['false_positive'].sum())
    print("False Negatives:", df['false_negative'].sum())

    # Only the orders that resulted in a chargeback
    chargeback_rows = df[had_chargeback]

    # Chargebacks broken down by payment method
    payment_type = chargeback_rows['payment_method'].value_counts()
    print("Payment method chargeback count:", payment_type)

    # Mean fraud score within each result category
    average_score = df.groupby('result')['fraud_score'].mean()
    print("Average fraud score by result:", average_score)

    # Chargebacks broken down by agent, most frequent first
    agents = chargeback_rows['agent_id'].value_counts()
    print("Agent chargeback count:", agents)

    # Box plot of the fraud score for each result category
    ax = sns.boxplot(data=df, x='result', y='fraud_score')
    ax.set_title('Fraud Score Distribution by Result')
    plt.show()

In [None]:
if csv_name == "data-set-2":

    # 1. Country mismatch: any pair among issuer, billing and IP country differs
    issuer = df['issuer_country']
    billing = df['billing_country']
    ip = df['ip_country']
    df['mismatched_countries'] = (issuer != billing) | (issuer != ip) | (billing != ip)

    # 2. Last-minute booking: made 1 day or less before check-in/departure
    lead_days = df['booking_days_before_checkin/departure'].astype(float)
    df['last_minute_booking'] = lead_days <= 1

    # 3. High value: amount strictly above the 95th percentile
    amounts = df['amount_usd'].astype(float)
    high_value_threshold = amounts.quantile(0.95)
    df['high_value'] = amounts > high_value_threshold

    # 4. International one-way flight: one-way with different departure and arrival countries
    is_one_way = df['flight_type'] == 'one_way'
    crosses_border = df['flight_departure_country'] != df['flight_arrival_country']
    df['intl_one_way'] = is_one_way & crosses_border

    # 5. Combined risk flag: at least two of the four signals are set
    signal_columns = ['mismatched_countries', 'last_minute_booking', 'high_value', 'intl_one_way']
    df['potential_fraud'] = df[signal_columns].sum(axis=1) >= 2

    # Summary outputs: number of rows raising each flag
    summary_rows = (
        ("Mismatched countries:", 'mismatched_countries'),
        ("Last-minute bookings:", 'last_minute_booking'),
        ("High-value transactions:", 'high_value'),
        ("International one-way flights:", 'intl_one_way'),
        ("High-risk transactions (2+ signals):", 'potential_fraud'),
    )
    for label, column in summary_rows:
        print(label, df[column].sum())
