In [None]:
# Install imbalanced-learn
!pip install imbalanced-learn
!pip install pandas
!pip install matplotlib
!pip install seaborn
!pip install scikit-learn
!pip install xgboost
!pip install lightgbm
!pip install catboost


In [2]:
import pandas as pd
import numpy as np

# Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing Libraries
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Model Selection and Evaluation
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')


In [None]:
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

print("Libraries imported successfully!")

In [4]:
# Pandas display configuration: never truncate rows, columns or cell contents,
# and let a printed line be up to 1000 characters wide.
for option_name, option_value in (
    ('display.max_columns', None),   # show all columns
    ('display.width', 1000),
    ('display.max_rows', None),      # show all rows
    ('display.max_colwidth', None),  # show the full content of each cell
):
    pd.set_option(option_name, option_value)

In [None]:
# Load the transaction and identity tables for the train and test splits.
# (The original note says they are combined on 'TransactionID'; that join is not in this cell.)
#
# The data directory defaults to the location the notebook was written against, but can be
# redirected with the FRAUD_DATA_DIR environment variable so the notebook runs elsewhere.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get(
    'FRAUD_DATA_DIR',
    '/Users/manideepreddyaliminati/Documents/ASUCourses/CSE572/notebook',
))

# TRAIN
train_transaction = pd.read_csv(DATA_DIR / 'train_transaction.csv')
train_identity = pd.read_csv(DATA_DIR / 'train_identity.csv')

# TEST
test_transaction = pd.read_csv(DATA_DIR / 'test_transaction.csv')
test_identity = pd.read_csv(DATA_DIR / 'test_identity.csv')

# Read relative to the current working directory, as in the original cell.
submission_template = pd.read_csv("submission_template.csv")

In [None]:
# Show the first five rows of each raw table under its caption.
raw_table_previews = (
    ("Train Transaction Data:", train_transaction),
    ("Train Identity Data:", train_identity),
    ("test Transaction Data:", test_transaction),
    ("test Identity Data:", test_identity),
)
for caption, table in raw_table_previews:
    print(caption)
    display(table.head(5))

In [7]:
# Collect every train column whose name begins with 'V' and remove those
# columns from both the train and the test transaction tables.
columns_to_remove = list(
    filter(lambda name: name.startswith('V'), train_transaction.columns)
)
train_transaction = train_transaction.drop(columns=columns_to_remove)
test_transaction = test_transaction.drop(columns=columns_to_remove)

In [None]:
# Null count per train-transaction column, restricted to columns that have at least one null.
missing_cols = train_transaction.isnull().sum().loc[lambda counts: counts > 0]

# Share of rows (in percent) that are null for each of those columns.
missing_percentage = missing_cols.div(len(train_transaction)).mul(100)
missing_data = missing_cols.to_frame('Missing Values').assign(Percentage=missing_percentage)

# Worst-affected columns first.
print("Columns with Missing Values:")
display(missing_data.sort_values('Percentage', ascending=False))

In [None]:
# Null count per test-transaction column, restricted to columns that have at least one null.
missing_cols = test_transaction.isnull().sum().loc[lambda counts: counts > 0]

# Share of rows (in percent) that are null for each of those columns.
missing_percentage = missing_cols.div(len(test_transaction)).mul(100)
missing_data = missing_cols.to_frame('Missing Values').assign(Percentage=missing_percentage)

# Worst-affected columns first.
print("Columns with Missing Values:")
display(missing_data.sort_values('Percentage', ascending=False))

In [10]:
# Derive time features from TransactionDT, which this cell treats as a count of seconds.
# The same three columns are added to both tables (each was previously computed twice).
SECONDS_PER_HOUR = 3600
SECONDS_PER_DAY = 3600 * 24

for frame in (train_transaction, test_transaction):
    # Hour of the day (0-23)
    frame['hour'] = (frame['TransactionDT'] // SECONDS_PER_HOUR) % 24
    # Relative day count since the TransactionDT origin
    frame['day'] = frame['TransactionDT'] // SECONDS_PER_DAY
    # Position within a 7-day cycle (0-6).
    # NOTE(review): the original comment labelled 0 as Monday; that only holds if the
    # TransactionDT origin falls on a Monday — confirm before relying on weekday names.
    frame['day_of_week'] = frame['day'] % 7


In [None]:
# Number of equal-width bins the relative day count is cut into.
N_DAY_BINS = 3

# NOTE(review): pd.cut derives the bin edges from each table's own 'day' range, so the
# same bin index can cover different absolute days in train and test — confirm intended.
train_transaction['day_bin'] = pd.cut(train_transaction['day'], bins=N_DAY_BINS, labels=False)
test_transaction['day_bin'] = pd.cut(test_transaction['day'], bins=N_DAY_BINS, labels=False)

# Calculate fraud percentage for each day of the week
fraud_rate_dow = train_transaction.groupby('day_of_week')['isFraud'].mean() * 100

# Plot fraud percentage by day of the week
plt.figure(figsize=(10, 6))
sns.barplot(x=fraud_rate_dow.index, y=fraud_rate_dow, palette='muted')
plt.title('Percentage of Fraud Transactions by Day of the Week')
plt.xlabel('Day of the Week (0 = Monday, 6 = Sunday)')
plt.ylabel('Fraud Percentage (%)')
plt.show()

# Calculate fraud percentage for each day bin
fraud_rate_day_bin = train_transaction.groupby('day_bin')['isFraud'].mean() * 100

# Plot fraud percentage by day bin.
# No `palette` here: without a `hue` variable seaborn ignores it for line plots.
plt.figure(figsize=(18, 6))
sns.lineplot(x=fraud_rate_day_bin.index, y=fraud_rate_day_bin, marker='o', linestyle='-')
# The title reports the bin count actually used (it previously claimed 100 bins).
plt.title(f'Percentage of Fraud Transactions by Day Bins ({N_DAY_BINS} Bins)')
plt.xlabel('Day Bin')
plt.ylabel('Fraud Percentage (%)')
plt.show()


# Calculate fraud percentage for each hour of the day
fraud_rate_hour = train_transaction.groupby('hour')['isFraud'].mean() * 100

# Plot fraud percentage by hour
plt.figure(figsize=(12, 6))
sns.barplot(x=fraud_rate_hour.index, y=fraud_rate_hour, palette='coolwarm')
plt.title('Percentage of Fraud Transactions by Hour of the Day')
plt.xlabel('Hour of the Day (0-23)')
plt.ylabel('Fraud Percentage (%)')
plt.show()



In [None]:
# Smallest and largest TransactionAmt in each table, then across both tables.
train_amounts = train_transaction['TransactionAmt']
test_amounts = test_transaction['TransactionAmt']

train_min, train_max = train_amounts.min(), train_amounts.max()
test_min, test_max = test_amounts.min(), test_amounts.max()

overall_min = min(train_min, test_min)
overall_max = max(train_max, test_max)

# Bare tuple as the last expression so the notebook displays the overall range.
(overall_min, overall_max)

In [None]:
# Bin edges for TransactionAmt (intervals are right-closed; the first also includes 0)
# and the label of each bin, ordered from smallest to largest amounts.
bins = [0, 15, 50, 75, 200, 1000, 3000, 5000, 40000]
labels = ['Micro', 'Small', 'Medium', 'Large-Low', 'Large-Mid', 'Large-Upper', 'High-Low', 'High-Upper']

train_transaction['TransactionAmt_bin'] = pd.cut(
    train_transaction['TransactionAmt'], bins=bins, labels=labels, include_lowest=True
)
test_transaction['TransactionAmt_bin'] = pd.cut(
    test_transaction['TransactionAmt'], bins=bins, labels=labels, include_lowest=True
)

print(train_transaction['TransactionAmt_bin'].value_counts())
print(test_transaction['TransactionAmt_bin'].value_counts())

# Encode each bin by its position in `labels` (Micro=0 ... High-Upper=7).
# pd.cut returns an ordered categorical, so `.cat.codes` keeps the amount ordering that an
# alphabetical LabelEncoder would scramble, uses the same mapping for train and test, and
# maps amounts outside the bin range (NaN bin) to -1 instead of raising on an unseen label.
for frame in (train_transaction, test_transaction):
    frame['TransactionAmt_bin_encoded'] = frame['TransactionAmt_bin'].cat.codes.astype('int64')

# Check the encoded values
print(train_transaction[['TransactionAmt_bin', 'TransactionAmt_bin_encoded']].head())
print(test_transaction[['TransactionAmt_bin', 'TransactionAmt_bin_encoded']].head())





In [None]:
# The fourteen C features: 'C1' through 'C14'.
c_feat = [f'C{number}' for number in range(1, 15)]

# Absolute pairwise correlations between the C features (train data only).
corr_matrix = train_transaction[c_feat].corr().abs()

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, square=True)
plt.title('Correlation Heatmap of Selected Features')
plt.show()

# Keep only the cells strictly above the diagonal so each pair appears once;
# everything else becomes NaN and can never pass the threshold test below.
above_diagonal = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
upper_triangle = corr_matrix.where(above_diagonal)

# Gather every (first, second, correlation) triple whose correlation exceeds 0.9.
high_corr_pairs = []
for first_name in upper_triangle.columns:
    for second_name in upper_triangle.index:
        pair_value = upper_triangle.loc[first_name, second_name]
        if pair_value > 0.9:
            high_corr_pairs.append((first_name, second_name, pair_value))

# Report the strongly correlated pairs.
print("Highly Correlated Column Pairs (C1 to C14, correlation > 0.9):")
for first_name, second_name, pair_value in high_corr_pairs:
    print(f"{first_name} - {second_name}: {pair_value:.2f}")


In [None]:
def auto_drop_high_corr_columns(data, features, threshold=0.9):
    """Return the features that are strongly correlated with an earlier-listed feature.

    Args:
        data: DataFrame holding the columns named in `features`.
        features: Column names to compare, in priority order.
        threshold: A feature is reported when its absolute correlation with any
            feature listed before it is strictly greater than this value.

    Returns:
        List of column names to drop, in the order they appear in `features`.
    """
    abs_corr = data[features].corr().abs()

    # Mask that is True strictly above the diagonal: column j then only "sees"
    # the features that come before it.
    above_diagonal = np.triu(np.ones(abs_corr.shape), k=1).astype(bool)
    upper = abs_corr.where(above_diagonal)

    # A column is flagged if any earlier feature exceeds the threshold (NaN never does).
    flagged = (upper > threshold).any(axis=0)
    return flagged[flagged].index.tolist()

# The fourteen C features: 'C1' through 'C14'.
c_feat = [f'C{number}' for number in range(1, 15)]

# Features that are >0.9 correlated (in train) with an earlier C feature.
columns_to_drop = auto_drop_high_corr_columns(train_transaction, c_feat, threshold=0.9)

print(columns_to_drop)

# Remove them from both tables; errors='ignore' skips any name a table does not have.
for frame in (train_transaction, test_transaction):
    frame.drop(columns=columns_to_drop, errors='ignore', inplace=True)

In [None]:
# Percentage of null values per train-transaction column.
missing_percentage = train_transaction.isnull().mean().mul(100)

print(missing_percentage)

# Columns that are more than 50% null in train are removed from both tables.
features_to_drop = missing_percentage.index[missing_percentage > 50].tolist()
print(f"Features to Drop: {features_to_drop}")

# errors='ignore' skips any listed name that a table does not contain.
for frame in (train_transaction, test_transaction):
    frame.drop(columns=features_to_drop, errors='ignore', inplace=True)

In [None]:
# Impute missing values in the transaction tables.
#
# Two deliberate choices:
#   * Every fill value is computed from the TRAIN table and reused for the test table, so
#     both tables are imputed consistently and nothing is learned from test data.
#   * Results are assigned back to the column. `df[col].fillna(..., inplace=True)` acts on a
#     temporary selection and does not update `df` under pandas Copy-on-Write.

# 1. D columns: fill with the train mean.
for col in ['D1', 'D2', 'D3', 'D4', 'D10', 'D11', 'D15']:
    fill_value = train_transaction[col].mean()
    train_transaction[col] = train_transaction[col].fillna(fill_value)
    test_transaction[col] = test_transaction[col].fillna(fill_value)

# 2. M columns, 3. card columns, 5. P_emaildomain: fill with the train mode
#    (most frequent value).
mode_filled_columns = (
    ['M1', 'M2', 'M3', 'M4', 'M6']
    + ['card2', 'card3', 'card4', 'card5', 'card6']
    + ['P_emaildomain']
)
for col in mode_filled_columns:
    fill_value = train_transaction[col].mode()[0]
    train_transaction[col] = train_transaction[col].fillna(fill_value)
    test_transaction[col] = test_transaction[col].fillna(fill_value)

# 4. Address fields: mark missing entries with the placeholder 'unknown'.
for col in ['addr1', 'addr2']:
    train_transaction[col] = train_transaction[col].fillna('unknown')
    test_transaction[col] = test_transaction[col].fillna('unknown')

# Verify no nulls remain in these columns
print(train_transaction[['D1', 'M1', 'card2', 'addr1', 'P_emaildomain']].isnull().sum())
print(test_transaction[['D1', 'M1', 'card2', 'addr1', 'P_emaildomain']].isnull().sum())


In [18]:
# # import pandas as pd
# # from sklearn.impute import KNNImputer
# # from sklearn.preprocessing import StandardScaler

# # # Select only the relevant columns for imputation
# # train_addr = train_transaction[['addr1', 'addr2']]
# # test_addr = test_transaction[['addr1', 'addr2']]

# # # Standardize the columns to ensure KNN works effectively
# # scaler = StandardScaler()
# # train_addr_scaled = scaler.fit_transform(train_addr)
# # test_addr_scaled = scaler.transform(test_addr)

# # # Initialize the KNN imputer with a specified number of neighbors
# # imputer = KNNImputer(n_neighbors=5)

# # # Impute missing values
# # train_addr_imputed = imputer.fit_transform(train_addr_scaled)
# # test_addr_imputed = imputer.transform(test_addr_scaled)

# # # Assign the imputed values back to the original columns
# # train_transaction[['addr1', 'addr2']] = scaler.inverse_transform(train_addr_imputed)
# # test_transaction[['addr1', 'addr2']] = scaler.inverse_transform(test_addr_imputed)

# import pandas as pd
# from sklearn.impute import KNNImputer
# from sklearn.preprocessing import StandardScaler, LabelEncoder

# # Make copies of the data to avoid modifying the original directly
# train_transaction = train_transaction.copy()
# test_transaction = test_transaction.copy()

# # Step 1: Label encode P_emaildomain to handle it as a numeric value
# email_encoder = LabelEncoder()

# # Fit the encoder on the combined dataset to handle any unique values in either dataset
# combined_emaildomain = pd.concat([train_transaction['P_emaildomain'], test_transaction['P_emaildomain']])
# email_encoder.fit(combined_emaildomain.dropna())

# # Transform P_emaildomain in both train and test, setting NaN where originally missing
# train_transaction['P_emaildomain_encoded'] = email_encoder.transform(
#     train_transaction['P_emaildomain'].fillna('unknown')
# )
# test_transaction['P_emaildomain_encoded'] = email_encoder.transform(
#     test_transaction['P_emaildomain'].fillna('unknown')
# )

# # Revert any imputed 'unknown' back to NaN to prepare for KNNImputer
# train_transaction.loc[train_transaction['P_emaildomain'].isnull(), 'P_emaildomain_encoded'] = pd.NA
# test_transaction.loc[test_transaction['P_emaildomain'].isnull(), 'P_emaildomain_encoded'] = pd.NA

# # Step 2: Select columns for imputation and scale them
# train_impute_data = train_transaction[['addr1', 'addr2', 'P_emaildomain_encoded']]
# test_impute_data = test_transaction[['addr1', 'addr2', 'P_emaildomain_encoded']]

# # Standardize the data
# scaler = StandardScaler()
# train_scaled = scaler.fit_transform(train_impute_data)
# test_scaled = scaler.transform(test_impute_data)

# # Step 3: Apply KNN Imputer
# imputer = KNNImputer(n_neighbors=5)
# train_imputed = imputer.fit_transform(train_scaled)
# test_imputed = imputer.transform(test_scaled)

# # Step 4: Inverse transform the scaled data back to original scale
# train_imputed_data = scaler.inverse_transform(train_imputed)
# test_imputed_data = scaler.inverse_transform(test_imputed)

# # Replace the original columns with imputed values
# train_transaction[['addr1', 'addr2', 'P_emaildomain_encoded']] = train_imputed_data
# test_transaction[['addr1', 'addr2', 'P_emaildomain_encoded']] = test_imputed_data

# # Step 5: Decode the P_emaildomain from encoded values back to original categories
# train_transaction['P_emaildomain'] = email_encoder.inverse_transform(
#     train_transaction['P_emaildomain_encoded'].round().astype(int)
# )
# test_transaction['P_emaildomain'] = email_encoder.inverse_transform(
#     test_transaction['P_emaildomain_encoded'].round().astype(int)
# )

# # Drop temporary encoded columns
# train_transaction.drop(columns=['P_emaildomain_encoded'], inplace=True)
# test_transaction.drop(columns=['P_emaildomain_encoded'], inplace=True)


In [None]:
# For train, then test: list the columns that still contain at least one null.
for frame in (train_transaction, test_transaction):
    missing_percentage = frame.isnull().mean() * 100
    still_missing = missing_percentage[missing_percentage > 0]
    print(still_missing.index.tolist())

In [20]:
# Fill the remaining nulls in these test-table C columns with the sentinel -1.
# The result is assigned back: `df[col].fillna(..., inplace=True)` acts on a temporary
# selection and does not update `df` under pandas Copy-on-Write.
for col in ['C1', 'C3', 'C5', 'C13']:
    test_transaction[col] = test_transaction[col].fillna(-1)

In [21]:
# import pandas as pd
# from sklearn.impute import KNNImputer
# from sklearn.preprocessing import StandardScaler

# # Step 1: Select the columns for KNN imputation
# columns_to_impute = ['C1', 'C3', 'C5', 'C13']
# test_data_impute = test_transaction[columns_to_impute]

# # Step 2: Standardize the selected columns for KNN compatibility
# scaler = StandardScaler()
# test_data_scaled = scaler.fit_transform(test_data_impute)

# # Step 3: Apply KNN Imputer
# imputer = KNNImputer(n_neighbors=5)
# test_data_imputed = imputer.fit_transform(test_data_scaled)

# # Step 4: Inverse transform the scaled data back to original scale
# test_data_imputed = scaler.inverse_transform(test_data_imputed)

# # Replace the original columns with the imputed values
# test_transaction[columns_to_impute] = test_data_imputed


In [22]:
from sklearn.preprocessing import LabelEncoder

def _encode_train_test_columns(columns):
    """Label-encode `columns` of train_transaction and test_transaction in place.

    For each column one encoder is fitted on the train and test values together and then
    applied to both tables, so a given category gets the same integer code in train and in
    test. (Fitting separately on each table assigns codes by each table's own sorted set of
    values, which can disagree whenever the two sets differ.)

    Values are compared as strings (`astype(str)`).

    Returns:
        The LabelEncoder used, left fitted on the last column in `columns`.
    """
    shared_encoder = LabelEncoder()
    for col in columns:
        train_values = train_transaction[col].astype(str)
        test_values = test_transaction[col].astype(str)
        shared_encoder.fit(pd.concat([train_values, test_values], ignore_index=True))
        train_transaction[col] = shared_encoder.transform(train_values)
        test_transaction[col] = shared_encoder.transform(test_values)
    return shared_encoder


# M match flags and card type columns.
label_encoder = _encode_train_test_columns(['M1', 'M2', 'M3', 'M4', 'M6'])
encoder = _encode_train_test_columns(['card4', 'card6'])

# Replace the 'unknown' placeholder with -1 before encoding the address columns
train_transaction[['addr1', 'addr2']] = train_transaction[['addr1', 'addr2']].replace('unknown', -1)
test_transaction[['addr1', 'addr2']] = test_transaction[['addr1', 'addr2']].replace('unknown', -1)

# Address columns are encoded as categories (string labels), not as numbers.
encoder = _encode_train_test_columns(['addr1', 'addr2'])



In [None]:
import pandas as pd

# Email-domain categories. Each key becomes a 0/1 indicator column; a domain may belong
# to several categories at once (e.g. "yahoo.de" is both a free provider and country specific).
categories = {
    "Free_Email_Providers": {
        "gmail.com", "gmail", "outlook.com", "hotmail.com", "live.com", "live.com.mx",
        "live.fr", "hotmail.es", "hotmail.fr", "hotmail.de", "hotmail.co.uk", 
        "outlook.es", "yahoo.com", "ymail.com", "rocketmail.com", "yahoo.fr",
        "yahoo.de", "yahoo.es", "yahoo.co.uk", "yahoo.co.jp", "yahoo.com.mx", 
        "aol.com", "mail.com"
    },
    "ISP_Email_Domains": {
        "verizon.net", "comcast.net", "optonline.net", "cox.net", "charter.net", 
        "sbcglobal.net", "bellsouth.net", "earthlink.net", "windstream.net", 
        "frontiernet.net", "frontier.com", "centurylink.net", "suddenlink.net", 
        "cableone.net", "twc.com", "cfl.rr.com", "sc.rr.com", "roadrunner.com", 
        "att.net"
    },
    "Legacy_Email": {
        "hotmail.com", "msn.com", "aol.com", "juno.com", "prodigy.net.mx", 
        "embarqmail.com", "netzero.net", "netzero.com", "aim.com", "q.com"
    },
    "Business_Email_Domains": {
        "servicios-ta.com"
    },
    "Private_Emails": {
        "protonmail.com", "anonymous.com"
    },
    "Country_Specific": {
        "gmx.de", "web.de", "yahoo.de", "hotmail.de", 
        "yahoo.fr", "hotmail.fr", "live.fr", 
        "yahoo.es", "hotmail.es", 
        "yahoo.co.uk", "hotmail.co.uk", 
        "prodigy.net.mx", "yahoo.com.mx", "live.com.mx"
    },
    "Apple_Domains": {
        "me.com", "icloud.com", "mac.com"
    },
    "Miscellaneous_Providers": {
        "ptd.net", "embarqmail.com", "rocketmail.com"
    }
}

# Domain suffixes that mark a domain as country specific even when it is not listed above.
_COUNTRY_SUFFIXES = ('.de', '.fr', '.es', '.co.uk', '.mx')


def classify_email_domain(domain):
    """Classify an email domain into one or more predefined categories.

    Args:
        domain: Domain string (e.g. "gmail.com"); expected to be lower-cased by the caller.

    Returns:
        Set of category names. It contains every key of `categories` whose domain set
        includes `domain`, plus "Country_Specific" when the domain ends with one of the
        country suffixes. If nothing matches, the set is {"Unknown_or_Private_Domain"}.
    """
    matched_categories = {
        category for category, domains in categories.items() if domain in domains
    }

    # Suffix rule. It previously compared against "Country_Specific_Email_Providers",
    # which is not a key of `categories`, so it could never fire.
    if "Country_Specific" in categories and domain.endswith(_COUNTRY_SUFFIXES):
        matched_categories.add("Country_Specific")

    if not matched_categories:
        matched_categories.add("Unknown_or_Private_Domain")
    return matched_categories

def classify_emails_in_df(df, email_column):
    """Add binary-encoded category columns to the DataFrame based on email domains.

    One int64 0/1 column is written per key of `categories`, plus
    "Unknown_or_Private_Domain" and "Invalid_Email_Format" (set for rows whose
    `email_column` value is missing).

    Each distinct domain is classified once and the indicator columns are filled with
    vectorised membership tests. This avoids a Python-level pass over every row, works
    when the index has duplicate labels, and leaves any existing 'domain' column alone.

    Args:
        df: DataFrame to annotate; it is modified in place.
        email_column: Name of the column holding the email domain strings.

    Returns:
        The same `df` object, with the category columns added.
    """
    invalid_label = "Invalid_Email_Format"

    # Lower-cased domains; missing values are routed to the invalid-format category.
    domains = df[email_column].str.lower().fillna(invalid_label)

    all_categories = list(categories.keys()) + ["Unknown_or_Private_Domain", invalid_label]

    # Classify each distinct domain exactly once.
    categories_by_domain = {
        domain: {invalid_label} if domain == invalid_label else classify_email_domain(domain)
        for domain in domains.unique()
    }

    for category in all_categories:
        member_domains = [
            domain for domain, matched in categories_by_domain.items() if category in matched
        ]
        df[category] = domains.isin(member_domains).astype('int64')

    return df

# Apply the function to classify email domains
train_transaction = classify_emails_in_df(train_transaction, 'P_emaildomain')
test_transaction = classify_emails_in_df(test_transaction, 'P_emaildomain')

# Preview the updated tables. Only the head is shown: with display.max_rows set to None,
# printing the whole DataFrame would write every row into the cell output.
display(train_transaction.head())
print("--------")
display(test_transaction.head())


In [None]:
# Columns removed by hand from both transaction tables.
manual_cols_to_drop = [
    'ProductCD',
    'P_emaildomain',
    'TransactionDT',
    'day',
    'TransactionAmt_bin',
    'TransactionAmt',
]

# errors='ignore' skips any listed name that a table does not contain.
for frame in (train_transaction, test_transaction):
    frame.drop(columns=manual_cols_to_drop, errors='ignore', inplace=True)

# Show which columns are left in each table.
for caption, frame in (
    ("Remaining columns in Train Data:", train_transaction),
    ("Remaining columns in test Data:", test_transaction),
):
    print(caption)
    print(frame.columns)

In [None]:
# For train, then test: list the columns whose dtype is still object or category.
for caption, frame in (
    ("Categorical Columns in Train Transaction Data:", train_transaction),
    ("Categorical Columns in Test Transaction Data:", test_transaction),
):
    categorical_cols = frame.select_dtypes(include=['object', 'category']).columns
    print(caption)
    print(list(categorical_cols))


In [None]:
# Absolute correlation matrix of the numeric (and boolean) train columns.
# Restricting the dtypes keeps `corr()` from raising if a non-numeric column is still present.
corr_matrix = train_transaction.select_dtypes(include=['number', 'bool']).corr().abs()
# Set a threshold for high correlation
threshold = 0.75

# Column pairs whose absolute correlation exceeds the threshold.
# Each pair is listed in both orders: (a, b, r) and (b, a, r).
high_corr_pairs = [
    (col1, col2, corr_matrix.loc[col1, col2])
    for col1 in corr_matrix.columns
    for col2 in corr_matrix.columns
    if col1 != col2 and corr_matrix.loc[col1, col2] > threshold
]

# Set up the matplotlib figure
plt.figure(figsize=(12, 8))

# Mask for the upper triangle (diagonal included): the matrix is symmetric, so the
# lower triangle alone shows every pair once.
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Draw the heatmap with the mask applied (it was previously computed but never passed in).
sns.heatmap(corr_matrix, mask=mask, cmap='coolwarm', vmax=1.0, vmin=0.0,
            center=0.5, annot=False, linewidths=0.5, square=True)

plt.title('Correlation Matrix Heatmap', fontsize=15)
plt.show()

print(high_corr_pairs)


In [None]:
# Identity data: for train, then test, tabulate how many values are null in each
# column and what share of the rows that represents.
for identity_frame in (train_identity, test_identity):
    null_counts = identity_frame.isna().sum()
    null_percentage = null_counts.div(len(identity_frame)).mul(100)

    null_summary = null_counts.to_frame('Null Values').assign(Percentage=null_percentage)

    # Columns with the highest null percentage first.
    print("Null Values Summary:")
    display(null_summary.sort_values('Percentage', ascending=False))

In [None]:
# Fill missing DeviceType with the most frequent value.
# The result is assigned back: `df[col].fillna(..., inplace=True)` acts on a temporary
# selection and does not update `df` under pandas Copy-on-Write.
most_frequent_type = train_identity['DeviceType'].mode()[0]
train_identity['DeviceType'] = train_identity['DeviceType'].fillna(most_frequent_type)

# Get mode for DeviceInfo separately for mobile and desktop
mobile_mode = train_identity.loc[train_identity['DeviceType'] == 'mobile', 'DeviceInfo'].mode()[0]
desktop_mode = train_identity.loc[train_identity['DeviceType'] == 'desktop', 'DeviceInfo'].mode()[0]

# Function to fill missing DeviceInfo based on DeviceType
def fill_device_info(row):
    """Return the row's DeviceInfo, substituting a per-DeviceType default when it is null.

    Reads the module-level `mobile_mode` and `desktop_mode` at call time.
    Rows whose DeviceType is neither 'mobile' nor 'desktop' get 'Unknown'.
    """
    if pd.isnull(row['DeviceInfo']):
        if row['DeviceType'] == 'mobile':
            return mobile_mode
        elif row['DeviceType'] == 'desktop':
            return desktop_mode
        else:
            return 'Unknown'  # Handle cases where DeviceType is neither value
    return row['DeviceInfo']

# Apply the function to fill missing DeviceInfo values
train_identity['DeviceInfo'] = train_identity.apply(fill_device_info, axis=1)

# Verify the result
print(train_identity['DeviceInfo'].isnull().sum())  # Should be 0 if all missing values are filled


In [None]:
# Fill missing DeviceType with the most frequent value.
# The result is assigned back: `df[col].fillna(..., inplace=True)` acts on a temporary
# selection and does not update `df` under pandas Copy-on-Write.
most_frequent_type = test_identity['DeviceType'].mode()[0]
test_identity['DeviceType'] = test_identity['DeviceType'].fillna(most_frequent_type)

# Get mode for DeviceInfo separately for mobile and desktop
mobile_mode = test_identity.loc[test_identity['DeviceType'] == 'mobile', 'DeviceInfo'].mode()[0]
desktop_mode = test_identity.loc[test_identity['DeviceType'] == 'desktop', 'DeviceInfo'].mode()[0]

# Function to fill missing DeviceInfo based on DeviceType
def fill_device_info(row):
    """Return the row's DeviceInfo, substituting a per-DeviceType default when it is null.

    Reads the module-level `mobile_mode` and `desktop_mode` at call time, which this cell
    has just recomputed from the test identity table.
    Rows whose DeviceType is neither 'mobile' nor 'desktop' get 'Unknown'.
    """
    if pd.isnull(row['DeviceInfo']):
        if row['DeviceType'] == 'mobile':
            return mobile_mode
        elif row['DeviceType'] == 'desktop':
            return desktop_mode
        else:
            return 'Unknown'  # Handle cases where DeviceType is neither value
    return row['DeviceInfo']

# Apply the function to fill missing DeviceInfo values
test_identity['DeviceInfo'] = test_identity.apply(fill_device_info, axis=1)

# Verify the result
print(test_identity['DeviceInfo'].isnull().sum())  # Should be 0 if all missing values are filled


In [None]:
import re

# Step 1: Function to Extract Device Brand or Name
def _contains_any(*tokens):
    """Build a predicate that is true when any token is a substring of the text."""
    def matches(text):
        return any(token in text for token in tokens)
    return matches


def _starts_with(prefix):
    """Build a predicate that is true when the text begins with `prefix`."""
    def matches(text):
        return text.startswith(prefix)
    return matches


# "mi" only counts as a Xiaomi marker when it is not embedded in a longer word
# (e.g. "mi a1", "mi5"), so "lumia", "mini" or "premium" no longer match.
_STANDALONE_MI = re.compile(r'(?<![a-z])mi(?![a-z])')


def _is_xiaomi(text):
    """True for Redmi/Xiaomi strings or a stand-alone "mi" token."""
    return 'redmi' in text or 'xiaomi' in text or _STANDALONE_MI.search(text) is not None


# Model-code tokens (all lowercase). Exact duplicates and "<code> build/..."
# variants whose bare "<code>" is already listed were removed: they can never
# change the outcome of a substring test.
_ALCATEL_MODELS = (
    '6045i', '5095i', '5080a', '5010g', '8050g', '5025g', '5015a', '5054s', '5056a',
    '5012g', '4047g', '4013m', '5010s', '5085b', '5049w', '9008a', '9003a', '4047a',
    '5051a', '5045i', '6039a', '7048a', '5056n', '5042a', '5017a', '4034g', '4034e',
    '4009f', '4027a', '5057m', '6037b',
)
_SONY_MODELS = (
    'e6853', 'sov33', 'd5803', 'g3313', 'f3213', 'f3113', 'g3223', 'f5121', 'f3313',
    'e6603', 'e2306', 'f8331', 'f3111', 'd6603', 'f5321', 'e5506', 'e5823', 'g3123',
    'c6906', 'e5803', 'e2104', 'e6553', 'd5306', 'd5316', 'e5306', 'g8141', 'g3423',
    'e6683', 'e6633', 'e6653', 'c6903', 'lt22i', 'lt30p', 'sgp521', 'sgp511', 'sgp611',
    'sgp621', 'f8131', 'f5122', 'f8332', 'e2006', 'd6708', 'd6503', 'e5606', 'c6603',
    'g8142', 'g8341', 'e2303', 'd2306 build/18.6.a.0.182',
)
_NOKIA_MODELS = (
    'ta-1039', 'ta-1028', 'ta-1044', 'ta-1025', 'ta-1027', 'ta-1003', 'ta-1004',
    'ta-1032', 'ta-1020', 'ta-1038',
)
_ZTE_MODELS = (
    'z981', 'z982', 'z956', 'z813', 'z798bl', 'z832', 'z799vl', 'z837vl', 'z831',
    'z963vl', 'z965', 'z962bl', 'z557bl', 'z812', 'z839', 'z836bl', 'z955a', 'z970',
    'z959 build/lmy47v', 'z833',
)
_HUAWEI_MODELS = (
    'hi6210sft', 'bln-l21', 'bln-l24', 'dli-l22', 'nem-l51', 'sla-l23', 'sla-l22',
    'sla-l02', 'bnd-l21', 'bnd-l34', 'vtr-l29', 'was-lx2j', 'pe-tl10', 'plk-l01',
    'cam-l23', 'mya-l23', 'mya-l11', 'duk-al20', 'kiw-l24', 'g620s-l03', 'g630-u251',
    'g527-u081',
)
_LG_MODELS = (
    'k88', 'lm-x210(g', 'rs988', 'h1611', 'ls5', 'vk700', 'vk810', 'vk815', 'k90u',
    'p5006a', 'p5046a',
)
_AMAZON_FIRE_MODELS = (
    'kftuwi', 'kfsnwi', 'kfrawi', 'kfrapwi', 'kfquwi', 'kftrwi', 'kftrpwi', 'kfonwi',
    'kfmawi', 'kfmuwi', 'kfkawi', 'kfsuwi', 'kfdowi', 'kfauwi', 'kfgiwi', 'kftbwi',
    'kfmewi', 'kffowi', 'kfsawa', 'kfsawi', 'kfaswi', 'kfarwi', 'kfthwa', 'kfthwi',
    'kfapwa', 'kfapwi', 'kfsowi', 'kfot', 'kftt', 'kfjwa', 'kfjwi', 'kindle fire',
)

# Ordered (label, predicate) rules; the first predicate that matches wins, so
# the order below is part of the behaviour.
_DEVICE_NAME_RULES = (
    # --- brand keywords -----------------------------------------------------
    ('Samsung', _contains_any('samsung', 'sm-', 'gt-', 'sgh-', 'sch-')),
    ('Xiaomi', _is_xiaomi),
    ('Motorola', _contains_any('moto')),  # also covers 'motorola'
    ('Google Pixel', _contains_any('pixel')),
    ('HTC', _contains_any('htc')),
    ('LG', _contains_any('lg')),
    ('Huawei', _contains_any('huawei')),
    ('Apple', _contains_any('iphone', 'ios')),
    ('Google Nexus', _contains_any('nexus')),
    ('Lenovo', _contains_any('lenovo')),
    ('OnePlus', _contains_any('oneplus')),
    ('Windows', _contains_any('windows')),
    ('Alcatel', _contains_any('alcatel', 'one touch', *_ALCATEL_MODELS)),
    ('Sony', _contains_any('sony', 'xperia', *_SONY_MODELS)),
    ('Nokia', _contains_any('nokia', *_NOKIA_MODELS)),
    # Motorola "XT####" codes must be tested before the loose model tokens
    # below: the Asus token 't1' is a substring of every 'xt1...' code.
    ('Motorola', _starts_with('xt')),
    # --- model-code lists ---------------------------------------------------
    ('Blu', _contains_any('blu', 'studio')),
    ('ZTE', _contains_any(*_ZTE_MODELS)),
    ('Huawei', _contains_any(*_HUAWEI_MODELS)),
    # '2ps64 build/nrd90m' used to be listed here; 2PS64 is an HTC code.
    ('OnePlus', _contains_any('one a2005', 'one a2003', 'a0001')),
    ('LG', _contains_any(*_LG_MODELS)),
    ('Oppo', _contains_any('oppo/vivo', 'a37f', 'a96', 'f1f', 'cph1701', 'cph1607', 'cph1723')),
    ('HTC', _contains_any('2ps64', '0pm92', '2pzc5', '0paj5', '0pja2', '2pq93', '2pyb2')),
    ('BlackBerry', _contains_any('stv100-4', 'stv100-1', 'stv100-2', 'stv100-3',
                                 'bbb100-1', 'bbb100-2', 'bbb100-3', 'bba100-1', 'bba100-2')),
    ('Asus', _contains_any('p008', 'p00c', 'p00a', 'me173x', 'me301t', 'p027', 't1', 't08', 'p01m')),
    ('BQ', _contains_any('aquaris', 'fractal')),
    ('LeEco', _contains_any('le x520', 'le x829', 'lex829')),
    ('ZUK', _contains_any('z2')),
    # 'e6810' was also in the Sony list, which shadowed this entry.
    ('Kyocera', _contains_any('e6810', 'kyocera-c6742a', 'e6790tm', 'c6743')),
    ('CAT', _contains_any('s60')),
    ('TP-Link', _contains_any('neffos c5', 'neffos x1 max build/nrd90m')),
    ('Motorola', _contains_any('xt1650', 'xt1575', 'xt1254', 'xt1585')),
    ('RCA', _contains_any('rct6203w46', 'rct6223w87', 'rct6303w87m7', 'rct6s03w12',
                          'rct6k03w13', 'rct6513w87', 'rct6773w22b')),
    ('Lava', _contains_any('iris 870', 'iris 820', 'lava_a3', 'iris702', 'iris50', 'iris80')),
    ('Wiko', _contains_any('pulp 4g', 'highway', 'fever', 'u feel lite build/mra58k')),
    ('Crosscall', _contains_any('trekker-x3', 'trekker-m1')),
    ('Vivo', _contains_any('v502015')),
    ('Dell', _contains_any('venue')),
    ('Lenovo', _contains_any('mot-a6020l37', 'ideatab', 'a1-850', 'a3-a20', 'b1-790',
                             'b1-750', 'b1-810', 'tab2a7-10f', 'yoga', 'ple-701l')),
    ('Blackview', _contains_any('bv7000', 'bv6000', 'bv8000pro')),
    ('Verykool', _contains_any('verykool', 's471')),
    ('Archos', _contains_any('archos')),
    ('Gigaset', _contains_any('gigaset')),
    ('Coolpad', _contains_any('coolpad')),
    ('Leagoo', _contains_any('leagoo kiicaa mix')),
    ('Nvidia', _contains_any('shield')),
    ('Ramos', _contains_any('ramos')),
    # --- late fallbacks -----------------------------------------------------
    ('Oppo/Vivo', _contains_any('oppo', 'vivo')),
    ('Asus', _contains_any('asus')),
    ('ZTE', _contains_any('zte')),
    ('Tablet', _contains_any('tablet')),
    ('Linux', _contains_any('linux')),
    ('Hisense', _contains_any('hisense')),
    ('ZTE', _contains_any('blade', 'z983 ')),
    ('Lanix Ilium', _contains_any('ilium ')),
    ('M4tel', _contains_any('m4 ')),
    ('Amazon Fire', _contains_any(*_AMAZON_FIRE_MODELS)),
    ('LG', _starts_with('vs')),
)


def extract_device_name(device_info):
    """Map a raw DeviceInfo value to a coarse brand / platform label.

    The value is converted with `str()` and lower-cased, then tested against
    `_DEVICE_NAME_RULES` in order; the label of the first matching rule is
    returned.

    Parameters
    ----------
    device_info : any
        Raw DeviceInfo value; non-strings (e.g. NaN) are stringified first.

    Returns
    -------
    str
        A brand/platform label, or 'Other' when no rule matches.

    Notes
    -----
    Fixes relative to the previous if/elif chain:
    * the Alcatel codes '5095i' and '5080a' were fused into one token by a
      missing comma and never matched;
    * a bare 'mi' substring no longer labels unrelated strings as Xiaomi;
    * HTC code '2ps64' and Kyocera code 'e6810' were claimed by the OnePlus
      and Sony lists respectively;
    * 'xt'-prefixed Motorola codes are tested before the loose Asus 't1' token.
    """
    text = str(device_info).lower()
    for label, matches in _DEVICE_NAME_RULES:
        if matches(text):
            return label
    # Default to 'Other' if no meaningful match is found
    return 'Other'

# Step 2: derive the coarse 'device_name' label from DeviceInfo on both splits.
for identity_frame in (train_identity, test_identity):
    identity_frame['device_name'] = identity_frame['DeviceInfo'].apply(extract_device_name)

# Frequency of each label in the train split: number of distinct labels,
# then the ten most common ones.
device_name_counts = train_identity['device_name'].value_counts()

print(len(device_name_counts))
print(device_name_counts.head(10))


In [None]:
# Identity Data

# Per-column null count and null percentage for the train identity table.
null_counts = train_identity.isna().sum()
null_percentage = (null_counts / len(train_identity)) * 100

# Create a DataFrame summarizing the null values
null_summary = pd.DataFrame({'Null Values': null_counts, 'Percentage': null_percentage})

# Display the columns sorted by the percentage of null values (descending)
print("Null Values Summary:")
display(null_summary.sort_values(by='Percentage', ascending=False))

# Identify columns with >95% missing values
columns_to_drop = null_summary[null_summary['Percentage'] > 95].index

# Drop these columns from train.
train_identity.drop(columns=[col for col in columns_to_drop if col in train_identity], inplace=True)

# At this point the test table still names its id columns with hyphens
# ('id-30' rather than 'id_30'), so look each column up under both spellings;
# matching on the train spelling alone would silently drop nothing from test.
test_columns_to_drop = [
    name
    for col in columns_to_drop
    for name in dict.fromkeys((col, col.replace("_", "-")))
    if name in test_identity
]
test_identity.drop(columns=test_columns_to_drop, inplace=True)

print(f"Dropped columns: {list(columns_to_drop)}")

In [None]:
# Show which DeviceType values occur in the train split.
distinct_device_types = train_identity['DeviceType'].unique()
print("Distinct values in 'DeviceType':")
print(distinct_device_types)

# Integer code per device type; values absent from the mapping become NaN.
device_type_mapping = dict(mobile=1, desktop=0, missing=-1)
for identity_frame in (train_identity, test_identity):
    identity_frame['device_type_numeric'] = identity_frame['DeviceType'].map(device_type_mapping)


In [None]:
import re

# Step 1: Extract OS Name
def extract_os_name(os):
    """Return the leading OS family of an OS string, or 'Missing'.

    The stringified value must *start* with one of the known names
    (case-sensitive); anything else, including NaN, yields 'Missing'.
    """
    found = re.match(r'(Android|iOS|Mac OS X|Windows|Linux|other|func)', str(os))
    if found is None:
        return 'Missing'
    return found.group(0)

# Step 2: Extract OS Version
def extract_os_version(os):
    """Return the first version-like number in an OS string, or 'Missing'.

    Up to three numeric groups separated by '.' or '_' are captured
    (e.g. '7.0', '10_11_6', '11.2.1').
    """
    found = re.search(r'(\d+[\._]?\d*[\._]?\d*)', str(os))
    if found is None:
        return 'Missing'
    return found.group(0)

# Derive os_name / os_version from the OS column of each split. The train
# table spells the column 'id_30' while the test table spells it 'id-30'.
for identity_frame, os_column in ((train_identity, 'id_30'), (test_identity, 'id-30')):
    identity_frame['os_name'] = identity_frame[os_column].apply(extract_os_name)
    identity_frame['os_version'] = identity_frame[os_column].apply(extract_os_version)

# Verify the transformed columns
print(train_identity[['id_30', 'os_name', 'os_version']].head(20))

# Step 1: Extract Browser Name
def extract_browser_name(browser):
    """Return the leading browser keyword of a browser string, or 'Missing'.

    The value is stringified and lower-cased, and must *start* with one of the
    known keywords; otherwise (including NaN) 'Missing' is returned.
    """
    lowered = str(browser).lower()
    found = re.match(
        r'(chrome|safari|mobile safari|firefox|edge|opera|samsung browser|ie|android browser|'
        r'google|puffin|waterfox|cyberfox|maxthon|palemoon|iron|silk|facebook|'
        r'chromium|seamonkey|aol|generic)',
        lowered,
    )
    if found is None:
        return 'Missing'
    return found.group(0)

# Step 2: Extract Browser Version
def extract_browser_version(browser):
    """Return the first number (optionally 'major.minor') in a browser string.

    Yields 'Missing' when the stringified value contains no digit.
    """
    found = re.search(r'(\d+[\._]?\d*)', str(browser))
    if found is None:
        return 'Missing'
    return found.group(0)

# Derive browser_name / browser_version from the browser column of each split
# ('id_31' in the train table, 'id-31' in the test table).
for identity_frame, browser_column in ((train_identity, 'id_31'), (test_identity, 'id-31')):
    identity_frame['browser_name'] = identity_frame[browser_column].apply(extract_browser_name)
    identity_frame['browser_version'] = identity_frame[browser_column].apply(extract_browser_version)

# Verify the transformation
print(train_identity[['id_31', 'browser_name', 'browser_version']].head(20))


In [34]:
# Identify numerical and categorical columns (taken from the train split)
num_cols = train_identity.select_dtypes(include=['float64', 'int64']).columns
cat_cols = train_identity.select_dtypes(include=['object']).columns

# Sentinel imputation: -1 for numerical columns, 'Missing' for categorical
# ones. A column is imputed (in both splits) only when the train split has
# nulls in it.
for cols, sentinel in ((num_cols, -1), (cat_cols, 'Missing')):
    for col in cols:
        if train_identity[col].isnull().sum() == 0:
            continue
        # Assign back rather than fillna(inplace=True) on the selected column:
        # the chained in-place form is a no-op under pandas Copy-on-Write.
        train_identity[col] = train_identity[col].fillna(sentinel)
        # The test table may carry the column under the same name (engineered
        # features such as 'device_name') or with hyphens ('id-01'); blindly
        # replacing '_' with '-' raised KeyError for the former.
        test_col = next(
            (name for name in (col, col.replace("_", "-")) if name in test_identity.columns),
            None,
        )
        if test_col is not None:
            test_identity[test_col] = test_identity[test_col].fillna(sentinel)
        

In [None]:
# Raw columns that have been replaced by engineered features above
# (os_*, browser_*, device_name, device_type_numeric) or are not needed.
manual_cols_to_drop = ['id_30', 'id_31', 'id_33', 'DeviceType', 'DeviceInfo']

# Drop the columns that are present. The test table still uses hyphenated
# names at this point, so each name is translated before the lookup.
train_identity.drop(columns=[col for col in manual_cols_to_drop if col in train_identity], inplace=True)
test_identity.drop(columns=[col.replace("_", "-") for col in manual_cols_to_drop if col.replace("_", "-") in test_identity], inplace=True)

# Verify the remaining columns
print("Remaining columns in Train Data:")
print(train_identity.columns)

# The second listing shows the test table; it was previously labelled
# "Train Data" as well.
print("Remaining columns in Test Data:")
print(test_identity.columns)




In [36]:
# Align test column names with the train spelling by turning every '-' into '_'.
test_identity.columns = [name.replace('-', '_') for name in test_identity.columns]

In [None]:
from sklearn.preprocessing import LabelEncoder

# Select all categorical columns
cat_cols = train_identity.select_dtypes(include=['object']).columns

# Label-encode each categorical column with ONE encoder shared by both splits.
# Refitting the encoder on the test split (as before) assigns integer codes
# independently per split, so the same category could receive different codes
# in train and test and the model's learned splits would not transfer.
for col in cat_cols:
    train_values = train_identity[col].astype(str)
    # Test nulls are only filled earlier for columns that had nulls in train,
    # so guard here to keep the encoder input homogeneous.
    test_values = test_identity[col].fillna('Missing').astype(str)

    le = LabelEncoder()
    # Fit on the union of labels so categories that appear only in the test
    # split still get a code instead of raising at transform time.
    le.fit(pd.concat([train_values, test_values], ignore_index=True))
    train_identity[col] = le.transform(train_values)
    test_identity[col] = le.transform(test_values)

# Verify the transformed data
print(train_identity[cat_cols].head())
print(test_identity[cat_cols].head())


# def print_unique_values(df, columns):
#     for col in columns:
#         unique_values = df[col].unique()
#         print(f"Column: {col}, Unique Values: {unique_values}\n")


# print(train_identity, [cat_cols[0]])

In [None]:
# Absolute pairwise correlations between the train identity columns.
corr_matrix = train_identity.corr().abs()

# Heatmap of the full matrix.
plt.figure(figsize=(15, 10))
sns.heatmap(
    corr_matrix,
    cmap='coolwarm',
    annot=True,
    fmt='.2f',
    vmax=1.0,
    vmin=0.0,
    linewidths=0.5,
    square=True,
)
plt.title('Correlation Matrix for Train Identity Data', fontsize=18)
plt.show()

threshold = 0.85

# Positions whose absolute correlation exceeds the threshold. Keeping only the
# strict upper triangle (row < column) removes the diagonal and mirrored pairs.
row_positions, col_positions = np.where(np.abs(corr_matrix) > threshold)
high_corr_pairs = []
for i, j in zip(row_positions, col_positions):
    if i < j:
        high_corr_pairs.append(
            (corr_matrix.index[i], corr_matrix.columns[j], corr_matrix.iloc[i, j])
        )

# Print high-correlation column pairs
print("Highly Correlated Columns:")
for col1, col2, corr_value in high_corr_pairs:
    print(f"{col1} - {col2} : {corr_value:.2f}")

In [None]:
# Columns removed from both identity tables.
columns_to_drop = ['id_32', 'id_34', 'id_35', 'id_28', 'id_29']

# Drop whichever of them each table actually contains.
for identity_frame in (train_identity, test_identity):
    present = [col for col in columns_to_drop if col in identity_frame.columns]
    identity_frame.drop(columns=present, inplace=True)

# Preview the remaining data
print(train_identity.head())
print(test_identity.head())

In [40]:
# Left-join the identity features onto every transaction, keyed on TransactionID.
merged_data = pd.merge(train_transaction, train_identity, how='left', on='TransactionID')
merged_test_data = pd.merge(test_transaction, test_identity, how='left', on='TransactionID')

In [None]:
# Test columns, followed by the train columns without the target.
print(list(merged_test_data.columns))
mdc = list(merged_data.columns)
mdc.remove("isFraud")  # raises ValueError if the target column is absent
print(mdc)

In [None]:
# Columns of the merged test table before filtering.
print(list(merged_test_data.columns))

# Train columns minus the target (left untouched when the target is absent).
mdc = list(merged_data.columns)
try:
    mdc.remove("isFraud")
except ValueError:
    pass

print(mdc)

# Restrict the test table to the columns it shares with the train features.
shared_columns = merged_test_data.columns.intersection(mdc)
merged_test_data = merged_test_data[shared_columns]

# Columns of the filtered test table.
print(list(merged_test_data.columns))


In [None]:
# Report the train columns that still contain nulls, then fill them with -1.
null_counts = merged_data.isna().sum()
print(null_counts.loc[null_counts > 0])
merged_data.fillna(value=-1, inplace=True)

# Same report and sentinel fill for the test table.
null_counts = merged_test_data.isna().sum()
print(null_counts.loc[null_counts > 0])
merged_test_data.fillna(value=-1, inplace=True)



In [44]:
# Feature matrix (everything except the target and the row id) and target.
X = merged_data.drop(columns=['isFraud', 'TransactionID'])
y = merged_data.loc[:, 'isFraud']




In [45]:
# Re-read the raw test transactions and copy their TransactionID column onto
# the merged test table (assignment aligns on the row index).
raw_test_transaction_path = '/Users/manideepreddyaliminati/Documents/ASUCourses/CSE572/notebook/test_transaction.csv'
original_test_data = pd.read_csv(raw_test_transaction_path)
merged_test_data['TransactionID'] = original_test_data['TransactionID']

In [None]:
# Show the final column set of the merged test table.
print(merged_test_data.columns)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV


# Step 1: features (all columns except the target and the row id) and target.
X = merged_data.drop(columns=['isFraud', 'TransactionID'])
y = merged_data['isFraud']

# Step 2: stratified 80/20 train/validation split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Step 3: oversample the training part only with SMOTE; the validation part
# keeps its original class balance.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print(f"Original dataset shape: {X_train.shape}, {y_train.value_counts()}")
print(f"Resampled dataset shape: {X_train_resampled.shape}, {y_train_resampled.value_counts()}")




In [None]:
# hp tuning randomforest
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid explored for the random forest (3*3*3*3 = 81 settings).
param_grid = {
    "n_estimators": [100, 200, 300],
    "max_depth": [5, 10, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# Seed the forest like every other stochastic step in this notebook
# (train_test_split, SMOTE) so the search and the later refit are reproducible.
rf = RandomForestClassifier(random_state=42)
grid_search = GridSearchCV(
    rf, param_grid, cv=5, scoring="roc_auc", n_jobs=-1, verbose=2
)

# Tune on a stratified 10% subsample of the training split to keep the
# 81-setting x 5-fold search tractable.
X_small, _, y_small, _ = train_test_split(
    X_train, y_train, test_size=0.9, stratify=y_train, random_state=42
)
grid_search.fit(X_small, y_small)


In [None]:
# hp tuning randomforest

# Outcome of the random-forest grid search.
print("Best parameters found:", grid_search.best_params_)
print("Best cross-validation score(reduced dataset):", grid_search.best_score_)

# Refit the winning configuration on the SMOTE-balanced training split.
best_model = grid_search.best_estimator_
best_model.fit(X_train_resampled, y_train_resampled)

# Validation predictions: hard labels and positive-class probabilities.
y_pred = best_model.predict(X_val)
y_prob = best_model.predict_proba(X_val)[:, 1]

# Validation metrics.
print("Random Forest after hp tuning...")
print(classification_report(y_val, y_pred))
validation_auc = roc_auc_score(y_val, y_prob)
print(f"AUC-ROC Score: {validation_auc:.4f}")

# Test-set inference on every column except the row id.
feature_columns = [col for col in merged_test_data.columns if col != "TransactionID"]
X_test = merged_test_data[feature_columns]
test_predictions = best_model.predict(X_test)
test_prob = best_model.predict_proba(X_test)[:, 1]

# Submission: one fraud probability per TransactionID.
submission_rfc = pd.DataFrame(
    {"TransactionID": merged_test_data["TransactionID"], "isFraud": test_prob}
)
submission_rfc.to_csv("submission_rfc_2.csv", index=False)

print("Predictions saved to submission_rfc_2.csv")


In [None]:
# hp tuning for xgboost classifier
from skopt import BayesSearchCV
from xgboost import XGBClassifier

# Search space for the Bayesian optimisation; each tuple is a (low, high) range.
param_space = {
    "learning_rate": (0.01, 0.3),
    "max_depth": (3, 10),
    "min_child_weight": (1, 20),
    "subsample": (0.5, 1.0),
    "colsample_bytree": (0.5, 1.0),
    "n_estimators": (100, 1000),
}

# Seed both the model (row/column subsampling) and the search itself so the
# selected hyper-parameters are reproducible, matching the random_state=42
# used by the other stochastic steps of this notebook.
xgb_model = XGBClassifier(random_state=42)
bayes_search = BayesSearchCV(
    xgb_model,
    param_space,
    n_iter=50,
    cv=5,
    scoring="roc_auc",
    n_jobs=-1,
    verbose=2,
    random_state=42,
)

# Tune on a stratified 10% subsample of the training split.
X_small, _, y_small, _ = train_test_split(
    X_train, y_train, test_size=0.9, stratify=y_train, random_state=42
)
bayes_search.fit(X_small, y_small)


In [None]:
# hp tuning for xgboost classifier
from sklearn.metrics import roc_auc_score, classification_report, confusion_matrix

# Outcome of the Bayesian search (scores were computed on the 10% subsample).
print("Best parameters found:", bayes_search.best_params_)
print("Best score achieved:(reduced data)", bayes_search.best_score_)

# Refit the winning configuration on the full (non-resampled) training split.
best_model = bayes_search.best_estimator_
best_model.fit(X_train, y_train)

# Step 4: Make predictions on the validation set
y_pred = best_model.predict(X_val)
y_prob = best_model.predict_proba(X_val)[:, 1]

# Step 5: Evaluate the model
print("XGBoost")
print(classification_report(y_val, y_pred))
print(f"AUC-ROC Score: {roc_auc_score(y_val, y_prob):.4f}")

# Step 6: Run predictions on the test set (all columns except the row id)
X_test = merged_test_data.loc[:, merged_test_data.columns != "TransactionID"]
test_predictions = best_model.predict(X_test)
test_prob = best_model.predict_proba(X_test)[:, 1]

# Step 7: Prepare submission DataFrame (fraud probability per TransactionID)
submission_xgb = pd.DataFrame(
    {
        "TransactionID": merged_test_data["TransactionID"],
        "isFraud": test_prob,
    }
)

# Step 8: Save predictions to a CSV file. The file name is kept in a single
# variable so the confirmation message cannot drift from the file actually
# written (it previously announced "submission_xgb_2.csv").
submission_xgb_file = "submission_xgb_ft_2.csv"
submission_xgb.to_csv(submission_xgb_file, index=False)

print(f"Predictions saved to {submission_xgb_file}")


In [None]:
import logging
import optuna
import os
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

# Create timestamped folder
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_folder = f"bayesian_optimization_results_{timestamp}"
os.makedirs(output_folder, exist_ok=True)

# Set up logging.
# force=True (Python 3.8+) replaces any handlers already attached to the root
# logger. Without it basicConfig() is a silent no-op whenever logging has been
# configured before (e.g. this cell is re-run), and records would keep going
# to the previous run's log file instead of the folder created above.
log_file = os.path.join(output_folder, "log.txt")
logging.basicConfig(
    filename=log_file,
    filemode="a",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    force=True,
)
logger = logging.getLogger()


# Define the Optuna objective function
def objective(trial):
    """Optuna objective: mean 5-fold AUC-ROC of a logistic regression.

    Samples solver, penalty, C, tol and max_iter, prunes solver/penalty pairs
    rejected by the checks below, cross-validates on the SMOTE-resampled
    training data, stores the fold scores as trial user attributes and writes
    a submission file from the model fitted on the last fold.

    Returns the mean AUC-ROC across folds; raises `optuna.exceptions.TrialPruned`
    for rejected combinations or when fitting/scoring a fold fails.
    """
    # Hyper-parameter suggestions (call order kept as-is).
    solver = trial.suggest_categorical("solver", ["newton-cholesky", "sag", "saga"])
    penalty = trial.suggest_categorical("penalty", ["l2", "none", "l1"])
    C = trial.suggest_loguniform("C", 1e-3, 1e3)
    tol = trial.suggest_loguniform("tol", 1e-5, 1e-1)
    max_iter = trial.suggest_int("max_iter", 100, 1000)
    l1_ratio = None  # ElasticNet compatibility not handled here

    # Solver/penalty pairs this search refuses to evaluate.
    newton_rejects = solver == "newton-cholesky" and penalty not in ["l2", "none"]
    sag_rejects = solver == "sag" and penalty != "l2"
    if newton_rejects or sag_rejects:
        logger.warning(
            f"Trial {trial.number}: Invalid combination (solver={solver}, penalty={penalty}). Pruning trial."
        )
        raise optuna.exceptions.TrialPruned()

    logger.info(f"Trial {trial.number}: Starting with parameters: {trial.params}")

    # Stratified 5-fold cross-validation on the resampled training data.
    splitter = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_aucs = []

    for fold_number, (train_idx, val_idx) in enumerate(
        splitter.split(X_train_resampled, y_train_resampled), start=1
    ):
        X_train_fold = X_train_resampled.iloc[train_idx]
        X_val_fold = X_train_resampled.iloc[val_idx]
        y_train_fold = y_train_resampled.iloc[train_idx]
        y_val_fold = y_train_resampled.iloc[val_idx]

        try:
            logger.info(f"Fold {fold_number}: Training Logistic Regression...")
            model = LogisticRegression(
                solver=solver,
                penalty=penalty,
                C=C,
                tol=tol,
                max_iter=max_iter,
                l1_ratio=l1_ratio,
                random_state=42,
            )
            model.fit(X_train_fold, y_train_fold)
            fold_prob = model.predict_proba(X_val_fold)[:, 1]
            fold_auc = roc_auc_score(y_val_fold, fold_prob)
            fold_aucs.append(fold_auc)
            logger.info(f"Fold {fold_number}: AUC-ROC = {fold_auc:.4f}")
        except Exception as e:
            # Any failure while fitting or scoring a fold prunes the trial.
            logger.error(f"Error during training: {e}. Pruning trial.")
            raise optuna.exceptions.TrialPruned()

    mean_auc = np.mean(fold_aucs)

    # Keep the per-fold scores on the trial for later inspection.
    trial.set_user_attr("fold_aucs", fold_aucs)
    trial.set_user_attr("mean_auc", mean_auc)
    trial.set_user_attr("params", trial.params)

    # Score the test set with the model fitted on the last fold and save a
    # per-trial submission built from the template.
    submission = submission_template.copy()
    submission["isFraud"] = model.predict_proba(X_test)[:, 1]
    submission_file = os.path.join(
        output_folder, f"submission_trial_{trial.number}.csv"
    )
    submission.to_csv(submission_file, index=False)
    logger.info(f"Trial {trial.number}: Submission saved to {submission_file}")

    logger.info(f"Trial {trial.number}: Mean AUC-ROC across folds = {mean_auc:.4f}")

    return mean_auc


import json

# Run the optimization
logger.info("Starting Bayesian optimization with Optuna...")
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

# Collect one record per trial. Pruned trials never reach the set_user_attr
# calls in `objective`, so their scores are read with .get(); indexing
# user_attrs["mean_auc"] directly raised KeyError on the first pruned trial.
result_columns = ["trial_number", "state", "params", "mean_auc", "fold_aucs"]
all_trials = []
for trial in study.trials:
    all_trials.append(
        {
            "trial_number": trial.number,
            "state": trial.state.name,
            "params": trial.params,
            "mean_auc": trial.user_attrs.get("mean_auc"),
            "fold_aucs": trial.user_attrs.get("fold_aucs"),
        }
    )

# Passing the columns explicitly keeps the frame well-formed even when the
# study produced no trials.
results_df = pd.DataFrame(all_trials, columns=result_columns)
results_file = os.path.join(output_folder, "all_trials_results.csv")
results_df.to_csv(results_file, index=False)
logger.info(f"All trial results saved to {results_file}")

# Print and save top 3 models (only trials that actually produced a score)
top_3 = (
    results_df.dropna(subset=["mean_auc"])
    .sort_values("mean_auc", ascending=False)
    .head(3)
)
logger.info("\nTop 3 Models:\n" + top_3.to_string())

top_3_file = os.path.join(output_folder, "top_3_models.csv")
top_3.to_csv(top_3_file, index=False)
logger.info(f"Top 3 models saved to {top_3_file}")

# Save each trial's parameters and fold scores as real JSON (the files are
# named *.json but previously contained a Python repr).
for trial in study.trials:
    trial_details = {
        "params": trial.params,
        "fold_aucs": trial.user_attrs.get("fold_aucs"),
        "mean_auc": trial.user_attrs.get("mean_auc"),
    }
    trial_file = os.path.join(output_folder, f"trial_{trial.number}.json")
    with open(trial_file, "w") as f:
        # default=str covers any value the json module cannot encode natively.
        json.dump(trial_details, f, indent=2, default=str)
    logger.info(f"Details for Trial {trial.number} saved to {trial_file}")

# Save selected parameters and AUC scores for review
selected_params_file = os.path.join(output_folder, "selected_parameters.txt")
with open(selected_params_file, "w") as f:
    for trial in study.trials:
        f.write(
            f"Trial {trial.number}: Params: {trial.params}, "
            f"Mean AUC: {trial.user_attrs.get('mean_auc')}, "
            f"Fold AUCs: {trial.user_attrs.get('fold_aucs')}\n"
        )
logger.info(f"Selected parameters and AUC scores saved to {selected_params_file}")

logger.info(f"Optimization complete. Results saved in folder: {output_folder}")


In [None]:
import logging
import optuna
import os
import pandas as pd
import numpy as np
from datetime import datetime
from catboost import CatBoostClassifier
from sklearn.metrics import roc_auc_score, classification_report
from sklearn.model_selection import StratifiedKFold

# Create timestamped folder for saving results
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
output_folder = f"catboost_bayesian_results_{timestamp}"
os.makedirs(output_folder, exist_ok=True)

# Set up logging.
# The root logger has already been configured by an earlier
# logging.basicConfig() call in this notebook, and basicConfig() does nothing
# when handlers exist. force=True (Python 3.8+) removes the old handlers so
# this run's records land in the log file inside the folder created above.
log_file = os.path.join(output_folder, "log.txt")
logging.basicConfig(
    filename=log_file,
    filemode="a",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    force=True,
)
logger = logging.getLogger()


# Define the Optuna objective function
def objective(trial):
    """Evaluate one CatBoost hyperparameter configuration for Optuna.

    The configuration is scored with 5-fold stratified cross-validation on
    the notebook-level ``X_train_resampled`` / ``y_train_resampled``. As a
    side effect, test-set probabilities for ``X_test`` are written to
    ``submission_trial_<number>.csv`` inside ``output_folder`` and progress
    is reported through the notebook-level ``logger``.

    Args:
        trial: Optuna trial used to sample the hyperparameters and to store
            ``fold_aucs``, ``mean_auc`` and ``params`` as user attributes.

    Returns:
        The mean validation AUC-ROC across the five folds.
    """
    # Suggest hyperparameters for CatBoost.
    # suggest_float replaces the deprecated suggest_uniform/suggest_loguniform
    # helpers; log=True samples on a log scale, as suggest_loguniform did.
    params = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "iterations": trial.suggest_int("iterations", 100, 1000),
        "depth": trial.suggest_int("depth", 4, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1, 10, log=True),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0, 2),
        "random_strength": trial.suggest_float("random_strength", 0, 10),
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1, 10),
        "eval_metric": "AUC",
        "random_seed": 42,
        "verbose": 0,
    }
    logger.info(f"Trial {trial.number}: Starting with parameters: {params}")

    # Stratified K-Fold Cross-Validation
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_aucs = []

    for fold, (train_idx, val_idx) in enumerate(
        kf.split(X_train_resampled, y_train_resampled), 1
    ):
        logger.info(f"  Fold {fold}: Training CatBoost...")
        X_train_fold, X_val_fold = (
            X_train_resampled.iloc[train_idx],
            X_train_resampled.iloc[val_idx],
        )
        y_train_fold, y_val_fold = (
            y_train_resampled.iloc[train_idx],
            y_train_resampled.iloc[val_idx],
        )

        # Train a fresh CatBoost classifier for this fold; the validation
        # split doubles as the early-stopping evaluation set.
        model = CatBoostClassifier(**params)
        model.fit(
            X_train_fold,
            y_train_fold,
            eval_set=(X_val_fold, y_val_fold),
            early_stopping_rounds=50,
        )

        # Predict probabilities for validation set
        y_prob = model.predict_proba(X_val_fold)[:, 1]
        fold_auc = roc_auc_score(y_val_fold, y_prob)
        fold_aucs.append(fold_auc)
        logger.info(f"  Fold {fold}: AUC-ROC = {fold_auc:.4f}")

    # Calculate mean AUC-ROC across folds
    mean_auc = np.mean(fold_aucs)
    logger.info(f"Trial {trial.number}: Mean AUC-ROC across folds = {mean_auc:.4f}")

    # Save predictions on test set for this trial.
    # NOTE: `model` is the classifier fitted in the last fold above, so these
    # predictions come from a model trained on that fold's training split only.
    test_probabilities = model.predict_proba(X_test)
    submission = submission_template.copy()
    submission["isFraud"] = test_probabilities[:, 1]
    submission_file = os.path.join(
        output_folder, f"submission_trial_{trial.number}.csv"
    )
    submission.to_csv(submission_file, index=False)
    logger.info(f"Trial {trial.number}: Submission saved to {submission_file}")

    # Store results so they can be read back from study.trials afterwards
    trial.set_user_attr("fold_aucs", fold_aucs)
    trial.set_user_attr("mean_auc", mean_auc)
    trial.set_user_attr("params", params)

    return mean_auc


# Run Bayesian Optimization
# The CV splitter and CatBoost are seeded with 42, but create_study() uses an
# unseeded sampler by default, so the sequence of sampled hyperparameters would
# differ between runs. Seeding the TPE sampler makes the search repeatable.
logger.info("Starting Bayesian optimization with Optuna...")
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=20)

# Gather one summary record per trial and persist the full results table.
all_trials = [
    {
        "trial_number": past_trial.number,
        "params": past_trial.params,
        "mean_auc": past_trial.user_attrs["mean_auc"],
        "fold_aucs": past_trial.user_attrs["fold_aucs"],
    }
    for past_trial in study.trials
]

results_file = os.path.join(output_folder, "all_trials_results.csv")
results_df = pd.DataFrame(all_trials)
results_df.to_csv(results_file, index=False)
logger.info(f"All trial results saved to {results_file}")

# Log and save the three trials with the highest mean AUC.
ranked_results = results_df.sort_values(by="mean_auc", ascending=False)
top_3 = ranked_results.iloc[:3]
top_3_table = top_3.to_string()
logger.info("\nTop 3 Models:\n" + top_3_table)

top_3_file = os.path.join(output_folder, "top_3_models.csv")
top_3.to_csv(top_3_file, index=False)
logger.info(f"Top 3 models saved to {top_3_file}")

# Write a plain-text line per trial: sampled parameters, mean AUC, fold AUCs.
selected_params_file = os.path.join(output_folder, "selected_parameters.txt")
with open(selected_params_file, "w") as summary_out:
    for trial in study.trials:
        summary_line = (
            f"Trial {trial.number}: Params: {trial.params}, Mean AUC: {trial.user_attrs['mean_auc']}, Fold AUCs: {trial.user_attrs['fold_aucs']}\n"
        )
        summary_out.write(summary_line)
logger.info(f"Selected parameters and AUC scores saved to {selected_params_file}")

# Final marker so the log shows where this run's artifacts live.
logger.info(f"Optimization complete. Results saved in folder: {output_folder}")


In [None]:
import optuna
import os
import pandas as pd
import numpy as np
from datetime import datetime
from lightgbm import LGBMClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Every run writes into its own folder, named after the moment the cell started.
run_started_at = datetime.now()
timestamp = run_started_at.strftime("%Y-%m-%d_%H-%M-%S")
output_folder = f"lgbm_bayesian_results_{timestamp}"
os.makedirs(output_folder, exist_ok=True)


# Define the Optuna objective function
def objective(trial):
    """Evaluate one LightGBM hyperparameter configuration for Optuna.

    The configuration is scored with 5-fold stratified cross-validation on
    the notebook-level ``X_train_resampled`` / ``y_train_resampled``. As a
    side effect, test-set probabilities for ``X_test`` are written to
    ``submission_trial_<number>.csv`` inside ``output_folder`` and progress
    is printed to the cell output.

    Args:
        trial: Optuna trial used to sample the hyperparameters and to store
            ``fold_aucs``, ``mean_auc`` and ``params`` as user attributes.

    Returns:
        The mean validation AUC-ROC across the five folds.
    """
    # Define the hyperparameter search space for LightGBM.
    # suggest_float replaces the deprecated suggest_uniform/suggest_loguniform
    # helpers; log=True samples on a log scale, as suggest_loguniform did.
    params = {
        # Reduced range for faster convergence
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        # Lower maximum iterations
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        # LightGBM treats max_depth <= 0 as "no limit", so -1 and 0 both
        # sample unconstrained trees; 1..10 cap the depth.
        "max_depth": trial.suggest_int("max_depth", -1, 10),
        # Fewer leaves to reduce complexity
        "num_leaves": trial.suggest_int("num_leaves", 20, 50),
        # Moderate range for child samples
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 50),
        # Keep subsampling high for better generalization
        "subsample": trial.suggest_float("subsample", 0.7, 1.0),
        # Row subsampling is only applied when subsample_freq > 0; with the
        # default of 0 the tuned `subsample` value would have no effect.
        "subsample_freq": 1,
        # Similar to subsample
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.7, 1.0),
        # Reduced range for regularization
        "reg_lambda": trial.suggest_float("reg_lambda", 0.1, 5.0, log=True),
        # Reduced range for regularization
        "reg_alpha": trial.suggest_float("reg_alpha", 0.1, 5.0, log=True),
        # Smaller range for class imbalance scaling
        "scale_pos_weight": trial.suggest_float("scale_pos_weight", 1, 5),
    }

    print(f"\nTrial {trial.number}: Starting with parameters: {params}")

    # Initialize LightGBM model (the same estimator object is refit per fold)
    lgbm_model = LGBMClassifier(**params, random_state=42)

    # Stratified K-Fold Cross-Validation
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    fold_aucs = []

    for fold, (train_idx, val_idx) in enumerate(
        kf.split(X_train_resampled, y_train_resampled), 1
    ):
        print(f"  Fold {fold}: Training LGBMClassifier...")
        X_train_fold, X_val_fold = (
            X_train_resampled.iloc[train_idx],
            X_train_resampled.iloc[val_idx],
        )
        y_train_fold, y_val_fold = (
            y_train_resampled.iloc[train_idx],
            y_train_resampled.iloc[val_idx],
        )

        # Train LightGBM model
        lgbm_model.fit(
            X_train_fold,
            y_train_fold,
            eval_set=[(X_val_fold, y_val_fold)],
            eval_metric="auc",
        )

        # Predict probabilities for validation set
        y_prob = lgbm_model.predict_proba(X_val_fold)[:, 1]
        fold_auc = roc_auc_score(y_val_fold, y_prob)
        fold_aucs.append(fold_auc)
        print(f"  Fold {fold}: AUC-ROC = {fold_auc:.4f}")

    # Calculate mean AUC-ROC across folds
    mean_auc = np.mean(fold_aucs)
    print(f"Trial {trial.number}: Mean AUC-ROC across folds = {mean_auc:.4f}")

    # Save predictions on test set for this trial.
    # NOTE: at this point `lgbm_model` holds the fit from the last fold, so
    # these predictions come from that fold's training split only.
    test_probabilities = lgbm_model.predict_proba(X_test)[:, 1]
    submission = submission_template.copy()
    submission["isFraud"] = test_probabilities
    submission_file = os.path.join(
        output_folder, f"submission_trial_{trial.number}.csv"
    )
    submission.to_csv(submission_file, index=False)
    print(f"Trial {trial.number}: Submission saved to {submission_file}")

    # Store results so they can be read back from study.trials afterwards
    trial.set_user_attr("fold_aucs", fold_aucs)
    trial.set_user_attr("mean_auc", mean_auc)
    trial.set_user_attr("params", params)

    return mean_auc


# Run Bayesian Optimization
# The CV splitter and LightGBM are seeded with 42, but create_study() uses an
# unseeded sampler by default, so the sequence of sampled hyperparameters would
# differ between runs. Seeding the TPE sampler makes the search repeatable.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)  # Adjust the number of trials as needed

# Gather one summary record per trial and persist the full results table.
all_trials = [
    {
        "trial_number": past_trial.number,
        "params": past_trial.user_attrs["params"],
        "mean_auc": past_trial.user_attrs["mean_auc"],
        "fold_aucs": past_trial.user_attrs["fold_aucs"],
    }
    for past_trial in study.trials
]

results_file = os.path.join(output_folder, "all_trials_results.csv")
results_df = pd.DataFrame(all_trials)
results_df.to_csv(results_file, index=False)

# Show and save the three trials with the highest mean AUC.
ranked_results = results_df.sort_values(by="mean_auc", ascending=False)
top_3 = ranked_results.iloc[:3]
print("\nTop 3 Models:")
print(top_3)

top_3_file = os.path.join(output_folder, "top_3_models.csv")
top_3.to_csv(top_3_file, index=False)

# Write a plain-text line per trial: stored parameters, mean AUC, fold AUCs.
selected_params_file = os.path.join(output_folder, "selected_parameters.txt")
with open(selected_params_file, "w") as summary_out:
    for trial in study.trials:
        summary_line = (
            f"Trial {trial.number}: Params: {trial.user_attrs['params']}, Mean AUC: {trial.user_attrs['mean_auc']}, Fold AUCs: {trial.user_attrs['fold_aucs']}\n"
        )
        summary_out.write(summary_line)

# Tell the reader where this run's artifacts were written.
print(f"\nOptimization complete. Results saved in folder: {output_folder}")
