In [58]:
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import OneHotEncoder

In [59]:
# Read both raw CSV files from the local data directory.
fraud_data = pd.read_csv('D:/Week8$9/data/Fraud_Data.csv')
credit_card_data = pd.read_csv('D:/Week8$9/data/creditcard.csv')

# Fraud data: the label lives in the lowercase 'class' column.
y_fraud = fraud_data['class']
X_fraud = fraud_data.drop('class', axis=1)

# Credit-card data: the label lives in the capitalised 'Class' column.
y_credit_card = credit_card_data['Class']
X_credit_card = credit_card_data.drop('Class', axis=1)

# Stratified 80/20 split of the fraud data (fixed seed for repeatability).
(X_fraud_train, X_fraud_test,
 y_fraud_train, y_fraud_test) = train_test_split(
    X_fraud,
    y_fraud,
    stratify=y_fraud,
    test_size=0.2,
    random_state=42,
)

# Same stratified 80/20 split for the credit-card data.
(X_credit_card_train, X_credit_card_test,
 y_credit_card_train, y_credit_card_test) = train_test_split(
    X_credit_card,
    y_credit_card,
    stratify=y_credit_card,
    test_size=0.2,
    random_state=42,
)

In [60]:
def train_and_evaluate_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training split and print its test-set metrics.

    Parameters
    ----------
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place by this call.
    X_train, y_train
        Training features and labels, passed straight to ``model.fit``.
    X_test, y_test
        Held-out features (passed to ``model.predict``) and the true labels
        the predictions are scored against.

    Returns
    -------
    None
        The confusion matrix and the classification report are printed,
        not returned.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")
    # zero_division=0 keeps this report consistent with the other
    # classification_report calls in this notebook and avoids a warning when
    # a class is never predicted.
    report = classification_report(y_test, y_pred, zero_division=0)
    print(f"\nClassification Report:\n{report}")

In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Read the raw fraud data and parse the purchase timestamp.
# NOTE(review): only 'purchase_time' is parsed as a datetime; any other
# timestamp-like column would stay object dtype and be one-hot encoded
# below -- confirm this is intended.
fraud_data = pd.read_csv('D:/Week8$9/data/Fraud_Data.csv')
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])

# The label is the 'class' column; every other column is a feature.
y_fraud = fraud_data['class']
X_fraud = fraud_data.drop('class', axis=1)

# Turn each datetime feature into integer seconds since the epoch.
datetime_cols = X_fraud.select_dtypes(include=['datetime']).columns
for col in datetime_cols:
    X_fraud[col] = X_fraud[col].astype('int64') // 10**9

# One-hot encode the object-dtype columns. The encoder output stays sparse
# and max_categories=10 caps the number of output categories per feature.
# NOTE(review): the encoder is fitted on every row, before the train/test
# split further down.
categorical_cols = list(X_fraud.select_dtypes(include=['object']).columns)
if categorical_cols:
    encoder = OneHotEncoder(max_categories=10, sparse_output=True)
    X_encoded = encoder.fit_transform(X_fraud[categorical_cols])

    # Put the non-encoded columns (index reset) side by side with the
    # encoded block.
    X_fraud_numeric = X_fraud.drop(columns=categorical_cols).reset_index(drop=True)
    X_fraud = pd.concat(
        [X_fraud_numeric, pd.DataFrame.sparse.from_spmatrix(X_encoded)],
        axis=1,
    )

# The encoded block contributes non-string labels; make every column name a str.
X_fraud.columns = X_fraud.columns.astype(str)

# Stratified 80/20 split with a fixed seed.
(X_fraud_train, X_fraud_test,
 y_fraud_train, y_fraud_test) = train_test_split(
    X_fraud,
    y_fraud,
    stratify=y_fraud,
    test_size=0.2,
    random_state=42,
)

# Class-weighted logistic regression fitted on the training split.
logistic_model = LogisticRegression(class_weight='balanced', max_iter=1000)
logistic_model.fit(X_fraud_train, y_fraud_train)

# Score the held-out split; zero_division=0 reports 0 for undefined metrics.
y_pred = logistic_model.predict(X_fraud_test)
print(classification_report(y_fraud_test, y_pred, zero_division=0))



              precision    recall  f1-score   support

           0       0.91      0.60      0.72     27393
           1       0.10      0.42      0.16      2830

    accuracy                           0.58     30223
   macro avg       0.50      0.51      0.44     30223
weighted avg       0.83      0.58      0.67     30223





In [63]:
from imblearn.over_sampling import SMOTE
from sklearn.pipeline import make_pipeline

# Resample the training split with SMOTE, then fit a logistic regression on
# the resampled data. No Pipeline object is built here: resampling and
# fitting are two explicit steps.
smote = SMOTE(random_state=42)
X_fraud_train_resampled, y_fraud_train_resampled = smote.fit_resample(
    X_fraud_train,
    y_fraud_train,
)

# NOTE(review): class_weight='balanced' is applied on top of the
# SMOTE-resampled labels -- presumably redundant once the classes are
# balanced; confirm.
logistic_model = LogisticRegression(class_weight='balanced', max_iter=1000)
logistic_model.fit(X_fraud_train_resampled, y_fraud_train_resampled)

# Evaluate on the original (non-resampled) test split.
y_pred = logistic_model.predict(X_fraud_test)
print(classification_report(y_fraud_test, y_pred, zero_division=0))



              precision    recall  f1-score   support

           0       0.91      0.57      0.70     27393
           1       0.10      0.44      0.16      2830

    accuracy                           0.56     30223
   macro avg       0.50      0.50      0.43     30223
weighted avg       0.83      0.56      0.65     30223





In [64]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE

# Read the raw fraud data and parse the purchase timestamp.
fraud_data = pd.read_csv('D:/Week8$9/data/Fraud_Data.csv')
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'])

# The label is the 'class' column; every other column is a feature.
y_fraud = fraud_data['class']
X_fraud = fraud_data.drop('class', axis=1)

# Turn each datetime feature into integer seconds since the epoch.
datetime_cols = X_fraud.select_dtypes(include=['datetime']).columns
for col in datetime_cols:
    X_fraud[col] = X_fraud[col].astype('int64') // 10**9

# One-hot encode the object-dtype columns, keeping the encoder output sparse.
categorical_cols = list(X_fraud.select_dtypes(include=['object']).columns)
if categorical_cols:
    encoder = OneHotEncoder(max_categories=10, sparse_output=True)
    X_encoded = encoder.fit_transform(X_fraud[categorical_cols])

    # Put the non-encoded columns (index reset) side by side with the
    # encoded block.
    X_fraud_numeric = X_fraud.drop(columns=categorical_cols).reset_index(drop=True)
    X_fraud = pd.concat(
        [X_fraud_numeric, pd.DataFrame.sparse.from_spmatrix(X_encoded)],
        axis=1,
    )

# Make every column name a str.
X_fraud.columns = X_fraud.columns.astype(str)

# Stratified 80/20 split with a fixed seed.
(X_fraud_train, X_fraud_test,
 y_fraud_train, y_fraud_test) = train_test_split(
    X_fraud,
    y_fraud,
    stratify=y_fraud,
    test_size=0.2,
    random_state=42,
)

# Resample only the training split with SMOTE to address class imbalance.
smote = SMOTE(random_state=42)
X_fraud_train_resampled, y_fraud_train_resampled = smote.fit_resample(
    X_fraud_train,
    y_fraud_train,
)

# Class-weighted logistic regression fitted on the resampled training data.
logistic_model = LogisticRegression(class_weight='balanced', max_iter=1000)
logistic_model.fit(X_fraud_train_resampled, y_fraud_train_resampled)

# Evaluate on the original (non-resampled) test split.
y_pred = logistic_model.predict(X_fraud_test)

from sklearn.metrics import classification_report
print(classification_report(y_fraud_test, y_pred, zero_division=0))



              precision    recall  f1-score   support

           0       0.91      0.57      0.70     27393
           1       0.10      0.44      0.16      2830

    accuracy                           0.56     30223
   macro avg       0.50      0.50      0.43     30223
weighted avg       0.83      0.56      0.65     30223





In [65]:
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Relies on X_fraud / y_fraud already being defined by an earlier cell.
# Re-create the stratified 80/20 split with the same fixed seed.
(X_fraud_train, X_fraud_test,
 y_fraud_train, y_fraud_test) = train_test_split(
    X_fraud,
    y_fraud,
    stratify=y_fraud,
    test_size=0.2,
    random_state=42,
)

# Resample only the training split with SMOTE to address class imbalance.
smote = SMOTE(random_state=42)
X_fraud_train_resampled, y_fraud_train_resampled = smote.fit_resample(
    X_fraud_train,
    y_fraud_train,
)

# Class-weighted random forest (seeded) fitted on the resampled training data.
rf_model = RandomForestClassifier(random_state=42, class_weight='balanced')
rf_model.fit(X_fraud_train_resampled, y_fraud_train_resampled)

# Evaluate on the original (non-resampled) test split.
y_pred = rf_model.predict(X_fraud_test)
print(classification_report(y_fraud_test, y_pred, zero_division=0))



              precision    recall  f1-score   support

           0       0.95      1.00      0.97     27393
           1       0.94      0.53      0.68      2830

    accuracy                           0.95     30223
   macro avg       0.95      0.76      0.82     30223
weighted avg       0.95      0.95      0.95     30223

