In [None]:
import pandas as pd

def loadData(transactionPath, identityPath):
    """Read the transaction and identity CSVs and join them into one frame.

    Args:
        transactionPath: Path (or file-like object) of the transaction CSV.
        identityPath: Path (or file-like object) of the identity CSV.

    Returns:
        A DataFrame holding every transaction row, with identity columns
        attached where a matching ``TransactionID`` exists (left join);
        transactions without an identity row get NaN in those columns.
    """
    # Read in the same order as the arguments: transactions first, then identities.
    transactions, identities = (
        pd.read_csv(source) for source in (transactionPath, identityPath)
    )
    return transactions.merge(identities, how='left', on='TransactionID')


In [2]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer

def preprocessData(data, excludeColumns=()):
    """Impute, label-encode and standardise the columns of ``data`` in place.

    Steps, in order:
      1. Numeric columns: missing values are replaced by the column mean.
         Columns with no observed value at all are filled with 0.0.
      2. Object columns: cast to ``str`` and label-encoded to integer codes
         (so a missing value becomes its own category via its string form).
      3. All numeric columns (which now include the encoded ones) are
         standardised with ``StandardScaler``.

    Args:
        data: DataFrame to preprocess. It is modified in place and also
            returned.
        excludeColumns: Optional iterable of column names to leave completely
            untouched (e.g. a target label or an ID column, which should not
            be mean-imputed or standardised). Defaults to no exclusions.

    Returns:
        The same DataFrame object, after preprocessing.
    """
    excluded = set(excludeColumns)

    ### impute missing values
    numericColumns = [
        column for column in data.select_dtypes(include=['number']).columns
        if column not in excluded
    ]
    # SimpleImputer(strategy='mean') drops columns that contain no observed
    # value, so its output would have fewer columns than the selection it is
    # assigned back to. Impute only columns with data and zero-fill the rest.
    observedColumns = [column for column in numericColumns if data[column].notna().any()]
    emptyColumns = [column for column in numericColumns if not data[column].notna().any()]
    if observedColumns:
        imputer = SimpleImputer(strategy='mean')
        data[observedColumns] = imputer.fit_transform(data[observedColumns])
    if emptyColumns:
        data[emptyColumns] = 0.0

    ## encode categorical variables
    encoder = LabelEncoder()
    for column in data.select_dtypes(include=['object']).columns:
        if column in excluded:
            continue
        data[column] = encoder.fit_transform(data[column].astype(str))

    ## scale numerical features
    # Re-select: the label-encoded columns are numeric now and get scaled too.
    scaleColumns = [
        column for column in data.select_dtypes(include=['number']).columns
        if column not in excluded
    ]
    if scaleColumns:
        scaler = StandardScaler()
        data[scaleColumns] = scaler.fit_transform(data[scaleColumns])
    return data


In [3]:
from sklearn.decomposition import PCA

def featureEngineering(data):
    """Project ``data`` onto the principal components that explain 95% of its variance.

    Args:
        data: Numeric feature matrix (array-like or DataFrame) passed straight
            to ``PCA``; presumably already imputed and scaled -- the function
            itself does no cleaning.

    Returns:
        The transformed feature array produced by ``PCA.fit_transform``.
    """
    pca = PCA(n_components=0.95)  # reduce dimensions to retain 95% variance
    # scikit-learn's method is ``fit_transform``; ``fitTransform`` does not
    # exist and raised AttributeError.
    features = pca.fit_transform(data)
    return features

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

def train_models(X, y):
    """Hold out 20% of the data and fit a random forest on the remainder.

    Args:
        X: Feature matrix.
        y: Target labels aligned with ``X``.

    Returns:
        A 5-tuple ``(model, xTrain, xVal, yTrain, yVal)``: the fitted
        ``RandomForestClassifier`` followed by the train/validation splits of
        the features and of the labels.
    """
    # Fixed seed so the same rows land in the validation set on every run.
    trainFeatures, valFeatures, trainLabels, valLabels = train_test_split(
        X, y, random_state=42, test_size=0.2
    )
    forest = RandomForestClassifier(n_jobs=-1, random_state=42, n_estimators=50)
    forest.fit(trainFeatures, trainLabels)
    return forest, trainFeatures, valFeatures, trainLabels, valLabels


In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

def evaluate_model(model, XVal, yVal):
    """Score a fitted binary classifier on a validation set.

    Args:
        model: Fitted estimator exposing ``predict`` and, optionally,
            ``predict_proba``.
        XVal: Validation features.
        yVal: True validation labels.

    Returns:
        Dict with the keys ``"Accuracy"``, ``"Precision"``, ``"Recall"``,
        ``"F1 Score"`` and ``"ROC-AUC"``. ``"ROC-AUC"`` is ``None`` when the
        model cannot produce class probabilities.
    """
    yPred = model.predict(XVal)
    # The scikit-learn method is ``predict_proba``. The previous camelCase
    # spelling never matched, so ROC-AUC was silently always None.
    yProb = model.predict_proba(XVal)[:, 1] if hasattr(model, 'predict_proba') else None
    metrics = {
        "Accuracy": accuracy_score(yVal, yPred),
        # zero_division=0 on all three: report 0 quietly when a denominator is
        # empty (e.g. the model never predicts the positive class).
        "Precision": precision_score(yVal, yPred, zero_division=0),
        "Recall": recall_score(yVal, yPred, zero_division=0),
        "F1 Score": f1_score(yVal, yPred, zero_division=0),
        "ROC-AUC": roc_auc_score(yVal, yProb) if yProb is not None else None
    }
    return metrics


In [6]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.decomposition import PCA

# Function to load, sample, encode, and impute the data
def loadData(transactionPath, identityPath, sampleFrac=0.1, randomState=42):
    """Load, join, subsample, label-encode and mean-impute the fraud data.

    Note: this redefines the ``loadData`` declared in an earlier cell; unlike
    that version it also samples, encodes and imputes.

    Args:
        transactionPath: Path of the transaction CSV.
        identityPath: Path of the identity CSV.
        sampleFrac: Fraction of the merged rows to keep (default 0.1, to
            reduce data size for faster processing). Pass ``None`` to keep
            every row.
        randomState: Seed used for the row sampling (default 42).

    Returns:
        DataFrame in which object columns are label-encoded and every column
        has had missing values replaced by its mean. Columns with no observed
        value at all are filled with 0.0.
    """
    # Read each CSV exactly once; the merge previously re-read the
    # transaction file and left ``trainTransaction`` unused.
    trainTransaction = pd.read_csv(transactionPath)
    trainIdentity = pd.read_csv(identityPath)
    data = pd.merge(trainTransaction, trainIdentity, on='TransactionID', how='left')
    if sampleFrac is not None:
        data = data.sample(frac=sampleFrac, random_state=randomState)

    imputer = SimpleImputer(strategy='mean')
    encoder = LabelEncoder()

    for column in data.columns:
        if data[column].dtype == 'object':
            # astype(str) turns missing values into a string category, so the
            # encoded column has no NaN left.
            data[column] = encoder.fit_transform(data[column].astype(str))
        if data[column].notna().any():
            imputed = imputer.fit_transform(data[column].values.reshape(-1, 1))
            data[column] = imputed.ravel()
        else:
            # SimpleImputer drops a feature with no observed values, which
            # would yield an (n, 0) array that cannot be assigned back.
            data[column] = 0.0

    return data

# Main function to execute the modeling process
def main(transactionPath='/content/train_transaction.csv',
         identityPath='/content/train_identity.csv'):
    """Train several classifiers on the fraud data and print a metric table.

    Args:
        transactionPath: Path of the transaction CSV. Defaults to the
            original hard-coded location.
        identityPath: Path of the identity CSV. Defaults to the original
            hard-coded location.

    Returns:
        Dict mapping each model name to a dict of its validation metrics
        (``"Accuracy"``, ``"Precision"``, ``"Recall"``, ``"F1 Score"``,
        ``"ROC-AUC"``).
    """
    data = loadData(transactionPath, identityPath)

    features = data.drop(['isFraud', 'TransactionID'], axis=1)
    y = data['isFraud'].values

    # Split BEFORE fitting the scaler and PCA. Fitting them on the full data
    # leaks validation-set statistics into training and inflates the scores.
    xTrain, xVal, yTrain, yVal = train_test_split(features, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    xTrain = scaler.fit_transform(xTrain)
    xVal = scaler.transform(xVal)

    pca = PCA(n_components=0.95)  # Reduce dimensions, keeping 95% of the variance
    xTrain = pca.fit_transform(xTrain)
    xVal = pca.transform(xVal)

    # Seed every estimator that accepts a seed so reruns are comparable.
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Gradient Boosting": GradientBoostingClassifier(n_estimators=50, random_state=42),
        "SVM": SVC(kernel='rbf', probability=True, random_state=42),
        "Random Forest": RandomForestClassifier(n_estimators=50, random_state=42, n_jobs=-1)
    }

    results = {}
    for name, model in models.items():
        model.fit(xTrain, yTrain)
        yPred = model.predict(xVal)
        results[name] = {
            "Accuracy": accuracy_score(yVal, yPred),
            # zero_division=0: report 0 quietly if a model never predicts fraud.
            "Precision": precision_score(yVal, yPred, zero_division=0),
            "Recall": recall_score(yVal, yPred, zero_division=0),
            "F1 Score": f1_score(yVal, yPred, zero_division=0),
            "ROC-AUC": roc_auc_score(yVal, model.predict_proba(xVal)[:, 1])
        }

    print("Model\t\t\tAccuracy\tPrecision\tRecall\t\tF1-Score\tROC-AUC")
    for name, metrics in results.items():
        print(f"{name}\t\t{metrics['Accuracy']:.2%}\t\t{metrics['Precision']:.2%}\t\t{metrics['Recall']:.2%}\t\t{metrics['F1 Score']:.2%}\t\t{metrics['ROC-AUC']:.2%}")

    return results

# Run the pipeline only when this code is executed as the main program
# (as it is in a notebook cell or a script), not when it is imported.
if __name__ == "__main__":
    main()


FileNotFoundError: [Errno 2] No such file or directory: '/content/train_transaction.csv'