# In [None]:
import pandas as pd
import numpy as np
import joblib
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Download stopwords (only needed once)
nltk.download('stopwords')
from nltk.corpus import stopwords

# Input CSV of call detail records and destination of the pickled model.
cdr_file_path = 'data/CDR-Call-Details.csv'
model_path = 'cdr_fraud_model.pkl'

# Columns used as model inputs, and the label column to predict.
cdr_feature_columns = [
    'Day Mins', 'Day Calls',
    'Eve Mins', 'Eve Calls',
    'Night Mins', 'Night Calls',
    'Intl Mins', 'Intl Calls',
    'CustServ Calls',
]
cdr_target_column = 'isFraud'

# Read the full CSV, then narrow the frame to the features plus the label.
# Selecting by name raises KeyError if any of these columns is absent.
cdr_df = pd.read_csv(cdr_file_path)
cdr_df = cdr_df[cdr_feature_columns + [cdr_target_column]]

# Separate the feature matrix from the label vector.
y_cdr = cdr_df[cdr_target_column]
X_cdr = cdr_df.drop(columns=[cdr_target_column])

# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
# The split is stratified on the label so train and test keep the same
# class proportions; with a plain random split a rare class (fraud labels
# are often imbalanced -- not verified for this dataset) can be under- or
# over-represented in the test set, which distorts the metrics below.
# train_test_split raises ValueError when stratifying on a class that has
# fewer than two rows, so fall back to an unstratified split in that case.
cdr_class_counts = y_cdr.value_counts()
if not cdr_class_counts.empty and cdr_class_counts.min() >= 2:
    cdr_stratify = y_cdr
else:
    cdr_stratify = None

X_train_cdr, X_test_cdr, y_train_cdr, y_test_cdr = train_test_split(
    X_cdr,
    y_cdr,
    test_size=0.2,
    random_state=42,
    stratify=cdr_stratify,
)

# Fit a 100-tree Random Forest on the training portion.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_cdr, y_train_cdr)

# Persist the fitted model to model_path.
joblib.dump(rf_model, model_path)

# Score the model on the held-out rows: overall accuracy, then the
# per-class precision/recall/F1 report.
y_pred_cdr = rf_model.predict(X_test_cdr)
print('CDR Fraud Model Accuracy:', accuracy_score(y_test_cdr, y_pred_cdr))
print(classification_report(y_test_cdr, y_pred_cdr))
