In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score

# Load the dataset
file_path = 'Fraud_Data.csv'
df = pd.read_csv(file_path)

# Data preprocessing
# Assuming 'user_id', 'device_id', and 'ip_address' are not relevant for prediction
# Dropping these columns and any other non-numeric columns
df = df.drop(['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time'], axis=1)

# Encode categorical variables (source, browser, sex)
df = pd.get_dummies(df, columns=['source', 'browser', 'sex'])

# Split the data into features and target
X = df.drop('class', axis=1)
y = df['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test)

# Calculate performance measures
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Calculate additional measures
tn, fp, fn, tp = conf_matrix.ravel()
false_positive_rate = fp / (fp + tn)
true_negative_rate = tn / (tn + fp)
false_negative_rate = fn / (fn + tp)
cost = 100 * fn + 10 * (fp + tp)
f_measure = 2 * (precision * recall) / (precision + recall)
balanced_classification_rate = 1 * (tp / (tp + fn) + tn / (tn + fp))
mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
geometric_mean = (true_negative_rate * recall) ** 0.5

# Define weight for weighted accuracy calculation
w = 0.5  # Adjust the weight as needed

# Calculate weighted accuracy
weighted_accuracy = w * recall + (1 - w) * true_negative_rate

# Print the performance measures
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"False Positive Rate: {false_positive_rate:.4f}")
print(f"True Negative Rate: {true_negative_rate:.4f}")
print(f"False Negative Rate: {false_negative_rate:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Cost: {cost:.4f}")
print(f"F-measure: {f_measure:.4f}")
print(f"Balanced Classification Rate: {balanced_classification_rate:.4f}")
print(f"Mathews Correlation Coefficient: {mathews_corr_coef:.4f}")
print(f"Geometric Mean: {geometric_mean:.4f}")
print(f"Weighted Accuracy: {weighted_accuracy:.4f}")


Accuracy: 0.9224
Precision: 0.6229
Recall: 0.4498
False Positive Rate: 0.0283
True Negative Rate: 0.9717
False Negative Rate: 0.5502
ROC AUC Score: 0.7107
Cost: 177380.0000
F-measure: 0.5224
Balanced Classification Rate: 1.4215
Mathews Correlation Coefficient: 0.0000
Geometric Mean: 0.6611
Weighted Accuracy: 0.7107


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score

# Load the dataset
file_path = 'Fraud_Data.csv'
df = pd.read_csv(file_path)

# Data preprocessing
# Assuming 'user_id', 'device_id', and 'ip_address' are not relevant for prediction
# Dropping these columns and any other non-numeric columns
df = df.drop(['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time'], axis=1)

# Encode categorical variables (source, browser, sex)
df = pd.get_dummies(df, columns=['source', 'browser', 'sex'])

# Split the data into features and target
X = df.drop('class', axis=1)
y = df['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the KNN classifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Make predictions on the test set
y_pred = knn.predict(X_test)

# Calculate performance measures
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Calculate additional measures
tn, fp, fn, tp = conf_matrix.ravel()
false_positive_rate = fp / (fp + tn)
true_negative_rate = tn / (tn + fp)
false_negative_rate = fn / (fn + tp)
cost = 100 * fn + 10 * (fp + tp)
f_measure = 2 * (precision * recall) / (precision + recall)
balanced_classification_rate = 1 * (tp / (tp + fn) + tn / (tn + fp))
mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
geometric_mean = (true_negative_rate * recall) ** 0.5

# Define weight for weighted accuracy calculation
w = 0.5  # Adjust the weight as needed

# Calculate weighted accuracy
weighted_accuracy = w * recall + (1 - w) * true_negative_rate

# Print the performance measures
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"False Positive Rate: {false_positive_rate:.4f}")
print(f"True Negative Rate: {true_negative_rate:.4f}")
print(f"False Negative Rate: {false_negative_rate:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Cost: {cost:.4f}")
print(f"F-measure: {f_measure:.4f}")
print(f"Balanced Classification Rate: {balanced_classification_rate:.4f}")
print(f"Mathews Correlation Coefficient: {mathews_corr_coef:.4f}")
print(f"Geometric Mean: {geometric_mean:.4f}")
print(f"Weighted Accuracy: {weighted_accuracy:.4f}")

# Calculate and print the confusion matrix
print("\nConfusion Matrix:")
print(conf_matrix)


Accuracy: 0.9224
Precision: 0.6229
Recall: 0.4498
False Positive Rate: 0.0283
True Negative Rate: 0.9717
False Negative Rate: 0.5502
ROC AUC Score: 0.7107
Cost: 177380.0000
F-measure: 0.5224
Balanced Classification Rate: 1.4215
Mathews Correlation Coefficient: 0.0000
Geometric Mean: 0.6611
Weighted Accuracy: 0.7107

Confusion Matrix:
[[26597   776]
 [ 1568  1282]]


AdaBoost

In [13]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score

# Load the dataset
file_path = 'Fraud_Data.csv'
df = pd.read_csv(file_path)

# Data preprocessing
# Assuming 'user_id', 'device_id', and 'ip_address' are not relevant for prediction
# Dropping these columns and any other non-numeric columns
df = df.drop(['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time'], axis=1)

# Encode categorical variables (source, browser, sex)
df = pd.get_dummies(df, columns=['source', 'browser', 'sex'])

# Split the data into features and target
X = df.drop('class', axis=1)
y = df['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the AdaBoost classifier
adaboost = AdaBoostClassifier()
adaboost.fit(X_train, y_train)

# Make predictions on the test set
y_pred = adaboost.predict(X_test)

# Calculate performance measures
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)  # Set zero_division=0 to avoid Precision NaN warning
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Calculate additional measures
tn, fp, fn, tp = conf_matrix.ravel()
false_positive_rate = fp / (fp + tn)
true_negative_rate = tn / (tn + fp)
false_negative_rate = fn / (fn + tp)
cost = 100 * fn + 10 * (fp + tp)
f_measure = 2 * (precision * recall) / (precision + recall)
balanced_classification_rate = 1 * (tp / (tp + fn) + tn / (tn + fp))
mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
geometric_mean = (true_negative_rate * recall) ** 0.5

# Define weight for weighted accuracy calculation
w = 0.5  # Adjust the weight as needed

# Calculate weighted accuracy
weighted_accuracy = w * recall + (1 - w) * true_negative_rate

# Print the performance measures
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"False Positive Rate: {false_positive_rate:.4f}")
print(f"True Negative Rate: {true_negative_rate:.4f}")
print(f"False Negative Rate: {false_negative_rate:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Cost: {cost:.4f}")
print(f"F-measure: {f_measure:.4f}")
print(f"Balanced Classification Rate: {balanced_classification_rate:.4f}")
print(f"Mathews Correlation Coefficient: {mathews_corr_coef:.4f}")
print(f"Geometric Mean: {geometric_mean:.4f}")
print(f"Weighted Accuracy: {weighted_accuracy:.4f}")




Accuracy: 0.9057
Precision: 0.0000
Recall: 0.0000
False Positive Rate: 0.0000
True Negative Rate: 1.0000
False Negative Rate: 1.0000
ROC AUC Score: 0.5000
Cost: 285000.0000
F-measure: nan
Balanced Classification Rate: 1.0000
Mathews Correlation Coefficient: nan
Geometric Mean: 0.0000
Weighted Accuracy: 0.5000


  f_measure = 2 * (precision * recall) / (precision + recall)
  mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score

# Load the dataset
file_path = 'Fraud_Data.csv'
df = pd.read_csv(file_path)

# Data preprocessing
# Assuming 'user_id', 'device_id', and 'ip_address' are not relevant for prediction
# Dropping these columns and any other non-numeric columns
df = df.drop(['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time'], axis=1)

# Encode categorical variables (source, browser, sex)
df = pd.get_dummies(df, columns=['source', 'browser', 'sex'])

# Split the data into features and target
X = df.drop('class', axis=1)
y = df['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the AdaBoost classifier
adaboost = AdaBoostClassifier(n_estimators=50, random_state=42)
adaboost.fit(X_train, y_train)

# Make predictions on the test set
y_pred = adaboost.predict(X_test)

# Calculate performance measures
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Calculate additional measures
tn, fp, fn, tp = conf_matrix.ravel()
false_positive_rate = fp / (fp + tn)
true_negative_rate = tn / (tn + fp)
false_negative_rate = fn / (fn + tp)
cost = 100 * fn + 10 * (fp + tp)
f_measure = 2 * (precision * recall) / (precision + recall)
balanced_classification_rate = 1 * (tp / (tp + fn) + tn / (tn + fp))
mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
geometric_mean = (true_negative_rate * recall) ** 0.5

# Define weight for weighted accuracy calculation
w = 0.5  # Adjust the weight as needed

# Calculate weighted accuracy
weighted_accuracy = w * recall + (1 - w) * true_negative_rate

# Print the performance measures
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"False Positive Rate: {false_positive_rate:.4f}")
print(f"True Negative Rate: {true_negative_rate:.4f}")
print(f"False Negative Rate: {false_negative_rate:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Cost: {cost:.4f}")
print(f"F-measure: {f_measure:.4f}")
print(f"Balanced Classification Rate: {balanced_classification_rate:.4f}")
print(f"Mathews Correlation Coefficient: {mathews_corr_coef:.4f}")
print(f"Geometric Mean: {geometric_mean:.4f}")
print(f"Weighted Accuracy: {weighted_accuracy:.4f}")

# Calculate and print the confusion matrix
print("\nConfusion Matrix:")
print(conf_matrix)




Accuracy: 0.9057
Precision: 0.0000
Recall: 0.0000
False Positive Rate: 0.0000
True Negative Rate: 1.0000
False Negative Rate: 1.0000
ROC AUC Score: 0.5000
Cost: 285000.0000
F-measure: nan
Balanced Classification Rate: 1.0000
Mathews Correlation Coefficient: nan
Geometric Mean: 0.0000
Weighted Accuracy: 0.5000

Confusion Matrix:
[[27373     0]
 [ 2850     0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  f_measure = 2 * (precision * recall) / (precision + recall)
  mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))


Random Forest

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score

# Load the dataset
file_path = 'Fraud_Data.csv'
df = pd.read_csv(file_path)

# Data preprocessing
# Assuming 'user_id', 'device_id', and 'ip_address' are not relevant for prediction
# Dropping these columns and any other non-numeric columns
df = df.drop(['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time'], axis=1)

# Encode categorical variables (source, browser, sex)
df = pd.get_dummies(df, columns=['source', 'browser', 'sex'])

# Split the data into features and target
X = df.drop('class', axis=1)
y = df['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest classifier
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)

# Make predictions on the test set
y_pred = random_forest.predict(X_test)

# Calculate performance measures
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)  # Set zero_division=0 to avoid Precision NaN warning
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Calculate additional measures
tn, fp, fn, tp = conf_matrix.ravel()
false_positive_rate = fp / (fp + tn)
true_negative_rate = tn / (tn + fp)
false_negative_rate = fn / (fn + tp)
cost = 100 * fn + 10 * (fp + tp)
f_measure = 2 * (precision * recall) / (precision + recall)
balanced_classification_rate = 1 * (tp / (tp + fn) + tn / (tn + fp))
mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
geometric_mean = (true_negative_rate * recall) ** 0.5

# Define weight for weighted accuracy calculation
w = 0.5  # Adjust the weight as needed

# Calculate weighted accuracy
weighted_accuracy = w * recall + (1 - w) * true_negative_rate

# Print the performance measures
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"False Positive Rate: {false_positive_rate:.4f}")
print(f"True Negative Rate: {true_negative_rate:.4f}")
print(f"False Negative Rate: {false_negative_rate:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Cost: {cost:.4f}")
print(f"F-measure: {f_measure:.4f}")
print(f"Balanced Classification Rate: {balanced_classification_rate:.4f}")
print(f"Mathews Correlation Coefficient: {mathews_corr_coef:.4f}")
print(f"Geometric Mean: {geometric_mean:.4f}")
print(f"Weighted Accuracy: {weighted_accuracy:.4f}")


Accuracy: 0.9204
Precision: 0.5959
Recall: 0.4839
False Positive Rate: 0.0342
True Negative Rate: 0.9658
False Negative Rate: 0.5161
ROC AUC Score: 0.7249
Cost: 170240.0000
F-measure: 0.5341
Balanced Classification Rate: 1.4497
Mathews Correlation Coefficient: 0.0000
Geometric Mean: 0.6836
Weighted Accuracy: 0.7249


In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score

# Load the dataset
file_path = 'Fraud_Data.csv'
df = pd.read_csv(file_path)

# Data preprocessing
# Assuming 'user_id', 'device_id', and 'ip_address' are not relevant for prediction
# Dropping these columns and any other non-numeric columns
df = df.drop(['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time'], axis=1)

# Encode categorical variables (source, browser, sex)
df = pd.get_dummies(df, columns=['source', 'browser', 'sex'])

# Split the data into features and target
X = df.drop('class', axis=1)
y = df['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Random Forest classifier
random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)

# Make predictions on the test set
y_pred = random_forest.predict(X_test)

# Calculate performance measures
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)  # Set zero_division=0 to avoid Precision NaN warning
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)  # Confusion matrix calculation
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Calculate additional measures
tn, fp, fn, tp = conf_matrix.ravel()
false_positive_rate = fp / (fp + tn)
true_negative_rate = tn / (tn + fp)
false_negative_rate = fn / (fn + tp)
cost = 100 * fn + 10 * (fp + tp)
f_measure = 2 * (precision * recall) / (precision + recall)
balanced_classification_rate = 1 * (tp / (tp + fn) + tn / (tn + fp))
mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
geometric_mean = (true_negative_rate * recall) ** 0.5

# Define weight for weighted accuracy calculation
w = 0.5  # Adjust the weight as needed

# Calculate weighted accuracy
weighted_accuracy = w * recall + (1 - w) * true_negative_rate

# Print the performance measures
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"False Positive Rate: {false_positive_rate:.4f}")
print(f"True Negative Rate: {true_negative_rate:.4f}")
print(f"False Negative Rate: {false_negative_rate:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Cost: {cost:.4f}")
print(f"F-measure: {f_measure:.4f}")
print(f"Balanced Classification Rate: {balanced_classification_rate:.4f}")
print(f"Mathews Correlation Coefficient: {mathews_corr_coef:.4f}")
print(f"Geometric Mean: {geometric_mean:.4f}")
print(f"Weighted Accuracy: {weighted_accuracy:.4f}")

# Print the confusion matrix
print("\nConfusion Matrix:")
print(conf_matrix)


Accuracy: 0.9209
Precision: 0.5996
Recall: 0.4839
False Positive Rate: 0.0336
True Negative Rate: 0.9664
False Negative Rate: 0.5161
ROC AUC Score: 0.7251
Cost: 170100.0000
F-measure: 0.5355
Balanced Classification Rate: 1.4502
Mathews Correlation Coefficient: 0.0000
Geometric Mean: 0.6838
Weighted Accuracy: 0.7251

Confusion Matrix:
[[26452   921]
 [ 1471  1379]]


Decision Tree

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score

# Load the dataset
file_path = 'Fraud_Data.csv'
df = pd.read_csv(file_path)

# Data preprocessing
# Assuming 'user_id', 'device_id', and 'ip_address' are not relevant for prediction
# Dropping these columns and any other non-numeric columns
df = df.drop(['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time'], axis=1)

# Encode categorical variables (source, browser, sex)
df = pd.get_dummies(df, columns=['source', 'browser', 'sex'])

# Split the data into features and target
X = df.drop('class', axis=1)
y = df['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Decision Tree classifier
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

# Make predictions on the test set
y_pred = decision_tree.predict(X_test)

# Calculate performance measures
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)  # Set zero_division=0 to avoid Precision NaN warning
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Calculate additional measures
tn, fp, fn, tp = conf_matrix.ravel()
false_positive_rate = fp / (fp + tn)
true_negative_rate = tn / (tn + fp)
false_negative_rate = fn / (fn + tp)
cost = 100 * fn + 10 * (fp + tp)
f_measure = 2 * (precision * recall) / (precision + recall)
balanced_classification_rate = 1 * (tp / (tp + fn) + tn / (tn + fp))
mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
geometric_mean = (true_negative_rate * recall) ** 0.5

# Define weight for weighted accuracy calculation
w = 0.5  # Adjust the weight as needed

# Calculate weighted accuracy
weighted_accuracy = w * recall + (1 - w) * true_negative_rate

# Print the performance measures
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"False Positive Rate: {false_positive_rate:.4f}")
print(f"True Negative Rate: {true_negative_rate:.4f}")
print(f"False Negative Rate: {false_negative_rate:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Cost: {cost:.4f}")
print(f"F-measure: {f_measure:.4f}")
print(f"Balanced Classification Rate: {balanced_classification_rate:.4f}")
print(f"Mathews Correlation Coefficient: {mathews_corr_coef:.4f}")
print(f"Geometric Mean: {geometric_mean:.4f}")
print(f"Weighted Accuracy: {weighted_accuracy:.4f}")


Accuracy: 0.9144
Precision: 0.5537
Recall: 0.4737
False Positive Rate: 0.0397
True Negative Rate: 0.9603
False Negative Rate: 0.5263
ROC AUC Score: 0.7170
Cost: 174380.0000
F-measure: 0.5106
Balanced Classification Rate: 1.4339
Mathews Correlation Coefficient: 0.0000
Geometric Mean: 0.6744
Weighted Accuracy: 0.7170


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score

# Load the dataset
file_path = 'Fraud_Data.csv'
df = pd.read_csv(file_path)

# Data preprocessing
# Assuming 'user_id', 'device_id', and 'ip_address' are not relevant for prediction
# Dropping these columns and any other non-numeric columns
df = df.drop(['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time'], axis=1)

# Encode categorical variables (source, browser, sex)
df = pd.get_dummies(df, columns=['source', 'browser', 'sex'])

# Split the data into features and target
X = df.drop('class', axis=1)
y = df['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Decision Tree classifier
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

# Make predictions on the test set
y_pred = decision_tree.predict(X_test)

# Calculate performance measures
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)  # Set zero_division=0 to avoid Precision NaN warning
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)  # Confusion matrix calculation
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Calculate additional measures
tn, fp, fn, tp = conf_matrix.ravel()
false_positive_rate = fp / (fp + tn)
true_negative_rate = tn / (tn + fp)
false_negative_rate = fn / (fn + tp)
cost = 100 * fn + 10 * (fp + tp)
f_measure = 2 * (precision * recall) / (precision + recall)
balanced_classification_rate = 1 * (tp / (tp + fn) + tn / (tn + fp))
mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
geometric_mean = (true_negative_rate * recall) ** 0.5

# Define weight for weighted accuracy calculation
w = 0.5  # Adjust the weight as needed

# Calculate weighted accuracy
weighted_accuracy = w * recall + (1 - w) * true_negative_rate

# Print the performance measures
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"False Positive Rate: {false_positive_rate:.4f}")
print(f"True Negative Rate: {true_negative_rate:.4f}")
print(f"False Negative Rate: {false_negative_rate:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Cost: {cost:.4f}")
print(f"F-measure: {f_measure:.4f}")
print(f"Balanced Classification Rate: {balanced_classification_rate:.4f}")
print(f"Mathews Correlation Coefficient: {mathews_corr_coef:.4f}")
print(f"Geometric Mean: {geometric_mean:.4f}")
print(f"Weighted Accuracy: {weighted_accuracy:.4f}")

# Print the confusion matrix
print("\nConfusion Matrix:")
print(conf_matrix)


Accuracy: 0.9145
Precision: 0.5548
Recall: 0.4740
False Positive Rate: 0.0396
True Negative Rate: 0.9604
False Negative Rate: 0.5260
ROC AUC Score: 0.7172
Cost: 174250.0000
F-measure: 0.5113
Balanced Classification Rate: 1.4344
Mathews Correlation Coefficient: 0.0000
Geometric Mean: 0.6747
Weighted Accuracy: 0.7172

Confusion Matrix:
[[26289  1084]
 [ 1499  1351]]


Logistic Regression 

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score

# Load the dataset
file_path = 'Fraud_Data.csv'
df = pd.read_csv(file_path)

# Data preprocessing
# Assuming 'user_id', 'device_id', and 'ip_address' are not relevant for prediction
# Dropping these columns and any other non-numeric columns
df = df.drop(['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time'], axis=1)

# Encode categorical variables (source, browser, sex)
df = pd.get_dummies(df, columns=['source', 'browser', 'sex'])

# Split the data into features and target
X = df.drop('class', axis=1)
y = df['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Logistic Regression classifier
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_reg.predict(X_test)

# Calculate performance measures
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)  # Set zero_division=0 to avoid Precision NaN warning
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Calculate additional measures
tn, fp, fn, tp = conf_matrix.ravel()
false_positive_rate = fp / (fp + tn)
true_negative_rate = tn / (tn + fp)
false_negative_rate = fn / (fn + tp)
cost = 100 * fn + 10 * (fp + tp)
f_measure = 2 * (precision * recall) / (precision + recall)
balanced_classification_rate = 1 * (tp / (tp + fn) + tn / (tn + fp))
mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
geometric_mean = (true_negative_rate * recall) ** 0.5

# Define weight for weighted accuracy calculation
w = 0.5  # Adjust the weight as needed

# Calculate weighted accuracy
weighted_accuracy = w * recall + (1 - w) * true_negative_rate

# Print the performance measures
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"False Positive Rate: {false_positive_rate:.4f}")
print(f"True Negative Rate: {true_negative_rate:.4f}")
print(f"False Negative Rate: {false_negative_rate:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Cost: {cost:.4f}")
print(f"F-measure: {f_measure:.4f}")
print(f"Balanced Classification Rate: {balanced_classification_rate:.4f}")
print(f"Mathews Correlation Coefficient: {mathews_corr_coef:.4f}")
print(f"Geometric Mean: {geometric_mean:.4f}")
print(f"Weighted Accuracy: {weighted_accuracy:.4f}")


Accuracy: 0.9057
Precision: 0.0000
Recall: 0.0000
False Positive Rate: 0.0000
True Negative Rate: 1.0000
False Negative Rate: 1.0000
ROC AUC Score: 0.5000
Cost: 285000.0000
F-measure: nan
Balanced Classification Rate: 1.0000
Mathews Correlation Coefficient: nan
Geometric Mean: 0.0000
Weighted Accuracy: 0.5000


  f_measure = 2 * (precision * recall) / (precision + recall)
  mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score

# Load the dataset
file_path = 'Fraud_Data.csv'
df = pd.read_csv(file_path)

# Data preprocessing
# Assuming 'user_id', 'device_id', and 'ip_address' are not relevant for prediction
# Dropping these columns and any other non-numeric columns
df = df.drop(['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time'], axis=1)

# Encode categorical variables (source, browser, sex)
df = pd.get_dummies(df, columns=['source', 'browser', 'sex'])

# Split the data into features and target
X = df.drop('class', axis=1)
y = df['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the Logistic Regression classifier
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Make predictions on the test set
y_pred = log_reg.predict(X_test)

# Calculate performance measures
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)  # Set zero_division=0 to avoid Precision NaN warning
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)  # Confusion matrix calculation
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Calculate additional measures
tn, fp, fn, tp = conf_matrix.ravel()
false_positive_rate = fp / (fp + tn)
true_negative_rate = tn / (tn + fp)
false_negative_rate = fn / (fn + tp)
cost = 100 * fn + 10 * (fp + tp)
f_measure = 2 * (precision * recall) / (precision + recall)
balanced_classification_rate = 1 * (tp / (tp + fn) + tn / (tn + fp))
mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
geometric_mean = (true_negative_rate * recall) ** 0.5

# Define weight for weighted accuracy calculation
w = 0.5  # Adjust the weight as needed

# Calculate weighted accuracy
weighted_accuracy = w * recall + (1 - w) * true_negative_rate

# Print the performance measures
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"False Positive Rate: {false_positive_rate:.4f}")
print(f"True Negative Rate: {true_negative_rate:.4f}")
print(f"False Negative Rate: {false_negative_rate:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Cost: {cost:.4f}")
print(f"F-measure: {f_measure:.4f}")
print(f"Balanced Classification Rate: {balanced_classification_rate:.4f}")
print(f"Mathews Correlation Coefficient: {mathews_corr_coef:.4f}")
print(f"Geometric Mean: {geometric_mean:.4f}")
print(f"Weighted Accuracy: {weighted_accuracy:.4f}")

# Print the confusion matrix
print("\nConfusion Matrix:")
print(conf_matrix)


Accuracy: 0.9057
Precision: 0.0000
Recall: 0.0000
False Positive Rate: 0.0000
True Negative Rate: 1.0000
False Negative Rate: 1.0000
ROC AUC Score: 0.5000
Cost: 285000.0000
F-measure: nan
Balanced Classification Rate: 1.0000
Mathews Correlation Coefficient: nan
Geometric Mean: 0.0000
Weighted Accuracy: 0.5000

Confusion Matrix:
[[27373     0]
 [ 2850     0]]


  f_measure = 2 * (precision * recall) / (precision + recall)
  mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))


SVM

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score

# Load the dataset
file_path = 'Fraud_Data.csv'
df = pd.read_csv(file_path)

# Data preprocessing
# Assuming 'user_id', 'device_id', and 'ip_address' are not relevant for prediction
# Dropping these columns and any other non-numeric columns
df = df.drop(['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time'], axis=1)

# Encode categorical variables (source, browser, sex)
df = pd.get_dummies(df, columns=['source', 'browser', 'sex'])

# Split the data into features and target
X = df.drop('class', axis=1)
y = df['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the SVM classifier
svm = SVC()
svm.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm.predict(X_test)

# Calculate performance measures
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)  # Set zero_division=0 to avoid Precision NaN warning
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Calculate additional measures
tn, fp, fn, tp = conf_matrix.ravel()
false_positive_rate = fp / (fp + tn)
true_negative_rate = tn / (tn + fp)
false_negative_rate = fn / (fn + tp)
cost = 100 * fn + 10 * (fp + tp)
f_measure = 2 * (precision * recall) / (precision + recall)
balanced_classification_rate = 1 * (tp / (tp + fn) + tn / (tn + fp))
mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
geometric_mean = (true_negative_rate * recall) ** 0.5

# Define weight for weighted accuracy calculation
w = 0.5  # Adjust the weight as needed

# Calculate weighted accuracy
weighted_accuracy = w * recall + (1 - w) * true_negative_rate

# Print the performance measures
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"False Positive Rate: {false_positive_rate:.4f}")
print(f"True Negative Rate: {true_negative_rate:.4f}")
print(f"False Negative Rate: {false_negative_rate:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Cost: {cost:.4f}")
print(f"F-measure: {f_measure:.4f}")
print(f"Balanced Classification Rate: {balanced_classification_rate:.4f}")
print(f"Mathews Correlation Coefficient: {mathews_corr_coef:.4f}")
print(f"Geometric Mean: {geometric_mean:.4f}")
print(f"Weighted Accuracy: {weighted_accuracy:.4f}")


Accuracy: 0.9057
Precision: 0.0000
Recall: 0.0000
False Positive Rate: 0.0000
True Negative Rate: 1.0000
False Negative Rate: 1.0000
ROC AUC Score: 0.5000
Cost: 285000.0000
F-measure: nan
Balanced Classification Rate: 1.0000
Mathews Correlation Coefficient: nan
Geometric Mean: 0.0000
Weighted Accuracy: 0.5000


  f_measure = 2 * (precision * recall) / (precision + recall)
  mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, roc_auc_score

# Load the dataset
file_path = 'Fraud_Data.csv'
df = pd.read_csv(file_path)

# Data preprocessing
# Assuming 'user_id', 'device_id', and 'ip_address' are not relevant for prediction
# Dropping these columns and any other non-numeric columns
df = df.drop(['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time'], axis=1)

# Encode categorical variables (source, browser, sex)
df = pd.get_dummies(df, columns=['source', 'browser', 'sex'])

# Split the data into features and target
X = df.drop('class', axis=1)
y = df['class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train the SVM classifier
svm = SVC()
svm.fit(X_train, y_train)

# Make predictions on the test set
y_pred = svm.predict(X_test)

# Calculate performance measures
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, zero_division=0)  # Set zero_division=0 to avoid Precision NaN warning
recall = recall_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)  # Confusion matrix calculation
fpr, tpr, thresholds = roc_curve(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

# Calculate additional measures
tn, fp, fn, tp = conf_matrix.ravel()
false_positive_rate = fp / (fp + tn)
true_negative_rate = tn / (tn + fp)
false_negative_rate = fn / (fn + tp)
cost = 100 * fn + 10 * (fp + tp)
f_measure = 2 * (precision * recall) / (precision + recall)
balanced_classification_rate = 1 * (tp / (tp + fn) + tn / (tn + fp))
mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
geometric_mean = (true_negative_rate * recall) ** 0.5

# Define weight for weighted accuracy calculation
w = 0.5  # Adjust the weight as needed

# Calculate weighted accuracy
weighted_accuracy = w * recall + (1 - w) * true_negative_rate

# Print the performance measures
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"False Positive Rate: {false_positive_rate:.4f}")
print(f"True Negative Rate: {true_negative_rate:.4f}")
print(f"False Negative Rate: {false_negative_rate:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print(f"Cost: {cost:.4f}")
print(f"F-measure: {f_measure:.4f}")
print(f"Balanced Classification Rate: {balanced_classification_rate:.4f}")
print(f"Mathews Correlation Coefficient: {mathews_corr_coef:.4f}")
print(f"Geometric Mean: {geometric_mean:.4f}")
print(f"Weighted Accuracy: {weighted_accuracy:.4f}")

# Print the confusion matrix
print("\nConfusion Matrix:")
print(conf_matrix)


Accuracy: 0.9057
Precision: 0.0000
Recall: 0.0000
False Positive Rate: 0.0000
True Negative Rate: 1.0000
False Negative Rate: 1.0000
ROC AUC Score: 0.5000
Cost: 285000.0000
F-measure: nan
Balanced Classification Rate: 1.0000
Mathews Correlation Coefficient: nan
Geometric Mean: 0.0000
Weighted Accuracy: 0.5000

Confusion Matrix:
[[27373     0]
 [ 2850     0]]


  f_measure = 2 * (precision * recall) / (precision + recall)
  mathews_corr_coef = (tp * tn - fp * fn) / ((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
