In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import pandas as pd

from sklearn.preprocessing import StandardScaler

# Experiment: logistic regression on the ten "mean ..." columns of the
# breast-cancer dataset, scored at a custom probability threshold, once on the
# raw features and once on standardized features.
data = load_breast_cancer(return_X_y=False, as_frame=True)
df = data.frame
features = ['mean radius', 'mean texture', 'mean perimeter', 'mean area',
            'mean smoothness', 'mean compactness', 'mean concavity',
            'mean concave points', 'mean symmetry', 'mean fractal dimension']
X = df[features]
y = df.target

# A single split is shared by both runs so they are trained and scored on
# exactly the same rows; the only difference between the runs is the scaling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=28)

# A sample is labelled 1 when its predicted probability for class 1 is at
# least this value.
threshold = 0.4

# --- Run 1: raw (unscaled) features ---
# max_iter is raised well above the default so the solver is not cut short on
# unscaled inputs, and so both runs use the same hyper-parameters.
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]
y_pred_custom_thresh = (y_prob >= threshold).astype(int)

cmatrix1 = confusion_matrix(y_test, y_pred_custom_thresh)
print("Confusion matrix 1:")
print(cmatrix1)
print('accuracy score:', accuracy_score(y_test, y_pred_custom_thresh))
print('f1 score:', f1_score(y_test, y_pred_custom_thresh))
print()

# --- Run 2: standardized features ---
# The scaler is fitted on the training rows only. Fitting it on the whole
# dataset before splitting would let test-set means/variances leak into the
# training data. X itself is left untouched so the cell can be re-read and
# re-run without the raw frame being replaced by a scaled array.
scaler = StandardScaler()
X_train2 = scaler.fit_transform(X_train)
X_test2 = scaler.transform(X_test)
y_train2, y_test2 = y_train, y_test

model2 = LogisticRegression(max_iter=10000)
model2.fit(X_train2, y_train2)

y_pred2 = model2.predict(X_test2)
y_prob2 = model2.predict_proba(X_test2)[:, 1]
y_pred2_custom_thresh = (y_prob2 >= threshold).astype(int)

cmatrix2 = confusion_matrix(y_test2, y_pred2_custom_thresh)
print("Confusion matrix 2:")
print(cmatrix2)
print('accuracy score:', accuracy_score(y_test2, y_pred2_custom_thresh))
print('f1 score:', f1_score(y_test2, y_pred2_custom_thresh))

In [None]:
from sklearn.preprocessing import StandardScaler

# Experiment: same comparison (raw vs. standardized inputs), but using every
# non-target column of the dataset, so the "error" columns are treated as
# features as well.
data = load_breast_cancer(return_X_y=False, as_frame=True)
df = data.frame

# Drop the label by name rather than by position so the selection does not
# depend on where the target column sits in the frame.
X = df.drop(columns='target')
y = df.target

# A single split is shared by both runs so they are trained and scored on
# exactly the same rows; the only difference between the runs is the scaling.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=28)

# A sample is labelled 1 when its predicted probability for class 1 is at
# least this value.
threshold = 0.35

# --- Run 1: raw (unscaled) features ---
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test)[:, 1]
y_pred_custom_thresh = (y_prob >= threshold).astype(int)

cmatrix1 = confusion_matrix(y_test, y_pred_custom_thresh)
print("Confusion matrix 1:")
print(cmatrix1)
print('accuracy score:', accuracy_score(y_test, y_pred_custom_thresh))
print('f1 score:', f1_score(y_test, y_pred_custom_thresh))
print()

# --- Run 2: standardized features ---
# The scaler is fitted on the training rows only. Fitting it on the whole
# dataset before splitting would let test-set means/variances leak into the
# training data. X itself is left untouched so the raw frame is not replaced
# by a scaled array.
scaler = StandardScaler()
X_train2 = scaler.fit_transform(X_train)
X_test2 = scaler.transform(X_test)
y_train2, y_test2 = y_train, y_test

model2 = LogisticRegression(max_iter=10000)
model2.fit(X_train2, y_train2)

y_pred2 = model2.predict(X_test2)
y_prob2 = model2.predict_proba(X_test2)[:, 1]
y_pred2_custom_thresh = (y_prob2 >= threshold).astype(int)

cmatrix2 = confusion_matrix(y_test2, y_pred2_custom_thresh)
print("Confusion matrix 2:")
print(cmatrix2)
print('accuracy score:', accuracy_score(y_test2, y_pred2_custom_thresh))
print('f1 score:', f1_score(y_test2, y_pred2_custom_thresh))

In [None]:
# Hand-computed ratio: (25 + 36) out of (25 + 36 + 9).
# NOTE(review): presumably an accuracy figure worked out by hand from
# confusion-matrix counts -- confirm which run these numbers come from.
print(sum([25, 36]) / sum([25, 36, 9]))

In [None]:
import seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the data
transactions = pd.read_csv('transactions_modified.csv')
print(transactions.head())
# info() writes its report directly and returns None, so wrapping it in
# print() would add a stray "None" line to the output.
transactions.info()

# Summary statistics on amount column. Printed explicitly: a bare expression
# in the middle of a cell is evaluated but never displayed.
print(transactions['amount'].describe())

# Create isPayment field: 1 for PAYMENT / DEBIT rows, 0 otherwise.
# The flag is built in a single column assignment. The chained form
# transactions[col][mask] = 1 can end up writing to a temporary copy
# (SettingWithCopyWarning) and leave the frame unchanged.
transactions['isPayment'] = transactions['type'].isin(['PAYMENT', 'DEBIT']).astype(int)

# Create isMovement field: 1 for CASH_OUT / TRANSFER rows, 0 otherwise.
transactions['isMovement'] = transactions['type'].isin(['CASH_OUT', 'TRANSFER']).astype(int)

# Create accountDiff field: absolute gap between the destination and origin
# balances before the transaction.
transactions['accountDiff'] = (transactions['oldbalanceDest'] - transactions['oldbalanceOrg']).abs()

# Create features and label variables
features = transactions[['amount', 'isPayment', 'isMovement', 'accountDiff']]
label = transactions['isFraud']

# Split dataset (70% train / 30% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(features,
                                                    label,
                                                    test_size=0.3, random_state=30)

# Normalize the features variables. The scaler is fitted on the training
# split only and then applied unchanged to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit the model to the training data
model = LogisticRegression()
model.fit(X_train, y_train)

# Score the model on the training data
print(model.score(X_train, y_train))

# Score the model on the test data
print(model.score(X_test, y_test))

# Print the model coefficients (one per column of `features`, in order)
print(model.coef_)

# The remaining steps are currently disabled; they score a few hand-written
# transactions with the fitted scaler and model.

# # New transaction data
# transaction1 = np.array([123456.78, 0.0, 1.0, 54670.1])
# transaction2 = np.array([98765.43, 1.0, 0.0, 8524.75])
# transaction3 = np.array([543678.31, 1.0, 0.0, 510025.5])

# # Create a new transaction
# your_transaction = np.array([6472.54, 1.0, 0.0, 55901.23])

# # Combine new transactions into a single array
# sample_transactions = np.stack((transaction1,transaction2,transaction3,your_transaction))

# # Normalize the new transactions
# sample_transactions = scaler.transform(sample_transactions)

# # Predict fraud on the new transactions
# print(model.predict(sample_transactions))

# # Show probabilities on the new transactions
# print(model.predict_proba(sample_transactions))