In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline


In [2]:
df=pd.read_csv('fraudTrain.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [4]:
df.shape

(1296675, 23)

In [5]:
df.value_counts('is_fraud')

is_fraud
0    1289169
1       7506
Name: count, dtype: int64

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,...,lat,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud
0,0,2019-01-01 00:00:18,2703186189652095,"fraud_Rippin, Kub and Mann",misc_net,4.97,Jennifer,Banks,F,561 Perry Cove,...,36.0788,-81.1781,3495,"Psychologist, counselling",1988-03-09,0b242abb623afc578575680df30655b9,1325376018,36.011293,-82.048315,0
1,1,2019-01-01 00:00:44,630423337322,"fraud_Heller, Gutmann and Zieme",grocery_pos,107.23,Stephanie,Gill,F,43039 Riley Greens Suite 393,...,48.8878,-118.2105,149,Special educational needs teacher,1978-06-21,1f76529f8574734946361c461b024d99,1325376044,49.159047,-118.186462,0
2,2,2019-01-01 00:00:51,38859492057661,fraud_Lind-Buckridge,entertainment,220.11,Edward,Sanchez,M,594 White Dale Suite 530,...,42.1808,-112.262,4154,Nature conservation officer,1962-01-19,a1a22d70485983eac12b5b88dad1cf95,1325376051,43.150704,-112.154481,0
3,3,2019-01-01 00:01:16,3534093764340240,"fraud_Kutch, Hermiston and Farrell",gas_transport,45.0,Jeremy,White,M,9443 Cynthia Court Apt. 038,...,46.2306,-112.1138,1939,Patent attorney,1967-01-12,6b849c168bdad6f867558c3793159a81,1325376076,47.034331,-112.561071,0
4,4,2019-01-01 00:03:06,375534208663984,fraud_Keeling-Crist,misc_pos,41.96,Tyler,Garcia,M,408 Bradley Rest,...,38.4207,-79.4629,99,Dance movement psychotherapist,1986-03-28,a41d7549acf90789359a9aa5346dcb46,1325376186,38.674999,-78.632459,0


In [7]:
print(df.columns.tolist())

['Unnamed: 0', 'trans_date_trans_time', 'cc_num', 'merchant', 'category', 'amt', 'first', 'last', 'gender', 'street', 'city', 'state', 'zip', 'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'unix_time', 'merch_lat', 'merch_long', 'is_fraud']


In [8]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 23 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   Unnamed: 0             1296675 non-null  int64  
 1   trans_date_trans_time  1296675 non-null  object 
 2   cc_num                 1296675 non-null  int64  
 3   merchant               1296675 non-null  object 
 4   category               1296675 non-null  object 
 5   amt                    1296675 non-null  float64
 6   first                  1296675 non-null  object 
 7   last                   1296675 non-null  object 
 8   gender                 1296675 non-null  object 
 9   street                 1296675 non-null  object 
 10  city                   1296675 non-null  object 
 11  state                  1296675 non-null  object 
 12  zip                    1296675 non-null  int64  
 13  lat                    1296675 non-null  float64
 14  long              

In [9]:
# Distance between merchant & customer
df['distance_miles'] = np.sqrt((df['lat'] - df['merch_lat'])**2 + 
                               (df['long'] - df['merch_long'])**2)

# Transaction hour (fraud spikes at night)
df['hour'] = pd.to_datetime(df['trans_date_trans_time']).dt.hour

# Customer age
df['age'] = 2025 - pd.to_datetime(df['dob']).dt.year

In [10]:
df = df.drop(columns=['Unnamed: 0', 'first', 'last', 'trans_num', 'cc_num', 'street'])

In [11]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1296675 entries, 0 to 1296674
Data columns (total 20 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   trans_date_trans_time  1296675 non-null  object 
 1   merchant               1296675 non-null  object 
 2   category               1296675 non-null  object 
 3   amt                    1296675 non-null  float64
 4   gender                 1296675 non-null  object 
 5   city                   1296675 non-null  object 
 6   state                  1296675 non-null  object 
 7   zip                    1296675 non-null  int64  
 8   lat                    1296675 non-null  float64
 9   long                   1296675 non-null  float64
 10  city_pop               1296675 non-null  int64  
 11  job                    1296675 non-null  object 
 12  dob                    1296675 non-null  object 
 13  unix_time              1296675 non-null  int64  
 14  merch_lat         

In [12]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from xgboost import XGBClassifier

# y = df["is_fraud"]
# X = df.drop(columns=["is_fraud", "Unnamed: 0"], errors="ignore")

# # Identify categorical & numeric columns
# cat_cols = X.select_dtypes(include=["object"]).columns
# num_cols = X.select_dtypes(include=["int64", "float64"]).columns

# # Preprocessing (One-Hot + Standard Scaling)
# preprocess = ColumnTransformer(
#     transformers=[
#         ("num", StandardScaler(), num_cols),
#         ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
#     ]
# )

# # Split data
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.2, random_state=42, stratify=y
# )

# # Define models
# models = {
#     "Logistic Regression": LogisticRegression(max_iter=1000),
#     "Decision Tree": DecisionTreeClassifier(max_depth=10),
#     "XGBoost": XGBClassifier(
#         n_estimators=200,
#         max_depth=6,
#         learning_rate=0.1,
#         objective="binary:logistic",
#         eval_metric="logloss",
#         n_jobs=-1,
#         random_state=42)
# }

# # Train & evaluate
# for name, model in models.items():
#     clf = Pipeline(steps=[("preprocess", preprocess), ("model", model)])
#     clf.fit(X_train, y_train)
#     y_pred = clf.predict(X_test)
#     y_prob = clf.predict_proba(X_test)[:, 1]
    
#     print(f"\n==== {name} ====")
#     print(classification_report(y_test, y_pred, digits=4))
#     print("ROC-AUC:", round(roc_auc_score(y_test, y_prob), 4))

In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.tree import DecisionTreeClassifier
# from xgboost import XGBClassifier
# # âœ… Load training data
# train_df = pd.read_csv("fraudTrain.csv", low_memory=False)

# # âœ… Load separate test data
# test_df = pd.read_csv("fraudTest.csv", low_memory=False)

# # âœ… Target column
# y_train = train_df["is_fraud"]
# X_train = train_df.drop(columns=["is_fraud", "Unnamed: 0"], errors="ignore")

# y_test = test_df["is_fraud"]
# X_test = test_df.drop(columns=["is_fraud", "Unnamed: 0"], errors="ignore")

# # âœ… Identify column types
# cat_cols = X_train.select_dtypes(include=["object"]).columns
# num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns

# # âœ… Preprocessing pipeline
# preprocess = ColumnTransformer(
#     transformers=[
#         ("num", StandardScaler(), num_cols),
#         ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
#     ]
# )

# # âœ… Define models
# models = {
#     "Logistic Regression": LogisticRegression(max_iter=1000),
#     "Decision Tree": DecisionTreeClassifier(max_depth=10),
#     "XGBoost": XGBClassifier(
#         n_estimators=200,
#         learning_rate=0.1,
#         max_depth=6,
#         objective="binary:logistic",
#         eval_metric="logloss",
#         n_jobs=-1)
# }

# # âœ… Train & Evaluate on separate test data
# for name, model in models.items():
#     clf = Pipeline([("preprocess", preprocess), ("model", model)])
#     clf.fit(X_train, y_train)
    
#     y_pred = clf.predict(X_test)
#     y_prob = clf.predict_proba(X_test)[:, 1]
    
#     print(f"\n==== {name} ====")
#     print(classification_report(y_test, y_pred, digits=4))
#     print("ROC-AUC:", round(roc_auc_score(y_test, y_prob), 4))


==== Logistic Regression ====
              precision    recall  f1-score   support

           0     0.9961    0.9995    0.9978    553574
           1     0.0000    0.0000    0.0000      2145

    accuracy                         0.9957    555719
   macro avg     0.4981    0.4998    0.4989    555719
weighted avg     0.9923    0.9957    0.9940    555719

ROC-AUC: 0.6053

==== Decision Tree ====
              precision    recall  f1-score   support

           0     0.9983    0.9990    0.9986    553574
           1     0.6775    0.5562    0.6109      2145

    accuracy                         0.9973    555719
   macro avg     0.8379    0.7776    0.8047    555719
weighted avg     0.9970    0.9973    0.9971    555719

ROC-AUC: 0.8503

==== XGBoost ====
              precision    recall  f1-score   support

           0     0.9981    0.9987    0.9984    553574
           1     0.5984    0.5105    0.5509      2145

    accuracy                         0.9968    555719
   macro avg     0.79

In [14]:
# import pandas as pd
# import numpy as np
# from sklearn.metrics import classification_report, roc_auc_score
# from sklearn.preprocessing import LabelEncoder

# # Load data
# train_df = pd.read_csv("fraudTrain.csv", low_memory=False)
# test_df  = pd.read_csv("fraudTest.csv", low_memory=False)

# # Drop unwanted index column if exists
# for df in [train_df, test_df]:
#     df.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

# # Target
# y_train = train_df["is_fraud"]
# X_train = train_df.drop("is_fraud", axis=1)

# y_test = test_df["is_fraud"]
# X_test = test_df.drop("is_fraud", axis=1)

# # Identify categorical columns
# cat_cols = X_train.select_dtypes(include=["object"]).columns.tolist()

# # Label Encode categoricals for both models
# for col in cat_cols:
#     le = LabelEncoder()
#     full_data = pd.concat([X_train[col], X_test[col]], axis=0)  # fit on total
#     le.fit(full_data.astype(str))
#     X_train[col] = le.transform(X_train[col].astype(str))
#     X_test[col] = le.transform(X_test[col].astype(str))

# # Convert to numpy
# X_train_np, y_train_np = X_train.values, y_train.values
# X_test_np, y_test_np = X_test.values, y_test.values

# print("Categorical Columns:", cat_cols)


In [15]:
# from catboost import CatBoostClassifier

# cat_model = CatBoostClassifier(
#     depth=8,
#     learning_rate=0.1,
#     iterations=400,
#     eval_metric="AUC",
#     loss_function="Logloss",
#     verbose=100
# )

# cat_model.fit(X_train, y_train, cat_features=cat_cols)

# # Prediction
# cat_pred = cat_model.predict(X_test)
# cat_proba = cat_model.predict_proba(X_test)[:, 1]

# print("\nðŸ“Œ CatBoost Results\n")
# print(classification_report(y_test, cat_pred, digits=4))
# print("ROC-AUC:", round(roc_auc_score(y_test, cat_proba), 4))


In [None]:
# test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555719 entries, 0 to 555718
Data columns (total 23 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   Unnamed: 0             555719 non-null  int64  
 1   trans_date_trans_time  555719 non-null  object 
 2   cc_num                 555719 non-null  int64  
 3   merchant               555719 non-null  object 
 4   category               555719 non-null  object 
 5   amt                    555719 non-null  float64
 6   first                  555719 non-null  object 
 7   last                   555719 non-null  object 
 8   gender                 555719 non-null  object 
 9   street                 555719 non-null  object 
 10  city                   555719 non-null  object 
 11  state                  555719 non-null  object 
 12  zip                    555719 non-null  int64  
 13  lat                    555719 non-null  float64
 14  long                   555719 non-nu

In [1]:
# =============================================
# âœ… 1. Import Libraries
# =============================================
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

# =============================================
# âœ… 2. Load Data
# =============================================
train_df = pd.read_csv("fraudTrain.csv", low_memory=False)
test_df = pd.read_csv("fraudTest.csv", low_memory=False)

# =============================================
# âœ… 3. Separate Features and Target
# =============================================
y_train = train_df["is_fraud"]
X_train = train_df.drop(columns=["is_fraud", "Unnamed: 0"], errors="ignore")

y_test = test_df["is_fraud"]
X_test = test_df.drop(columns=["is_fraud", "Unnamed: 0"], errors="ignore")

# =============================================
# âœ… 4. Identify Numerical and Categorical Columns
# =============================================
cat_cols = X_train.select_dtypes(include=["object"]).columns
num_cols = X_train.select_dtypes(include=["int64", "float64"]).columns

# =============================================
# âœ… 5. Create Preprocessing Pipeline
# =============================================
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
    ]
)

# =============================================
# âœ… 6. Create Decision Tree Pipeline
# =============================================
dt_model = DecisionTreeClassifier(max_depth=10, random_state=42)

pipeline = Pipeline([
    ("preprocess", preprocess),
    ("model", dt_model)
])

# =============================================
# âœ… 7. Train the Model
# =============================================
pipeline.fit(X_train, y_train)

# =============================================
# âœ… 8. Save the Full Pipeline
# =============================================
joblib.dump(pipeline, "fraud_model.pkl")

print("âœ… Decision Tree model (with preprocessing) saved as fraud_model.pkl")


âœ… Decision Tree model (with preprocessing) saved as fraud_model.pkl


In [None]:
# import pandas as pd
# import joblib
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder, StandardScaler
# from sklearn.pipeline import Pipeline
# from sklearn.tree import DecisionTreeClassifier

# # âœ… Load data
# train_df = pd.read_csv("fraudTrain.csv", low_memory=False)
# test_df = pd.read_csv("fraudTest.csv", low_memory=False)

# # âœ… Use only selected columns
# selected_cols = ['first', 'last', 'gender', 'street', 'city', 'state', 'zip', 
#                  'lat', 'long', 'city_pop', 'job', 'dob']

# X_train = train_df[selected_cols]
# y_train = train_df["is_fraud"]

# X_test = test_df[selected_cols]
# y_test = test_df["is_fraud"]

# # âœ… Define column types
# cat_cols = ['first', 'last', 'gender', 'street', 'city', 'state', 'job', 'dob']
# num_cols = ['zip', 'lat', 'long', 'city_pop']

# # âœ… Create preprocessing pipeline
# preprocess = ColumnTransformer([
#     ("num", StandardScaler(), num_cols),
#     ("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols)
# ])

# # âœ… Build Decision Tree pipeline
# dt_pipeline = Pipeline([
#     ("preprocess", preprocess),
#     ("model", DecisionTreeClassifier(max_depth=10, random_state=42))
# ])

# # âœ… Train model
# dt_pipeline.fit(X_train, y_train)

# # âœ… Save model
# joblib.dump(dt_pipeline, "fraud_model.pkl")
# print("âœ… Simplified Decision Tree model saved successfully!")


âœ… Simplified Decision Tree model saved successfully!
