## **Step 1: Import Libraries**

In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# **Step 2: Import and print the dataset**

In [58]:
# Load the transaction records from CSV into a DataFrame, then preview the first rows.
csv_path = "/content/credit card fraud dataset.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud
0,1/1/2019 0:00,"Heller, Gutmann and Zieme",grocery_pos,107.23,Orient,WA,48.8878,-118.2105,149,Special educational needs teacher,6/21/1978,1f76529f8574734946361c461b024d99,49.159047,-118.186462,0
1,1/1/2019 0:00,Lind-Buckridge,entertainment,220.11,Malad City,ID,42.1808,-112.262,4154,Nature conservation officer,1/19/1962,a1a22d70485983eac12b5b88dad1cf95,43.150704,-112.154481,0
2,1/1/2019 0:07,Kiehn Inc,grocery_pos,96.29,Grenada,CA,41.6125,-122.5258,589,Systems analyst,12/21/1945,413636e759663f264aae1819a4d4f231,41.65752,-122.230347,0
3,1/1/2019 0:09,Beier-Hyatt,shopping_pos,7.77,High Rolls Mountain Park,NM,32.9396,-105.8189,899,Naval architect,8/30/1967,8a6293af5ed278dea14448ded2685fea,32.863258,-106.520205,0
4,1/1/2019 0:21,Bruen-Yost,misc_pos,6.85,Freedom,WY,43.0172,-111.0292,471,"Education officer, museum",8/2/1967,f3c43d336e92a44fc2fb67058d5949e3,43.753735,-111.454923,0


## **Step 3: Get the dataset information**

In [38]:
# Summarise the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339607 entries, 0 to 339606
Data columns (total 15 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   trans_date_trans_time  339607 non-null  object 
 1   merchant               339607 non-null  object 
 2   category               339607 non-null  object 
 3   amt                    339607 non-null  float64
 4   city                   339607 non-null  object 
 5   state                  339607 non-null  object 
 6   lat                    339607 non-null  float64
 7   long                   339607 non-null  float64
 8   city_pop               339607 non-null  int64  
 9   job                    339607 non-null  object 
 10  dob                    339607 non-null  object 
 11  trans_num              339607 non-null  object 
 12  merch_lat              339607 non-null  float64
 13  merch_long             339607 non-null  float64
 14  is_fraud               339607 non-nu

In [39]:
# Preview the first rows of the frame again (same call as in Step 2).
df.head()

Unnamed: 0,trans_date_trans_time,merchant,category,amt,city,state,lat,long,city_pop,job,dob,trans_num,merch_lat,merch_long,is_fraud
0,1/1/2019 0:00,"Heller, Gutmann and Zieme",grocery_pos,107.23,Orient,WA,48.8878,-118.2105,149,Special educational needs teacher,6/21/1978,1f76529f8574734946361c461b024d99,49.159047,-118.186462,0
1,1/1/2019 0:00,Lind-Buckridge,entertainment,220.11,Malad City,ID,42.1808,-112.262,4154,Nature conservation officer,1/19/1962,a1a22d70485983eac12b5b88dad1cf95,43.150704,-112.154481,0
2,1/1/2019 0:07,Kiehn Inc,grocery_pos,96.29,Grenada,CA,41.6125,-122.5258,589,Systems analyst,12/21/1945,413636e759663f264aae1819a4d4f231,41.65752,-122.230347,0
3,1/1/2019 0:09,Beier-Hyatt,shopping_pos,7.77,High Rolls Mountain Park,NM,32.9396,-105.8189,899,Naval architect,8/30/1967,8a6293af5ed278dea14448ded2685fea,32.863258,-106.520205,0
4,1/1/2019 0:21,Bruen-Yost,misc_pos,6.85,Freedom,WY,43.0172,-111.0292,471,"Education officer, museum",8/2/1967,f3c43d336e92a44fc2fb67058d5949e3,43.753735,-111.454923,0


# **Step 4: Feature Engineering**

In [40]:
# Columns excluded from modelling, kept as a named list so the choice is easy to revisit.
less_important_columns = ['job', 'dob', 'trans_num', 'merch_lat', 'merch_long']

# Build the reduced frame under a new name instead of overwriting `df`:
# the cell can then be re-run without a KeyError from dropping columns that
# are already gone, and the frame as loaded stays available for inspection.
df_reduced = df.drop(columns=less_important_columns)

# Separate into features (X) and target variable (y)
X = df_reduced.drop(columns='is_fraud')
y = df_reduced['is_fraud']


In [41]:
# Check which columns and dtypes X holds after the target column was separated out.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339607 entries, 0 to 339606
Data columns (total 9 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   trans_date_trans_time  339607 non-null  object 
 1   merchant               339607 non-null  object 
 2   category               339607 non-null  object 
 3   amt                    339607 non-null  float64
 4   city                   339607 non-null  object 
 5   state                  339607 non-null  object 
 6   lat                    339607 non-null  float64
 7   long                   339607 non-null  float64
 8   city_pop               339607 non-null  int64  
dtypes: float64(3), int64(1), object(5)
memory usage: 23.3+ MB


In [43]:
# Replace the raw transaction timestamp with numeric calendar features.
# The guard keeps the cell re-runnable: once the timestamp column has been
# replaced there is nothing left to convert, so a second run is a no-op
# instead of a KeyError.
if 'trans_date_trans_time' in X.columns:
    # Explicit month/day/year layout, matching values such as "1/1/2019 0:21"
    # in the preview; this avoids per-row format inference and any day/month
    # ambiguity.
    # NOTE(review): assumes every row uses this layout - to_datetime raises if not.
    trans_time = pd.to_datetime(X['trans_date_trans_time'], format='%m/%d/%Y %H:%M')

    # Extract relevant features from timestamp, then drop the original column.
    # assign() returns a new frame, so X is rebound rather than mutated in place.
    X = X.assign(
        hour=trans_time.dt.hour,
        day=trans_time.dt.day,
        month=trans_time.dt.month,
        year=trans_time.dt.year,
    ).drop(columns='trans_date_trans_time')


In [47]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode each categorical column with its own encoder.
# Refitting a single shared LabelEncoder discards the category-to-code mapping
# of every column but the last; keeping one fitted encoder per column lets the
# codes be decoded (inverse_transform) or applied to new data later.
label_encoders = {}
for column in ['merchant', 'category', 'city', 'state']:
    # `label_encoder` ends up bound to the 'state' encoder, as it was before.
    label_encoder = LabelEncoder()
    X[column] = label_encoder.fit_transform(X[column])
    label_encoders[column] = label_encoder


In [51]:
# Now view cleaned data
# DataFrame.info() writes its summary itself and returns None, so it is called
# directly; wrapping it in print() emitted a stray "None" line after the summary.
X.info()
X.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339607 entries, 0 to 339606
Data columns (total 12 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   merchant  339607 non-null  int64  
 1   category  339607 non-null  int64  
 2   amt       339607 non-null  float64
 3   city      339607 non-null  int64  
 4   state     339607 non-null  int64  
 5   lat       339607 non-null  float64
 6   long      339607 non-null  float64
 7   city_pop  339607 non-null  int64  
 8   hour      339607 non-null  int64  
 9   day       339607 non-null  int64  
 10  month     339607 non-null  int64  
 11  year      339607 non-null  int64  
dtypes: float64(3), int64(9)
memory usage: 31.1 MB
None


Unnamed: 0,merchant,category,amt,city,state,lat,long,city_pop,hour,day,month,year
0,241,4,107.23,116,11,48.8878,-118.2105,149,0,1,1,2019
1,390,0,220.11,93,5,42.1808,-112.262,4154,0,1,1,2019
2,308,4,96.29,51,2,41.6125,-122.5258,589,0,1,1,2019
3,42,12,7.77,55,8,32.9396,-105.8189,899,0,1,1,2019
4,82,9,6.85,46,12,43.0172,-111.0292,471,0,1,1,2019


# **Step 5: Model Creation, Training, Testing and Accuracy Calculation**

In [53]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
from sklearn.model_selection import train_test_split

split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [54]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits so no test-set information is used.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)


In [55]:
from sklearn.ensemble import RandomForestClassifier

# 100-tree random forest; random_state is fixed so the run is reproducible.
# n_jobs=-1 builds the trees on all available CPU cores instead of a single
# one, which only affects training time - the seed still controls the trees.
model = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
model.fit(X_train, y_train)


In [56]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict labels for the held-out rows, then summarise the errors per class.
y_pred = model.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)


Confusion Matrix:
 [[67539    16]
 [  143   224]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     67555
           1       0.93      0.61      0.74       367

    accuracy                           1.00     67922
   macro avg       0.97      0.81      0.87     67922
weighted avg       1.00      1.00      1.00     67922



In [57]:
from sklearn.metrics import accuracy_score

# Share of test rows whose predicted label equals the true label.
# Accuracy is dominated by the much larger non-fraud class, so read it together
# with the per-class recall in the classification report.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

message = "Model Accuracy: {:.2%}".format(accuracy)
print(message)


Model Accuracy: 99.77%
