# **1. Import libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# **2. Load dataset**
The creditcard.csv dataset contains details of 284,807 transactions out of which 492 are fraudulent transactions.

The dataset is highly imbalanced: the positive class (frauds) accounts for only 0.172% of all transactions.

* The data has 31 columns: V1-V28, Time, Amount and Class

* Input features: V1-V28, Time and Amount

* Target variable: Class

In [None]:
# Read the credit-card transactions CSV into a DataFrame.
# NOTE(review): the path is absolute and looks Colab-specific — adjust it when
# running the notebook elsewhere.
df = pd.read_csv(filepath_or_buffer='/content/sample_data/creditcard.csv')

Display the top 15 rows of the dataframe

In [None]:
# Preview the first 15 rows of the raw data (rendered as the cell output).
df.head(n=15)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0
5,2.0,-0.425966,0.960523,1.141109,-0.168252,0.420987,-0.029728,0.476201,0.260314,-0.568671,...,-0.208254,-0.559825,-0.026398,-0.371427,-0.232794,0.105915,0.253844,0.08108,3.67,0
6,4.0,1.229658,0.141004,0.045371,1.202613,0.191881,0.272708,-0.005159,0.081213,0.46496,...,-0.167716,-0.27071,-0.154104,-0.780055,0.750137,-0.257237,0.034507,0.005168,4.99,0
7,7.0,-0.644269,1.417964,1.07438,-0.492199,0.948934,0.428118,1.120631,-3.807864,0.615375,...,1.943465,-1.015455,0.057504,-0.649709,-0.415267,-0.051634,-1.206921,-1.085339,40.8,0
8,7.0,-0.894286,0.286157,-0.113192,-0.271526,2.669599,3.721818,0.370145,0.851084,-0.392048,...,-0.073425,-0.268092,-0.204233,1.011592,0.373205,-0.384157,0.011747,0.142404,93.2,0
9,9.0,-0.338262,1.119593,1.044367,-0.222187,0.499361,-0.246761,0.651583,0.069539,-0.736727,...,-0.246914,-0.633753,-0.120794,-0.38505,-0.069733,0.094199,0.246219,0.083076,3.68,0


# **3. Split data into train and test sets**
Split the data into training and test sets using the train_test_split function. Specify X as the input features, y as the target variable, set test_size to 0.2 for an 80/20 split, and use random_state for reproducibility.

In [None]:
from sklearn.model_selection import train_test_split

# Separate the input features (every column except the label) from the target.
X = df.drop(columns=['Class'])
y = df['Class']

# Hold out 20% of the rows for testing. `stratify=y` keeps the fraud/non-fraud
# ratio the same in both splits; with so few positive samples, a purely random
# split can leave the test set with an unrepresentative share of frauds.
# The split sizes are still determined by `test_size` alone.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Sanity check: the split must neither drop nor duplicate rows.
assert len(X_train) + len(X_test) == len(df)
assert len(y_train) + len(y_test) == len(df)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


X_train shape: (227845, 30)
X_test shape: (56962, 30)
y_train shape: (227845,)
y_test shape: (56962,)


# **4. Dealing with Imbalanced data**
How SMOTE Works

1. **Identify the minority class:** SMOTE identifies the instances that belong to the minority class.
2. **Choose neighbors:** For each minority-class instance, it selects a number of its nearest neighbors (typically using Euclidean distance).
3. **Generate synthetic samples:** New samples are created by interpolating between the instance and its selected neighbors.

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class using the training split only; the test split
# is not passed to SMOTE, so it keeps its original class distribution.
# NOTE(review): the features are still unscaled at this point (scaling happens
# in a later cell), so the nearest-neighbour search is presumably dominated by
# the large-magnitude columns — consider scaling before resampling.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X=X_train, y=y_train)

# Report the shapes after resampling.
print("X_resampled shape:", X_resampled.shape)
print("y_resampled shape:", y_resampled.shape)

X_resampled shape: (454902, 30)
y_resampled shape: (454902,)


# **5. Scaling**
Scaling is a crucial preprocessing step in machine learning and data analysis. It helps improve model performance, ensures that features contribute equally, and leads to more reliable and interpretable models.

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the per-feature scaling statistics from the resampled training data
# only, so nothing from the test split influences the fit.
scaler = StandardScaler().fit(X_resampled)

# Apply the same fitted transformation to both the training and test features.
X_resampled_scaled = scaler.transform(X_resampled)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Rebuild a labelled DataFrame from the scaled feature matrix: restore the
# original feature column names, then attach the resampled target as `Class`.
processed_df = pd.DataFrame(
    data=X_resampled_scaled,
    columns=X.columns,
).assign(Class=y_resampled)

# Persist the processed training data without the row index.
processed_df.to_csv('processed_data.csv', index=False)

# Preview the first 15 processed rows (rendered as the cell output).
processed_df.head(n=15)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,1.159997,0.830117,-0.651074,0.546277,-0.637192,0.262856,0.388831,0.402723,-0.075962,1.079798,...,-0.060682,0.930435,0.080521,-0.419775,-0.099788,-0.556014,-0.047975,-0.226661,-0.382025,0
1,0.618522,0.368526,-0.723447,0.865849,-1.390147,0.365099,0.543696,0.523096,-0.177264,0.617065,...,-0.222274,0.404766,0.141708,-1.760366,-1.93124,0.73591,-0.503865,-0.467334,-0.219953,0
2,1.288504,0.461291,-0.297869,0.504187,-0.970725,0.667589,0.222573,0.666569,-0.057146,0.538746,...,-0.28876,-0.840314,0.111182,0.368843,-0.725205,0.195136,0.128866,0.062779,-0.372998,0
3,0.132396,0.342259,-0.240825,0.899225,0.145044,0.51985,1.05115,0.677759,-0.10312,0.76818,...,-0.132745,0.425662,-0.047433,0.923219,-1.087295,-0.476222,-0.199539,-0.2705,0.102926,0
4,-0.095434,-0.341874,0.01872,0.526667,-0.752552,0.174674,0.148961,0.438567,0.219825,0.843219,...,-0.357267,-0.222333,0.281155,0.217729,0.734922,0.475633,0.217172,-0.905761,-0.31085,0
5,-1.016556,0.291311,-0.226934,0.709588,-0.552398,0.532965,0.50067,0.637804,-0.064904,0.586117,...,-0.14801,0.30808,-0.224433,-1.346116,-0.286261,-0.600118,0.388057,0.550274,-0.245589,0
6,-0.243611,0.677233,-0.458258,0.66236,-0.397309,0.357452,0.183716,0.53649,-0.10523,0.593578,...,-0.113768,0.384256,-0.070968,0.284654,0.970521,-0.673854,-0.057366,-0.058131,-0.359864,0
7,0.84995,0.470492,-0.531209,0.604521,-0.947875,0.79223,1.385527,0.626609,-0.039733,0.720365,...,0.044737,1.797601,-0.054216,-3.42681,-1.024215,1.333539,-0.151859,-0.584177,-0.227175,0
8,0.131051,0.312875,-0.113299,0.628782,0.739315,0.74476,0.55562,0.864081,-0.367271,0.704122,...,-0.274154,0.585133,0.296481,-0.169183,-1.797488,0.193836,-0.676304,-0.20124,-0.15591,0
9,1.162541,0.45208,-0.632363,0.751054,-1.553591,0.256877,-0.081215,0.521672,-0.146218,-0.442146,...,-0.178836,0.364027,-0.073598,0.097975,-0.726998,-0.580436,0.109592,0.325174,-0.381979,0


Download processed data

In [None]:
from google.colab import files

# Trigger a browser download of 'processed_data.csv', the file written by the
# earlier `to_csv` cell.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — this cell will likely fail in a local Jupyter environment.
files.download('processed_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>