# Project: Clustering (Gaussian Mixture Model (GMM))
## Name: Melody Goldanloo

In [37]:
#Example of supress warnings for Numpy version out of range (optional)
import warnings
warnings.filterwarnings("ignore", category=Warning)
warnings.simplefilter(action='ignore', category=FutureWarning)

#Some recommended libraries
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, silhouette_score
import matplotlib.pyplot as plt

# The Dataset

In [38]:
# Read the data and add some error handling for file not found
try:
    df = pd.read_csv("creditcard.csv")
except pd.errors.ParserError as e:
    print(f"Error reading file: {e}")
    
#For testing, can use a subset to speed things up
print(df.shape)
#df = df[:1000] or df = df[:10000]

df.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


# Data Preprocessing
**Loading Data**: Load the Credit Card Fraud Detection dataset using Pandas. Review the data using .info(), .describe(), etc. 

**Handling Missing Values**: Identify and handle any missing values in the dataset (e.g., using mean or median imputation). This is KEY, the GMM model cannot accept data with missing data points.

**Feature Scaling**: Standardize the features to have a mean of 0 and a standard deviation of 1 for consistent training.

**Separate the Target**: The target of this dataset is “Class”, please ensure you remove it from the dataset.

**Train-Test Split**: Split the dataset into training and testing sets to evaluate the model's performance on unseen data.

### Exploration

In [39]:
# Exploration
df.info()
# all columns are floats except the column "Class" which has int types.
df.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,...,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.859575,1.168375e-15,3.416908e-16,-1.379537e-15,2.074095e-15,9.604066e-16,1.487313e-15,-5.556467e-16,1.213481e-16,-2.406331e-15,...,1.654067e-16,-3.568593e-16,2.578648e-16,4.473266e-15,5.340915e-16,1.683437e-15,-3.660091e-16,-1.22739e-16,88.349619,0.001727
std,47488.145955,1.958696,1.651309,1.516255,1.415869,1.380247,1.332271,1.237094,1.194353,1.098632,...,0.734524,0.7257016,0.6244603,0.6056471,0.5212781,0.482227,0.4036325,0.3300833,250.120109,0.041527
min,0.0,-56.40751,-72.71573,-48.32559,-5.683171,-113.7433,-26.16051,-43.55724,-73.21672,-13.43407,...,-34.83038,-10.93314,-44.80774,-2.836627,-10.2954,-2.604551,-22.56568,-15.43008,0.0,0.0
25%,54201.5,-0.9203734,-0.5985499,-0.8903648,-0.8486401,-0.6915971,-0.7682956,-0.5540759,-0.2086297,-0.6430976,...,-0.2283949,-0.5423504,-0.1618463,-0.3545861,-0.3171451,-0.3269839,-0.07083953,-0.05295979,5.6,0.0
50%,84692.0,0.0181088,0.06548556,0.1798463,-0.01984653,-0.05433583,-0.2741871,0.04010308,0.02235804,-0.05142873,...,-0.02945017,0.006781943,-0.01119293,0.04097606,0.0165935,-0.05213911,0.001342146,0.01124383,22.0,0.0
75%,139320.5,1.315642,0.8037239,1.027196,0.7433413,0.6119264,0.3985649,0.5704361,0.3273459,0.597139,...,0.1863772,0.5285536,0.1476421,0.4395266,0.3507156,0.2409522,0.09104512,0.07827995,77.165,0.0
max,172792.0,2.45493,22.05773,9.382558,16.87534,34.80167,73.30163,120.5895,20.00721,15.59499,...,27.20284,10.50309,22.52841,4.584549,7.519589,3.517346,31.6122,33.84781,25691.16,1.0


### Handling Missing Values

In [40]:
# Check for missing values
print("Sum of null values:\n", df.isnull().sum())
# no missing values!


## Train-Test Split
# The instructions for the project ask to split the data into training and testing sets, however the GMM
# example says not to do it, since it is unsupervised, so I am not going to do it.

Sum of null values:
 Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64


### Standardizing & Separating Target Variable

In [41]:
# First, separate target variable (Class)
X = df.drop(columns=["Class"]).copy()
scaler = StandardScaler()

# Now, standardize X
X_scaled = scaler.fit_transform(X)

# Building the GMM Model
**Model Initialization**:
- Number of Components: Decide on the number of Gaussian components (clusters) to use.
- Covariance Type: Choose the type of covariance matrix (e.g., full, tied, diagonal, spherical).

**Model Fitting**: Fit the GMM to the features of the real-world dataset for anomaly detection.

In [76]:
#Building the model
#gmm = GaussianMixture(n_components=2, covariance_type='full', random_state=42)
#gmm_labels = gmm.fit_predict(X_scaled)
    # Accuracy score of 0.368568
    # Precision score of 0.000284208
    # Recall of 0.103659

# 'tied' covariance type
#gmm_tied = GaussianMixture(n_components=4, covariance_type='tied', random_state=42)
# gmm_labels_tied = gmm_tied.fit_predict(X_scaled)
    # Accuracy score of 0.368569
    # Precision: 0.0002842081

# Using 'diag' covariance type
#gmm_diag = GaussianMixture(n_components=4, covariance_type='diag', random_state=42)
#gmm_labels_diag = gmm_diag.fit_predict(X_scaled)
    # Accuracy: 0.20713

# Using 'spherical' covariance type
gmm_spherical = GaussianMixture(n_components=4, covariance_type='spherical', random_state=42)
gmm_labels_spherical = gmm_spherical.fit_predict(X_scaled)
    # Accuracy score of 0.414418
    # Highest Score on both components=4 and spherical type

# Evaluating the Model
**Performance Metrics**: Evaluate the anomaly detection performance using metrics such as accuracy, precision, recall, and confusion matrix.

In [77]:
y_true = df['Class'].values  # Extract true labels

accuracy = accuracy_score(y_true, gmm_labels_spherical)
cm = confusion_matrix(y_true, gmm_labels_spherical)
#precision = precision_score(y_true, gmm_labels_spherical)
#recall = recall_score(y_true, gmm_labels)
#gmm_silhouette = silhouette_score(X_scaled, gmm_labels_spherical) (not working for me)

print("Accuracy:", accuracy)
print("Confusion Matrix:\n", cm)
#print("Precision:", precision)
#print("Recall:", recall)

Accuracy: 0.4144174827163658
Confusion Matrix:
 [[117971 137847  18393  10104]
 [    12     58      6    416]
 [     0      0      0      0]
 [     0      0      0      0]]


## For the Model Selection Project, you will STOP HERE! 
During Units 4, 5, and 6, we will explore and learn additional techniques, and then revisit these projects to apply the below:
- Model evaluation and parameter tuning
- Explanatory visualizations and package your results with data storytelling

# Tuning Model Parameters (Completed in Unit 4)

In [None]:
#Insert Code Here

# Evaluating the Tuned Model (Completed in Unit 4)

In [None]:
#Insert Code Here

# Visualizing Results (Completed in Units 4 and 6)

In [None]:
#Insert Code Here