# Machine Learning Operations - Data Drift

This notebook was developed by:

- Bruna Simões (20240491)
- Daniel Caridade (20211588)
- Leonardo Di Caterina (20240485)
- Marco Galão (r20201545)

# 1. Library Imports

__`Step 1`__ Import the necessary libraries.

In [1]:
from utils import *

ModuleNotFoundError: No module named 'scipy'

In [None]:
# Helper snippet from the GitHub repository.
# NOTE(review): `Path` and `sys` are not imported explicitly in this notebook;
# presumably they are provided by `from utils import *` — confirm.

# Path of project root: the directory two levels above the current working directory
project_root = Path().resolve().parents[1]

# Add the project root directory to Python's module search path.
# Guarded so that re-running this cell does not append duplicate entries.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

# 2. Data Integration 

__`Step 2`__ Import the dataset into the notebook, ignoring the first column, as it does not contain relevant information.

In [None]:
# Load the raw dataset from a path relative to the notebook's working directory.
# NOTE(review): the Step 2 text says the first column is ignored, but this call
# passes no `index_col`/`usecols`, so every CSV column is loaded — confirm intent.
data = pd.read_csv('../data/01_raw/data_v1.csv')
# Bare last expression: the DataFrame is rendered as the cell output
data

Unnamed: 0,trans_date_trans_time,cc_num,merchant,category,amt,first,last,gender,street,city,...,long,city_pop,job,dob,trans_num,unix_time,merch_lat,merch_long,is_fraud,merch_zipcode
0,2019-05-04 11:57:04,4586260469584,fraud_Kerluke Inc,misc_net,5.96,Melody,Thompson,F,0362 Anderson Wall,Mound City,...,-95.2138,1631,Architect,1953-01-20,3d21bce7967838c3988cfe0f7fca878a,1336132624,41.024651,-94.428240,0,50842.0
1,2019-12-14 08:55:21,4900628639996,fraud_Rempel PLC,grocery_net,70.66,Michael,Johnson,M,094 Owens Underpass,Norwalk,...,-118.0818,105549,Firefighter,1973-09-22,fda7712b4bbcaab36afded37ab55047f,1355475321,33.808771,-118.031888,0,90630.0
2,2019-03-30 05:21:33,676118385837,fraud_Rodriguez Group,gas_transport,50.92,Katelyn,Wise,F,674 Maureen Summit Apt. 276,Scotts Mills,...,-122.6187,1252,"Engineer, petroleum",1937-02-06,59161e0002642934974c1ae98bfa1f55,1333084893,44.561034,-123.281803,0,97331.0
3,2019-09-19 07:09:46,3596357274378601,fraud_Doyle Ltd,grocery_pos,71.68,David,Everett,M,4138 David Fall,Morrisdale,...,-78.2357,3688,Advice worker,1973-05-27,f487a7098c0bd4d45f710be1745c4acb,1348038586,41.612825,-78.316893,0,15834.0
4,2019-02-04 20:37:44,6011542681743618,fraud_Leffler-Goldner,personal_care,29.17,Emily,Hall,F,8851 Reese Neck,Basye,...,-78.7776,863,"Engineer, mining",1972-08-09,c2ed76f03cce8a6b362729a5a23f01c2,1328387864,39.387521,-79.674956,0,26444.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90762,2019-05-03 01:50:55,4861310130652566408,fraud_Thiel PLC,misc_pos,410.37,Ashley,Cruz,F,65417 Walsh Radial Suite 691,Saint Amant,...,-90.8435,10076,"Surveyor, rural practice",1977-12-16,14b94dc8026970fae77db1deb747f35f,1336009855,30.152101,-90.420021,0,70068.0
90763,2020-03-26 22:03:48,4560004149983868183,"fraud_Kihn, Brakus and Goyette",personal_care,38.94,Stacy,Villegas,F,20581 Pena Walks,Colorado Springs,...,-104.6556,525713,Museum/gallery exhibitions officer,1992-05-09,f93f924d08a19064ec37de7a69cf8e4b,1364335428,38.648246,-104.969889,0,80926.0
90764,2019-02-16 14:20:54,378904938837132,fraud_Gerhold LLC,home,29.43,Tina,Zimmerman,F,3595 Susan Island Suite 063,Thomas,...,-98.7388,1675,Barrister,1986-05-01,12177fa4ce00fc361366c28e0af96b81,1329402054,35.110496,-98.246154,0,73005.0
90765,2019-06-24 14:53:14,4155021259183870,fraud_Stark-Batz,entertainment,3.49,Renee,Parrish,F,174 Jennifer Meadow Apt. 467,Mountain Park,...,-98.9591,540,Research scientist (life sciences),1983-10-12,4a248f9b8268ba53241625b4af2a271c,1340549594,35.135938,-98.112255,0,73092.0


# 3. Data Drift


In [None]:
import numpy as np

def calculate_psi(expected, actual, buckettype='bins', buckets=10, axis=0):
    '''Calculate the PSI (population stability index) for one or more variables.

    For each variable, bucket edges are derived from the expected sample, both
    samples are binned with those edges, and the PSI is
    sum((e - a) * ln(e / a)) over buckets, where e and a are the shares of
    expected / actual observations in a bucket.

    Args:
       expected: array-like of original (reference) values; 1-D for a single
           variable or 2-D for several variables
       actual: array-like of new values; must hold the same number of variables
           as expected (the number of observations may differ)
       buckettype: 'bins' for equal-width buckets over the expected range,
           'quantiles' for buckets at evenly spaced percentiles of expected
       buckets: number of buckets (>= 1)
       axis: for 2-D input, 0 if each column is a variable, 1 if each row is a
           variable; ignored for 1-D input
    Returns:
       psi_values: ndarray with one PSI value per variable
    Raises:
       ValueError: on an unknown buckettype, buckets < 1, an invalid axis,
           mismatched variable counts, input with more than two dimensions,
           or an empty sample
    Notes:
       NaN values are not filtered here; drop or impute them before calling.
    Author:
       Matthew Burke
       github.com/mwburke
       worksofchart.com
    '''
    # Fail loudly instead of silently bucketing on meaningless edges.
    if buckettype not in ('bins', 'quantiles'):
        raise ValueError(f"buckettype must be 'bins' or 'quantiles', got {buckettype!r}")
    if buckets < 1:
        raise ValueError(f"buckets must be >= 1, got {buckets!r}")

    expected = np.asarray(expected, dtype=float)
    actual = np.asarray(actual, dtype=float)

    def psi(expected_array, actual_array, buckets):
        '''Calculate the PSI for a single variable
        Args:
           expected_array: 1-D numpy array of original values
           actual_array: 1-D numpy array of new values
           buckets: number of buckets to split the values into
        Returns:
           psi_value: calculated PSI value
        '''
        if expected_array.size == 0 or actual_array.size == 0:
            raise ValueError("expected and actual must each contain at least one value")

        if buckettype == 'bins':
            # linspace hits min and max exactly, so the largest expected value
            # always lands in the last bucket
            breakpoints = np.linspace(np.min(expected_array), np.max(expected_array), buckets + 1)
        else:
            breakpoints = np.percentile(expected_array, np.linspace(0, 100, buckets + 1))

        # Widen the outer edges so actual values outside the expected range are
        # counted in the first / last bucket instead of being dropped by
        # np.histogram (which would make the actual shares sum to less than 1).
        breakpoints[0] = min(breakpoints[0], np.min(actual_array))
        breakpoints[-1] = max(breakpoints[-1], np.max(actual_array))

        expected_percents = np.histogram(expected_array, breakpoints)[0] / expected_array.size
        actual_percents = np.histogram(actual_array, breakpoints)[0] / actual_array.size

        # Floor empty buckets at 0.0001 to avoid log(0) and division by zero.
        expected_percents = np.maximum(expected_percents, 0.0001)
        actual_percents = np.maximum(actual_percents, 0.0001)

        return float(np.sum((expected_percents - actual_percents)
                            * np.log(expected_percents / actual_percents)))

    # Single variable: return a one-element array, as for the 2-D case.
    if expected.ndim == 1:
        return np.array([psi(expected, actual.ravel(), buckets)])

    if expected.ndim != 2 or actual.ndim != 2:
        raise ValueError("expected and actual must both be 1-D or both be 2-D")
    if axis not in (0, 1):
        raise ValueError(f"axis must be 0 or 1, got {axis!r}")

    # Iterate variable by variable: columns when axis == 0, rows when axis == 1.
    if axis == 0:
        expected_vars, actual_vars = expected.T, actual.T
    else:
        expected_vars, actual_vars = expected, actual

    if len(expected_vars) != len(actual_vars):
        raise ValueError("expected and actual must contain the same number of variables")

    return np.array([psi(e_var, a_var, buckets)
                     for e_var, a_var in zip(expected_vars, actual_vars)], dtype=float)


In [None]:
def split_random(df, frac=0.8, random_state=200):
    '''Randomly split a DataFrame into a reference set and an analysis set.

    Args:
       df: pandas DataFrame to split
       frac: fraction of rows sampled into the reference set (default 0.8)
       random_state: seed passed to DataFrame.sample so the split is
           reproducible (default 200)
    Returns:
       (ref_data, ana_data): the sampled rows, and the remaining rows of df
    Notes:
       The remainder is selected by dropping the sampled index labels, so df
       is expected to have a unique index; with duplicate labels, every row
       sharing a sampled label is removed from ana_data.
    '''
    ref_data = df.sample(frac=frac, random_state=random_state)

    # Everything that was not sampled becomes the analysis set.
    ana_data = df.drop(ref_data.index)

    return ref_data, ana_data

In [None]:
# Split the loaded dataset into a reference set and an analysis set
# (random, seeded split performed by split_random)
ref_data, ana_data = split_random(data)

In [None]:
# Numeric columns to check for drift; `numeric_cols` is reused by later cells
numeric_cols = ['amt', 'lat', 'long', 'city_pop', 'unix_time', 'merch_lat', 'merch_long']

# PSI of the analysis set against the reference set, one value per column.
# `.values` yields 2-D arrays with one column per variable, hence axis=0.
psi_scores = calculate_psi(
    expected=ref_data[numeric_cols].values,
    actual=ana_data[numeric_cols].values,
    buckettype='quantiles',  # or 'bins'
    buckets=10,
    axis=0
)

# Pair scores with column names so the printed output is labelled
psi_results = pd.Series(psi_scores, index=numeric_cols)
print("PSI Scores:\n", psi_results)

PSI Scores:
 amt           0.000428
lat           0.000597
long          0.001278
city_pop      0.000425
unix_time     0.000314
merch_lat     0.000646
merch_long    0.000935
dtype: float64


In [12]:
# Univariate drift check on categorical columns using NannyML.
import nannyml as nml

# Columns analysed for drift; all of them are passed as categorical below
categorical_cols = ['job', 'category', 'gender']
# Passed to the calculator as `chunk_size`
# (presumably the number of rows per chunk — confirm against the NannyML docs)
chunk_size = 50

# Set thresholds (optional - adjust the threshold value if needed)
# Constant threshold with an upper bound of 0.2 and no lower bound
threshold = nml.thresholds.ConstantThreshold(lower=None, upper=0.2)

# Jensen-Shannon is the only categorical method requested; the threshold above
# is attached to it through the `thresholds` mapping
calc = nml.UnivariateDriftCalculator(
    column_names=categorical_cols,
    treat_as_categorical=categorical_cols,
    chunk_size=chunk_size,
    categorical_methods=['jensen_shannon'],
    thresholds={"jensen_shannon": threshold}
)

# Fit on the reference set, then evaluate the analysis set
calc.fit(ref_data)
result = calc.calculate(ana_data)
# Keep only the analysis-period results as a DataFrame
drift_df = result.filter(period='analysis').to_df()

# Save plot
# The drift plot is written as an HTML file into the current working directory
plot = result.filter(period='analysis').plot(kind='drift')
plot.write_html("univariate_drift_nml.html")

ImportError: DLL load failed while importing _smoothers_lowess: The filename or extension is too long.

In [None]:
# Data drift report using Evidently.
# Requires the `evidently` package (e.g. `uv pip install evidently`).
# NOTE(review): the `evidently.report` / `evidently.metric_preset` import paths
# presumably depend on the installed Evidently version — confirm and pin it.
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

# Report made of a single metric preset: DataDriftPreset
report = Report(metrics=[
    DataDriftPreset()
])

# Compare the analysis set (current) against the reference set.
# column_mapping=None: no explicit column mapping is supplied; the full
# DataFrames are passed in, not only the numeric/categorical subsets used above.
report.run(
    reference_data=ref_data,
    current_data=ana_data,
    column_mapping=None
)

# Write the report as an HTML file into the current working directory
report.save_html("data_drift_evidently.html")


In [None]:
# PSI between folds of the reference set only: each fold's held-out part is
# compared against the remaining part of `ref_data`.
from sklearn.model_selection import KFold

# 5 shuffled folds with a fixed seed for reproducibility
kf = KFold(n_splits=5, shuffle=True, random_state=42)

for i, (train_idx, test_idx) in enumerate(kf.split(ref_data)):
    # Both parts come from ref_data: the train indices act as the "expected"
    # sample, the test indices as the "actual" sample
    ref_fold = ref_data.iloc[train_idx]
    ana_fold = ref_data.iloc[test_idx]

    # NOTE(review): this call relies on calculate_psi's defaults
    # (buckettype='bins'), whereas the reference-vs-analysis cell passes
    # buckettype='quantiles' — confirm the two are meant to differ.
    psi_scores = calculate_psi(ref_fold[numeric_cols].values, ana_fold[numeric_cols].values)
    print(f"Fold {i+1} PSI:\n", pd.Series(psi_scores, index=numeric_cols))

In [None]:
# PSI on a 2-component PCA projection of the numeric columns.
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# The scaler is fitted on the reference set only and then applied to the
# analysis set, so both sets are scaled with the same parameters
scaler = StandardScaler()
ref_scaled = scaler.fit_transform(ref_data[numeric_cols])
ana_scaled = scaler.transform(ana_data[numeric_cols])

# Likewise, PCA is fitted on the scaled reference set and reused for the analysis set
pca = PCA(n_components=2)
ref_pca = pca.fit_transform(ref_scaled)
ana_pca = pca.transform(ana_scaled)

# Use PSI on PCA features
# Called with calculate_psi's defaults (buckettype='bins', buckets=10, axis=0),
# giving one PSI value per principal component
psi_pca = calculate_psi(ref_pca, ana_pca)
print("PSI on PCA Components:", psi_pca)