# <center>**Pruebas de validación**</center>

# **Módulo Checker**

In [1]:
import pandas as pd
import sys
import os
import rich
from IPython.display import HTML
sys.path.insert(1, os.path.join(sys.path[0], '..'))

from smartdeploy.cl import checker

# Read the inference feature set from disk and preview its first rows.
filename = "inference/X_inference.csv"
X_inference = pd.read_csv(filepath_or_buffer=filename)
X_inference.head(n=5)

Unnamed: 0,specimen_number,eccentricity,aspect_ratio,elongation,solidity,stochastic_convexity,isoperimetric_factor,maximal_indentation_depth,lobedness,average_intensity,average_contrast,smoothness,third_moment,uniformity,entropy
0,1,0.86224,2.0735,0.52269,0.98686,0.99474,0.70529,0.010097,0.018554,0.041404,0.12163,0.014579,0.004869,0.000276,0.9458
1,11,0.52382,1.1117,0.67175,0.54701,0.62982,0.15157,0.13674,3.4028,0.026434,0.085792,0.007306,0.002137,0.000166,0.90513
2,3,0.82866,1.9848,0.50917,0.9418,0.99825,0.55942,0.025524,0.11857,0.080103,0.16692,0.027107,0.008655,0.000427,1.8038
3,4,0.70668,1.251,0.38111,0.94226,0.99825,0.6925,0.019432,0.068724,0.031587,0.11502,0.013056,0.005311,8.6e-05,0.72247
4,4,0.73935,1.5319,0.34987,0.98479,1.0,0.81067,0.007808,0.011095,0.027888,0.11472,0.01299,0.006017,5e-05,0.59895


## Inferir tipo de input
- Tabular: pandas, numpy
- Image: numpy

In [9]:
# Ask the checker which kind of input this frame is and confirm it is tabular.
input_type = checker.infer_input(X_inference)
print(f"Inferred input type: {input_type}")
# Report the actual value on failure so a broken run can be diagnosed from the traceback.
assert input_type == 'tabular', f"expected input type 'tabular', got {input_type!r}"

Inferred input type: tabular


## **Checker de Integridad** 
- Tabular: pandas, numpy
- Image: numpy

In [2]:
# Run the integrity checks with MLflow saving switched off, then render the
# results as an HTML table.
# NOTE(review): an earlier variant of this call passed `data_type=input_type`;
# it is not used here.
results = checker.integrity(X_inference, save_mlflow=False)
df = pd.DataFrame(data=results)
HTML(data=df.to_html())





Unnamed: 0,name,value,description,pass
0,Is Single Value,"{'specimen_number': 10, 'eccentricity': 30, 'aspect_ratio': 30, 'elongation': 30, 'solidity': 30, 'stochastic_convexity': 14, 'isoperimetric_factor': 30, 'maximal_indentation_depth': 30, 'lobedness': 30, 'average_intensity': 29, 'average_contrast': 30, 'smoothness': 30, 'third_moment': 30, 'uniformity': 30, 'entropy': 30}",Check if there are columns which have only a single unique value in all rows.,True
1,Special Characters,"{'specimen_number': 0, 'eccentricity': 0, 'aspect_ratio': 0, 'elongation': 0, 'solidity': 0, 'stochastic_convexity': 0, 'isoperimetric_factor': 0, 'maximal_indentation_depth': 0, 'lobedness': 0, 'average_intensity': 0, 'average_contrast': 0, 'smoothness': 0, 'third_moment': 0, 'uniformity': 0, 'entropy': 0}",Search in column[s] for values that contains only special characters.,True
2,Mixed Nulls,"{'specimen_number': {}, 'eccentricity': {}, 'aspect_ratio': {}, 'elongation': {}, 'solidity': {}, 'stochastic_convexity': {}, 'isoperimetric_factor': {}, 'maximal_indentation_depth': {}, 'lobedness': {}, 'average_intensity': {}, 'average_contrast': {}, 'smoothness': {}, 'third_moment': {}, 'uniformity': {}, 'entropy': {}}","Search for various types of null values, including string representations of null.",True
3,Mixed Data Types,"{'specimen_number': {}, 'eccentricity': {}, 'aspect_ratio': {}, 'elongation': {}, 'solidity': {}, 'stochastic_convexity': {}, 'isoperimetric_factor': {}, 'maximal_indentation_depth': {}, 'lobedness': {}, 'average_intensity': {}, 'average_contrast': {}, 'smoothness': {}, 'third_moment': {}, 'uniformity': {}, 'entropy': {}}",Detect columns which contain a mix of numerical and string values.,True
4,String Mismatch,{},"Detect different variants of string categories (e.g. ""mislabeled"" vs ""mis-labeled"") in a categorical column.",True
5,Data Duplicates,0.0,Checks for duplicate samples in the dataset.,True
6,String Length Out Of Bounds,{},"Detect strings with length that is much longer/shorter than the identified ""normal"" string lengths.",True


## ¿Pasa todos los tests de integridad?

In [3]:
# True only when every row of the "pass" column is truthy.
all(outcome for outcome in df["pass"])

True

## **Checker de Drift**

In [27]:
# Remove the 'specimen_number' column from both frames before the drift check.
# errors="ignore" keeps this a no-op when the column is already gone, so the
# cell can be re-run safely.
X_inference = X_inference.drop(columns="specimen_number", errors="ignore")
X_train = pd.read_csv("../smartdeploy/training/X_train.csv").drop(
    columns="specimen_number", errors="ignore"
)

In [28]:
# Run the drift check on the inference frame against the training frame and
# pretty-print whatever the checker returns.
# NOTE: this rebinds `results`, which previously held the integrity output.
results = checker.drift(X_inference, X_train)
rich.print(results)





## ¿Pasa el test de drift?

In [30]:
# Cut-off for the drift score: the check passes only when the score is
# strictly below this value.
# NOTE(review): 0.2 is not justified anywhere in this notebook — confirm the intended cut-off.
threshold = 0.2
results["domain_classifier_drift_score"] < threshold

True

# **Tracker**

In [3]:
from smartdeploy.cl import tracker


# Look up the tracked artifact stored under the path "X_train.csv" and display it.
# NOTE(review): "last" presumably means the most recent run — inferred from the
# helper's name only; confirm against the tracker module.
# The displayed value exposes the keys 'run_uuid' and 'artifact_path'.
last_artifact = tracker.get_last_artifact(path="X_train.csv")
last_artifact

{'run_uuid': 'e7a5b744e8724a19a7619a128f18014d',
 'artifact_path': 's3://mlflow/0/e7a5b744e8724a19a7619a128f18014d/artifacts/X_train.csv'}

In [4]:
import mlflow

# Create an MLflow client and list the artifacts logged under the run that the
# tracker lookup returned.
client = mlflow.MlflowClient()

client.list_artifacts(run_id=last_artifact["run_uuid"])

[<FileInfo: file_size=8153, is_dir=False, path='X_test.csv'>,
 <FileInfo: file_size=31991, is_dir=False, path='X_train.csv'>,
 <FileInfo: file_size=40875, is_dir=False, path='preprocessed_data.csv'>,
 <FileInfo: file_size=191, is_dir=False, path='y_test.csv'>,
 <FileInfo: file_size=750, is_dir=False, path='y_train.csv'>]

In [7]:
# Download the run's X_train.csv artifact; the displayed value is the local path.
client.download_artifacts(run_id=last_artifact["run_uuid"], path="X_train.csv")


'/tmp/tmp_jsexand/X_train.csv'

In [2]:
!pip install pyreadstat

Collecting pyreadstat
  Using cached pyreadstat-1.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)
Installing collected packages: pyreadstat
Successfully installed pyreadstat-1.2.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
# DISABLED CELL: everything below is commented out and does not run.
# If re-enabled, it would execute each listed training notebook with papermill,
# writing the executed copy to /tmp/output.ipynb and streaming stdout/stderr here.
# import sys
# import papermill

# notebooks = [
#   './training/install',
#   './training/load_data',
# ]

# for nbname in notebooks:
#     print('Updating', nbname)
#     papermill.execute_notebook(f'{nbname}.ipynb', '/tmp/output.ipynb', 
#                              stdout_file=sys.stdout, stderr_file=sys.stderr)