## Deployment test funcs
This notebook exists only to test the deployment functions.

### importing libraries, modules

In [1]:
# libraries
import joblib
import numpy as np
import pandas as pd
import os
import sys
from typing import List, Dict

In [2]:
# Make the project root importable so the `utils` package resolves from here.
ROOT_DIR = os.path.abspath('..')
if ROOT_DIR not in sys.path:  # re-running the cell must not stack duplicate entries
    sys.path.append(ROOT_DIR)

# own modules 
from utils.utils_load_data        import Loader
from utils.utils_deployment_funcs import DeploymentFuncs

In [3]:
# instances
# `loader` reads the project datasets; `deploy` exposes the deployment
# functions that this notebook tests.
loader = Loader()
deploy = DeploymentFuncs()

In [4]:
# original data
df_raw = loader.load_data(file_name='breast_disease', dir= 'raw', copy= True)
# drop the two columns that are not kept for the analysis
df_raw.drop(columns= ['Unnamed: 32', 'id'], inplace= True)
# encode the target as integers: 'M' -> 1, 'B' -> 0
df_raw['diagnosis'] = df_raw['diagnosis'].map({'M': 1, 'B': 0})

df_raw.head(4)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173


### Start of the analysis

base attributes: 
* 'radius'
* 'texture'
* 'perimeter'
* 'area'
* 'smoothness'
* 'compactness'
* 'concavity'
* 'concave points'
* 'symmetry'
* 'fractal_dimension'
        

In [5]:
# show the column names: the target plus the '<attribute>_<mean|se|worst>' features
df_raw.columns

Index(['diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst'],
      dtype='object')

In [6]:
# Sample measurements (10 readings per base attribute). np.nan marks a missing
# reading; nulls are included on purpose to exercise the missing-value handling.
user_input = {  # concave points 0 , 0.4
    'radius'    : [12.3, 12.5, 12.1, 12.6, np.nan, 12.2, 12.7, 12.8, 12.9, 13.0],  # std-> ~3
    'texture'   : [19.1, 18.8, 15.3, 16.8, 21.3, 20.3, 17.3, 19.2, 19.9, 18.7],    # std-> ~4
    'perimeter' : [75.2, 76.1, 64.9, 67.0, 75.8, 84.7, 90.5, 78.0, 78.5, 79.0],    # std-> ~25
    'area'      : [450.3, 470.3, 440.2, 380.2, np.nan, 330.1, 490.1, 500.1, 510.1, 520.1],  # std-> ~350
    'smoothness': [0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.22],             # std-> ~0.01
    'compactness'   : [0.14, 0.26, 0.11, 0.25, 0.12, 0.08, 0.55, 0.16, 0.24, 0.10],         # std-> ~0.05
    'concavity'     : [0.16, 0.33, 0.12, np.nan, np.nan, 0.15, 0.21, 0.19, 0.21, 0.12],     # std-> ~0.07
    'concave points': [0.24, 0.23, 0.25, 0.26, np.nan, np.nan, np.nan, 0.26, 0.23, 0.24],   # std-> ~0.03
    'symmetry'      : [0.16, 0.26, 0.15, 0.22, 0.18, np.nan, 0.23, 0.12, 0.29, 0.18],       # std-> ~0.02
    # The key must be 'fractal_dimension' (underscore): with a space the derived
    # 'fractal_dimension_mean/_se/_worst' columns are never filled and end up NaN.
    'fractal_dimension': [0.059, 0.054, 0.058, 0.059, np.nan, 0.056, 0.054, 0.058, 0.058, 0.055],  # std-> ~0.007
}
df_to_save = pd.DataFrame(user_input)
#loader.save_dataframe(df= df_to_save, file_name='new_data_for_prediction', dir= 'clean')

In [None]:
# Serialized stacking model used for the predictions
model_path = '../models/stacking_model_12.pkl'

# Base attribute names, in the order the measurements are expected
base_attributes = [
    'radius',
    'texture',
    'perimeter',
    'area',
    'smoothness',
    'compactness',
    'concavity',
    'concave points',
    'symmetry',
    'fractal_dimension',
]

# Accepted (min, max) interval per attribute
# range calculation: min/~2, max * ~2
ranges = dict([
    ('radius', (1.0, 60)),
    ('texture', (1, 51)),
    ('perimeter', (9, 400)),
    ('area', (50, 5001)),
    ('smoothness', (0.025, 0.32)),
    ('compactness', (0.00095, 0.68)),
    ('concavity', (0.0000, 0.8)),
    ('concave points', (0.0000, 0.4)),
    ('symmetry', (0.05, 0.6)),
    ('fractal_dimension', (0.02, 0.18)),
])

# Sorted columns (order used in training): every mean value first, then every
# standard error value, then every worst value.
original_columns = [
    f'{attribute}_{stat}'
    for stat in ('mean', 'se', 'worst')
    for attribute in base_attributes
]



In [None]:
def process_file_input(file, header= False)-> Dict[str, List[float]] | None:
    """Read a CSV of measurements and group the values by base attribute.

    The file must contain exactly one column per entry of ``base_attributes``,
    in that same order (columns are matched by position, not by name).
    Attributes may have different numbers of readings: empty cells are read
    as nulls and discarded, e.g. ``val,,val`` for a missing middle value.

    Args:
        file: path or file-like object accepted by ``pd.read_csv``.
        header (bool): True when the first row of the file holds column names.

    Returns:
        Dict mapping each base attribute to the list of its non-null values,
        or None when the file could not be processed (the error is printed).
    """
    try:
        # TODO: detect whether the file is a CSV or an Excel (xls) file
        # pandas expects a row index (or None) for `header`
        df = pd.read_csv(file, header= 0 if header else None,
                         skipinitialspace= True)

        # validation: number of columns
        if len(df.columns) != len(base_attributes):
            # one single message string: passing two arguments to ValueError
            # makes the error render as a tuple
            raise ValueError(
                '⚠️ ERROR: CSV file does not contain the required columns\n'
                '- MESSAGE: in addition, make sure they are sorted as needed')

        # rename by position with the expected attribute names
        df.columns = base_attributes

        # null treatment: keep only the readings that are present
        return {attr: df[attr].dropna().tolist() for attr in base_attributes}

    except Exception as e:
        # best-effort: report the problem and signal the failure with None
        print(e)
        return None
        
#- func 01-#-#-#-#-#-#–#-#-#-#–#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#   
def calculate_stats(measurements: List[int|float])-> Dict:
        """Calculate mean, Standard Error & worst(mean of the three largest values)
        Args:
            measurements (List): list of measurements converted to numpy array

        Returns:
            Dict: dictionary with mean, se and worst values
        >>> measurements = [1, 2, 3, 4, 5]
        >>> calculate_mean_se_worst(measurements)"""
        
        measurements_arr = np.array(measurements)
        mean  = np.mean(measurements_arr)
        worst = np.mean(np.sort(measurements_arr)[-3:])
        # sort in ascending order
        # [-3:] -> only the three largest values
        # else -> if there are less than 3 values, return the maximum value
        # sacar esta parte del código
        se    = np.std(measurements_arr, ddof= 1) / np.sqrt(len(measurements_arr))
        # ddof -> "delta degrees of freedom" -> '1' SAMPLE std, '0' -> POPULATION std
        # problema: columnas "fractal_dimension_mean, fractal_dimension_se, fractal_dimension_worst" = NaN
        results = {'mean': mean, 'se': se, 'worst': worst}
        print('SUCCESS: process finished in _calculate_mean_se_worst()')
        return results
    
#- func 02-#-#-#-#-#-#–#-#-#-#–#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#-#
def process_values(user_data: Dict)-> pd.DataFrame:
    """Validate the raw measurements and build the single-row model input.

    Args:
        user_data (Dict): maps each base attribute name to the list of its
            numeric measurements.

    Returns:
        pd.DataFrame: one row holding '<attribute>_mean', '<attribute>_se' and
        '<attribute>_worst' for every attribute, ordered as ``original_columns``.

    Raises:
        ValueError: unknown attribute name, non-numerical values, fewer than
            5 measurements, or values outside the interval given in ``ranges``.
    """
    features = {}
    for attribute, measurements in user_data.items():
        # validations
        # A name that yields no known column would be dropped silently when
        # the frame is built, leaving its three columns as NaN.
        if f'{attribute}_mean' not in original_columns:
            raise ValueError(f'⚠️ ERROR: Unknown attribute "{attribute}"')

        if not all(isinstance(val, (int, float)) for val in measurements):
            raise ValueError(f'⚠️ ERROR: Values in {attribute} must be numerical')

        if len(measurements) < 5:
            raise ValueError('⚠️ ERROR: At least 5 measurements are required in each attribute')

        # check ranges -> attributes without a configured range are not checked
        min_range, max_range = ranges.get(attribute, (None, None))
        if min_range is not None and max_range is not None:
            if not all(min_range <= val <= max_range for val in measurements):
                raise ValueError(f'⚠️ ERROR: Values in "{attribute.upper()}" are out of range'
                                 f'\n- Value expected between {min_range} and {max_range}')

        # calculate metrics
        stats = calculate_stats(measurements)

        # column names
        features[f'{attribute}_mean'] = stats['mean']
        features[f'{attribute}_se']   = stats['se']
        features[f'{attribute}_worst']= stats['worst']

    # df (original column order)
    return pd.DataFrame([features], columns= original_columns)
    
def prediction(file= None, frontend_input = None, header= False):
    """Run the stacking model on new measurements and print the outcome.

    Args:
        file: CSV path/buffer with the measurements; when given it takes
            precedence over ``frontend_input``.
        frontend_input (Dict): measurements already grouped by attribute.
        header (bool): whether ``file`` has a header row.

    Raises:
        ValueError: when no usable input data is available, or when the
            measurements do not pass the checks in ``process_values``.
    """
    if file is not None:
        dict_new_data = process_file_input(file, header)
        print(dict_new_data)
    else:
        dict_new_data = frontend_input

    # process_file_input yields None on failure and frontend_input defaults to
    # None: stop here with a clear message instead of failing further down.
    if dict_new_data is None:
        raise ValueError('⚠️ ERROR: no input data available for the prediction')

    df_processed = process_values(user_data= dict_new_data)

    # the return was moved to this place to test the code
    # prediction & probabilities
    # NOTE: joblib.load unpickles arbitrary objects; only load trusted model files
    model_path = '../models/stacking_model_12.pkl'
    model = joblib.load(model_path)
    # NOTE(review): presumably disables a 'smote' resampling step of the
    # pipeline for inference -- confirm against the training code
    model.set_params(smote= 'passthrough')

    pred = model.predict(df_processed)
    probabilities = model.predict_proba(df_processed)
    print(probabilities)  # TODO: extract only the "malignant|1" probability
    print(pred)


In [9]:
# test if the code works with a CSV with and without header
csv_with_header    = '../data/clean/new_data_M_with_header.csv'
csv_without_header = '../data/clean/new_data_B_no_header.csv'

# check of the notebook-level prediction(), currently disabled
# df_processed = prediction(file= csv_with_header, header= True)
# df_processed

In [10]:
# class testing
# run the DeploymentFuncs implementation on both sample files:
# first the one without header, then the one with header
df_probe_00 = deploy.prediction(uploaded_file= csv_without_header, header= False)
df_probe_01 = deploy.prediction(uploaded_file= csv_with_header, header= True)

The prediction with the entered values is: BENIGN
Probabilities:
- benign   : 99.351897%
- malignant: 0.648%
The prediction with the entered values is: MALIGNANT
Probabilities:
- benign   : 1.252538%
- malignant: 98.747%


UserWarning: [01:39:00] WARNING: /Users/runner/work/xgboost/xgboost/src/gbm/../common/error_msg.h:80: 

* If you are loading a serialized model (like pickle in Python, RDS in R) or
configuration generated by an older version of XGBoost, please export the model by calling
`Booster.save_model` from that version first, then load it back in current version. See:

    https://xgboost.readthedocs.io/en/stable/tutorials/saving_model.html

* for more details about differences between saving model and serializing.

  warnings.warn(smsg, UserWarning)

### problem solved 
Cause: xgboost version mismatch — 2.1.3 *(model trained with this version)* vs 2.1.4.

In [11]:
# Sample measurements with a different number of readings per attribute.
# Named `sample_input` so the builtin `input` is not shadowed.
sample_input = {
    'radius': [6.3, 6.5, 6.1, 6.6, 6.2, 6.7, 6.8, 6.9, 6.0],
    'texture': [9.1, 12.8, 11.3, 10.8, 9.3, 9.3, 10.3, 11.2, 9.9, 11.7],
    'perimeter': [45.2, 36.1, 34.9, 47.0, 50.8, 44.7, 30.5, 48.0, 38.5, 49.0],
    'area': [200.3, 270.3, 240.2, 280.2, 230.1, 290.1, 200.1, 210.1, 220.1],
    'smoothness': [0.027011, 0.027012, 0.02702, 0.02707, 0.25073, 0.02701, 0.02706, 0.02703, 0.02705, 0.02709],
    'compactness': [0.14, 0.26, 0.11, 0.25, 0.12, 0.08, 0.55, 0.16, 0.24, 0.1],
    'concavity': [0.16, 0.33, 0.12, 0.15, 0.21, 0.19, 0.21, 0.12],
    'concave points': [0.24, 0.23, 0.25, 0.26, 0.26, 0.23, 0.24],
    'symmetry': [0.16, 0.26, 0.15, 0.22, 0.18, 0.23, 0.12, 0.29, 0.18],
    'fractal_dimension': [0.059, 0.054, 0.058, 0.059, 0.056, 0.054, 0.058, 0.058, 0.055],
}

def process_values(user_data: Dict)-> pd.DataFrame:
    """Validate the raw measurements and build the single-row model input.

    Same logic as the notebook-level version, but relying on the ranges,
    statistics helper and column order exposed by the ``deploy`` instance.

    Args:
        user_data (Dict): maps each base attribute name to the list of its
            numeric measurements.

    Returns:
        pd.DataFrame: one row holding '<attribute>_mean', '<attribute>_se' and
        '<attribute>_worst', ordered as ``deploy.ORIGINAL_COLUMNS``.

    Raises:
        ValueError: unknown attribute name, non-numerical values, fewer than
            5 measurements, or values outside ``deploy.RANGES``.
    """
    features = {}
    for attribute, measurements in user_data.items():
        # validations
        # A name that yields no known column would be dropped silently when
        # the frame is built, leaving its three columns as NaN.
        if f'{attribute}_mean' not in deploy.ORIGINAL_COLUMNS:
            raise ValueError(f'⚠️ ERROR: Unknown attribute "{attribute}"')

        if not all(isinstance(val, (int, float)) for val in measurements):
            raise ValueError(f'⚠️ ERROR: Values in {attribute} must be numerical')

        if len(measurements) < 5:
            raise ValueError('⚠️ ERROR: At least 5 measurements are required in each attribute')

        # check ranges -> attributes without a configured range are not checked
        min_range, max_range = deploy.RANGES.get(attribute, (None, None))
        if min_range is not None and max_range is not None:
            if not all(min_range <= val <= max_range for val in measurements):
                raise ValueError(f'⚠️ ERROR: Values in "{attribute.upper()}" are out of range'
                                 f'\n- Value expected between {min_range} and {max_range}')

        # calculate metrics
        stats = deploy._calculate_stats(measurements)

        # column names
        features[f'{attribute}_mean'] = stats['mean']
        features[f'{attribute}_se']   = stats['se']
        features[f'{attribute}_worst']= stats['worst']

    # df (original column order)
    return pd.DataFrame([features], columns= deploy.ORIGINAL_COLUMNS)

results = process_values(sample_input)
results
        

Unnamed: 0,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,6.455556,10.57,42.47,237.944444,0.049408,0.201,0.18625,0.244286,0.198889,0.056778,...,6.8,11.933333,49.266667,280.2,0.10163,0.353333,0.25,0.256667,0.26,0.058667


In [None]:
def process_file_input( uploaded_file, header= False)-> Dict[str, List[float]] | None:
    """Process the CSV/Excel file uploaded by the user to build new data for prediction.

    The file must contain exactly one column per entry of
    ``list_dicts.BASE_ATTRIBUTES``, in that order (columns are matched by
    position). Null and non-numeric cells are discarded.

    Args:
        uploaded_file: uploaded file object; it must expose ``.name`` and be
            readable by pandas (an UploadedFile object, not a 'file.csv' string).
        header (bool): True when the first row of the file holds column names.

    Returns:
        Dict mapping each base attribute to the list of its numeric values, or
        None when the file could not be processed (the error is reported).
    """
    try:
        file_name = uploaded_file.name.lower()
        # pandas expects a row index (or None) for `header`, never a bool
        header_row = 0 if header else None

        if file_name.endswith('.csv'):
            df = pd.read_csv(
                uploaded_file, skipinitialspace= True, header= header_row)

        elif file_name.endswith(('.xls', '.xlsx')):
            # NOTE(review): openpyxl targets the newer Excel formats; legacy
            # .xls files may need a different engine -- confirm
            df = pd.read_excel(uploaded_file, engine= 'openpyxl', header= header_row)

        else:
            raise ValueError('⚠️ERROR: file not supported, '
                             'please upload a CSV or Excel file')

        base_attributes = list_dicts.BASE_ATTRIBUTES

        # validation: number of columns
        if len(df.columns) != len(base_attributes):
            raise ValueError(f'ERROR: expected {len(base_attributes)} columns '
                             f'but the file has {len(df.columns)}')

        # rename by position with the default attribute names
        df.columns = base_attributes

        # keep only the numeric, non-null values of every attribute
        processed_data = {
            attr: pd.to_numeric(df[attr], errors= 'coerce').dropna().tolist()
            for attr in base_attributes
        }

        for attr, values in processed_data.items():
            if len(values) < 5:
                raise ValueError(f'ERROR: {attr} has less than 5 values'
                                 ', add more measurements')
        return processed_data

    except Exception as e:
        # best-effort: report the problem and signal the failure with None
        print(e)
        st.error(f'⚠️ ERROR: can not process the CSV file: {e}')
        return None
            