In [2]:
# Importing required libraries
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import datasets

import pytest
import ipytest
ipytest.autoconfig()

In [3]:
# Read the cleaned TCGA GBM/LGG mutation table from a path relative to the
# current working directory.
dataset = pd.read_csv(r"../../data/processed/TCGA_GBM_LGG_Mutations_clean_v2.csv")  # change path when testing

# Build a DataFrame from the loaded table with the same columns, then store the
# "Grade" column as a pandas categorical.
# NOTE(review): SimplePipeline.load_dataset performs the same load on its own;
# confirm that dataset_df is still needed by later cells.
dataset_df = pd.DataFrame(dataset, columns=dataset.columns)
dataset_df["Grade"] = dataset_df["Grade"].astype('category')

In [4]:
class SimplePipeline:
    """Load the TCGA GBM/LGG mutation CSV, split it, fit a classifier and score it.

    The dataset is read and split as soon as the object is constructed;
    ``run_pipeline`` reloads it and then fits the model.

    Parameters
    ----------
    data_path : str or path-like, optional
        CSV file to read. Defaults to ``DEFAULT_DATA_PATH``.
    test_size : float, default 0.65
        Forwarded to ``train_test_split``.
    random_state : int, default 42
        Forwarded to ``train_test_split`` so the split is reproducible.
    """

    # Location of the cleaned dataset, relative to the working directory.
    DEFAULT_DATA_PATH = r"../../data/processed/TCGA_GBM_LGG_Mutations_clean_v2.csv"

    # Name of the target column; every other column is used as a feature.
    TARGET_COLUMN = 'Grade'

    def __init__(self, data_path=None, test_size=0.65, random_state=42):
        self.data_path = self.DEFAULT_DATA_PATH if data_path is None else data_path
        self.test_size = test_size
        self.random_state = random_state

        self.frame = None
        self.X_train, self.X_test, self.y_train, self.y_test = None, None, None, None
        self.model = None
        self.load_dataset()

    def load_dataset(self):
        """Read the CSV into ``self.frame`` and create the train/test split.

        The target column is stored as a pandas categorical. A ``KeyError`` is
        raised by pandas if the target column is missing from the file.
        """
        self.frame = pd.read_csv(self.data_path)
        self.frame[self.TARGET_COLUMN] = self.frame[self.TARGET_COLUMN].astype('category')

        feature_names = [col for col in self.frame.columns if col != self.TARGET_COLUMN]

        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.frame[feature_names],
            self.frame[self.TARGET_COLUMN],
            test_size=self.test_size,
            random_state=self.random_state,
        )

    def train(self, algorithm=LogisticRegression, **model_kwargs):
        """Instantiate ``algorithm`` and fit it on the training split.

        Parameters
        ----------
        algorithm : callable, default LogisticRegression
            Estimator class (or factory) called to build the model.
        **model_kwargs
            Keyword arguments forwarded to ``algorithm``.
        """
        # The LogisticRegression-specific solver is only injected for
        # LogisticRegression (sub)classes, so other estimators can be passed
        # without hitting an unexpected-keyword TypeError.
        # ``multi_class='auto'`` is no longer passed: it is the library default
        # and the parameter is deprecated in recent scikit-learn releases.
        is_logistic = isinstance(algorithm, type) and issubclass(algorithm, LogisticRegression)
        if is_logistic:
            model_kwargs.setdefault('solver', 'lbfgs')

        self.model = algorithm(**model_kwargs)
        self.model.fit(self.X_train, self.y_train)

    def predict(self, input_data):
        """Return the fitted model's predictions for ``input_data``.

        ``train`` must have been called first; otherwise ``self.model`` is None.
        """
        return self.model.predict(input_data)

    def get_accuracy(self):
        """Return the fitted model's ``score`` on the held-out test split.

        ``train`` must have been called first; otherwise ``self.model`` is None.
        """
        return self.model.score(X=self.X_test, y=self.y_test)

    def run_pipeline(self):
        """Execution method for running the pipeline several times.

        Reloads and re-splits the dataset, then fits the default model.
        """
        self.load_dataset()
        self.train()

In [5]:
# Build the pipeline (the constructor already loads and splits the data),
# then reload the data and fit the default model.
pipeline = SimplePipeline()
pipeline.run_pipeline()
# Score of the fitted model on the held-out test split.
accuracy_score = pipeline.get_accuracy()
print(f'The Accuracy of the model is: {accuracy_score}')

The Accuracy of the model is: 0.9157706093189965


In [9]:
# Schema describing every column of the dataset: the Python type each column
# is expected to hold plus either the set of allowed values or a numeric range.
tumor_schema = {
    # Adjust to the grade values observed in the data.
    'Grade': {'allowed_values': [0, 1], 'dtype': int},
    # 0 for female, 1 for male (if applicable).
    'Gender': {'allowed_values': [0, 1], 'dtype': int},
    # Typical age range in years.
    'Age_at_diagnosis': {'range': {'min': 0, 'max': 120}, 'dtype': float},
    # Adjust to the race categories present in the dataset.
    'Race': {'allowed_values': [0, 1, 2, 3], 'dtype': int},
    # Adjust to the tumor specifications observed in the data.
    'Tumor_Specification': {'allowed_values': [0, 1, 2], 'dtype': int},
    # Gene mutation flags (0: not mutated, 1: mutated). Each gene gets its own
    # freshly built entry, so no dict or list is shared between keys.
    **{
        gene: {'allowed_values': [0, 1], 'dtype': int}
        for gene in (
            'PTEN', 'EGFR', 'CIC', 'MUC16', 'PIK3CA', 'NF1', 'PIK3R1',
            'FUBP1', 'RB1', 'NOTCH1', 'BCOR', 'CSMD3', 'SMARCA4',
            'GRIN2A', 'IDH2', 'FAT4', 'PDGFRA',
        )
    },
}


In [10]:
@pytest.fixture
def pipeline():
    """Provide a SimplePipeline whose data is loaded and whose model is fitted."""
    fitted_pipeline = SimplePipeline()
    fitted_pipeline.run_pipeline()
    return fitted_pipeline

In [15]:
%%ipytest

def test_input_data_ranges(pipeline):
    # Obtener los valores máximos y mínimos solo para las columnas numéricas
    numeric_columns = pipeline.frame.select_dtypes(include=['float64', 'int64']).columns
    max_values = pipeline.frame[numeric_columns].max()
    min_values = pipeline.frame[numeric_columns].min()
    
    # Asegurarse de que los valores máximos y mínimos estén dentro del rango esperado
    for feature in numeric_columns:
        if 'range' in tumor_schema[feature]:
            assert max_values[feature] <= tumor_schema[feature]['range']['max']
            assert min_values[feature] >= tumor_schema[feature]['range']['min']
    
    # Para las columnas categóricas, comprobar los valores permitidos
    categorical_columns = pipeline.frame.select_dtypes(include=['category']).columns
    for feature in categorical_columns:
        if 'allowed_values' in tumor_schema[feature]:
            unique_values = pipeline.frame[feature].cat.categories
            assert all(value in tumor_schema[feature]['allowed_values'] for value in unique_values)

def test_input_data_types(pipeline):
    # Obtener los tipos de datos de cada columna
    data_types = pipeline.frame.dtypes
    
    # Probar la compatibilidad entre los tipos de datos
    for feature in pipeline.frame.columns:
        expected_type = tumor_schema[feature]['dtype']
        # Comparar teniendo en cuenta que 'category' se usa para valores categóricos en lugar de int
        if data_types[feature].name == 'category':
            assert expected_type == int, f"{feature} debe ser de tipo categórico pero se esperaba int en el esquema"
        else:
            assert data_types[feature] == expected_type, f"{feature} tiene un tipo incompatible"


[32m.[0m[31mF[0m[31m                                                                                           [100%][0m
[31m[1m______________________________________ test_input_data_types ______________________________________[0m

pipeline = <__main__.SimplePipeline object at 0x000002E476F18AC0>

    [0m[94mdef[39;49;00m [92mtest_input_data_types[39;49;00m(pipeline):[90m[39;49;00m
        [90m# Obtener los tipos de datos de cada columna[39;49;00m[90m[39;49;00m
        data_types = pipeline.frame.dtypes[90m[39;49;00m
    [90m[39;49;00m
        [90m# Probar la compatibilidad entre los tipos de datos[39;49;00m[90m[39;49;00m
        [94mfor[39;49;00m feature [95min[39;49;00m pipeline.frame.columns:[90m[39;49;00m
            expected_type = tumor_schema[feature][[33m'[39;49;00m[33mdtype[39;49;00m[33m'[39;49;00m][90m[39;49;00m
            [90m# Comparar teniendo en cuenta que 'category' se usa para valores categóricos en lugar de int[39;49;00m[90