In [1]:
import scipy.stats
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.express as px
from IPython.display import display, HTML

In [2]:
# Program execution parameters

# Input files: the dataset structure description and the two dataset variants.
nombreArchivoEstructuraDataset = "./Mapas de Transformación de Datos/Datos en Pasos Intermedios/Estructura del Dataset.txt"
nombreArchivoDatasetSinEtiquetar = "./Dataset sin Etiquetar.csv"
nombreArchivoDatasetEtiquetado = "./Dataset Etiquetado.csv"
# Column names are stored with underscores instead of spaces.
# nombreColumnaDuplicadaSalida is a copy of the output column that later gets its 0/1 values replaced by "No"/"Yes".
nombreColumnaDuplicadaSalida = "Interested in buying a mobile home policy".replace(" ", "_")
nombreColumnaSalida = "Number of mobile home policies".replace(" ", "_")
# Column (and the substrings searched in it) used to split the labeled dataset into training and test rows.
nombreColumnaFase = "Phase"
etiquetaDatosEntrenamiento = "Training"
etiquetaDatosPrueba = "Test"
# Value of the "Variable Kind" field that marks a variable as categorical.
etiquetaVariablesCategoricas = "Categorical"
# Labels used for table columns / axis titles.
etiquetaFrecuencia = "Frecuencia"
etiquetaFrecuenciaAbsoluta = "Frecuencia Absoluta"
etiquetaFrecuenciaRelativa = "Frecuencia Relativa"
etiquetaTotalGeneral = "Total General"
# Probability passed to the chi-squared quantile function when testing independence.
confidenceRatioForIndependeceHypothesisTest = 0.99

# (R, G, B) foreground colours passed to printColouredText.
colorTextoVerde = (35, 155, 86)
colorTextoRojo = (176, 58, 46)

In [3]:
def printColouredText(text, foregroundColor = (0, 0, 0), backgroundColor = (255, 255, 255)):
    """Wrap `text` in 24-bit ANSI colour escape sequences and return the resulting string.

    Despite its name, this function prints nothing: the caller is expected to
    print the returned value.

    Parameters:
        text: Any value; it is rendered with `str.format`.
        foregroundColor: Sequence whose first three items are the R, G, B text colour.
        backgroundColor: Sequence whose first three items are the R, G, B background colour.

    Returns:
        The background escape, the foreground escape, the formatted text and a
        trailing reset to black text on a white background, concatenated.
    """
    backgroundSequence = "\033[48;2;{};{};{}m".format(backgroundColor[0], backgroundColor[1], backgroundColor[2])
    foregroundSequence = "\033[38;2;{};{};{}m".format(foregroundColor[0], foregroundColor[1], foregroundColor[2])
    # The reset is hard-coded to black-on-white rather than to the terminal default.
    resetSequence = "\033[38;2;0;0;0m\033[48;2;255;255;255m"
    return backgroundSequence + foregroundSequence + "{}".format(text) + resetSequence

def highlightCountingPivotTableFor2DetailLevels(x):
    """Build a frame of CSS strings, shaped like `x`, for use with `Styler.apply(..., axis = None)`.

    NOTE: this function reads three module-level dictionaries that are NOT
    parameters: `cellsIndexesWithZeroValuesBySelectedColumns`,
    `cellsIndexesWithMaximumValueBySelectedColumns` and
    `cellsIndexesWithMinimumValueBySelectedColumns`. Each maps a column key to a
    list of row positions. They must be assigned in the notebook namespace
    (they are unpacked from `buildCountingPivotTableFor2DetailLevels`) before
    the styled table is rendered, and must correspond to the same table.

    Parameters:
        x: The DataFrame being styled.

    Returns:
        A copy of `x` in which every cell holds a CSS declaration string.
    """
    df = x.copy()
    k = list(df.columns.to_flat_index())
    # Base style for every cell. NOTE(review): "white!0" is not a standard CSS colour value — confirm the intent.
    df.loc[:, :] = "background-color: white!0"
    # Presumably bolds the last three columns (k[-3:] wrapped in a nested list) — TODO confirm how pandas resolves this key.
    df[[k[-3:]]] = "font-weight: bold"
    # Bold the last row.
    df.iloc[-1, :] = "font-weight: bold"
    
    # The three passes below run in order, so a cell listed in more than one
    # dictionary ends up with the style of the last pass that touches it.
    # Reddish background for the cells flagged as zero.
    for a in cellsIndexesWithZeroValuesBySelectedColumns.keys():
        for ii in cellsIndexesWithZeroValuesBySelectedColumns[a]:
            df.loc[df.index[ii], a] = "background-color: rgb(245, 183, 177); font-weight: bold"
            
    # Greenish background for the cells flagged as column maximum.
    for a in cellsIndexesWithMaximumValueBySelectedColumns.keys():
        for ii in cellsIndexesWithMaximumValueBySelectedColumns[a]:
            df.loc[df.index[ii], a] = "background-color: rgb(171, 235, 198); font-weight: bold"
    
    # Yellowish background for the cells flagged as column minimum.
    for a in cellsIndexesWithMinimumValueBySelectedColumns.keys():
        for ii in cellsIndexesWithMinimumValueBySelectedColumns[a]:
            df.loc[df.index[ii], a] = "background-color: rgb(249, 231, 159); font-weight: bold"
    
    return df

def buildFrequenciesTableFor1DetailLevel(dataframeSource, targetColumn, labelAbsoluteFrequency = "Absolute Frequency", labelRelativeFrequency = "Relative Frequency", labelGeneralTotal = "General Total"):
    """Build the absolute/relative frequency table of one column, with a closing total row.

    Parameters:
        dataframeSource: DataFrame containing `targetColumn`.
        targetColumn: Name of the column whose distinct values are counted.
        labelAbsoluteFrequency: Name given to the column holding the counts.
        labelRelativeFrequency: Name given to the column holding count / total.
        labelGeneralTotal: Text placed in `targetColumn` for the final total row.

    Returns:
        A DataFrame with a fresh 0..n index and the columns `targetColumn`,
        `labelAbsoluteFrequency`, `labelRelativeFrequency`: one row per distinct
        value (in groupby order) followed by one total row.
    """
    # Initial frequency table: one row per distinct value (the general total is not included yet)
    fullFrequenciesTable = dataframeSource.groupby([targetColumn]).size().reset_index(name = labelAbsoluteFrequency)

    # General total: only the counts are summed; the category column just receives the total label
    valuesTotalCount = fullFrequenciesTable[labelAbsoluteFrequency].sum()
    generalTotalRow = pd.DataFrame({targetColumn: [labelGeneralTotal], labelAbsoluteFrequency: [valuesTotalCount]})
    fullFrequenciesTable = pd.concat([fullFrequenciesTable, generalTotalRow], axis = 0).reset_index(drop = True)

    # Relative frequencies, read from the column named by labelAbsoluteFrequency
    # (a hard-coded column name here would break every call using another label)
    fullFrequenciesTable[labelRelativeFrequency] = fullFrequenciesTable[labelAbsoluteFrequency] / valuesTotalCount
    return fullFrequenciesTable

def _findRowIndexesWithZeroValue(columnValues):
    """Return the positions of `columnValues` whose value equals 0."""
    return [rowIndex for rowIndex, value in enumerate(columnValues) if value == 0]

def _findRowIndexesWithMaximumValue(columnValues):
    """Return every position of `columnValues` holding its largest value (ties included)."""
    maximumValue = -1
    indexesListWithMaximumValue = []
    for rowIndex, value in enumerate(columnValues):
        if value > maximumValue:
            maximumValue = value
            indexesListWithMaximumValue = [rowIndex]
        elif value == maximumValue:
            indexesListWithMaximumValue.append(rowIndex)
    return indexesListWithMaximumValue

def _findRowIndexesWithMinimumNonZeroValue(columnValues):
    """Return every position of `columnValues` holding its smallest non-zero value (ties included)."""
    minimumValue = float('inf')
    indexesListWithMinimumValue = []
    for rowIndex, value in enumerate(columnValues):
        if value < minimumValue and value != 0:
            minimumValue = value
            indexesListWithMinimumValue = [rowIndex]
        elif value == minimumValue:
            indexesListWithMinimumValue.append(rowIndex)
    return indexesListWithMinimumValue

def buildCountingPivotTableFor2DetailLevels(dataframeSource, valuesColumn, indexColumn, mainColumnToGroup, secondaryColumnToGroup, labelGeneralTotal = "General Total"):
    """Count the rows of `dataframeSource` per `indexColumn` value at two levels of detail.

    Parameters:
        dataframeSource: DataFrame holding all the columns named below.
        valuesColumn: Column whose non-null entries are counted.
        indexColumn: Column whose values become the rows of the tables.
        mainColumnToGroup: Column whose values become the main level-of-detail columns.
        secondaryColumnToGroup: Column used, together with `mainColumnToGroup`,
            for the finer (secondary, main) columns.
        labelGeneralTotal: Label of the total column and of the total row.

    Returns:
        A 5-tuple:
        - the frequency table (rows: `indexColumn` values plus a total row;
          columns: `mainColumnToGroup` values plus a total column);
        - the full pivot table: the (secondary, main) count columns followed by
          the ("", main) and ("", total) columns, closed by a total row, all integers;
        - dict: ("", column) -> row positions whose count is 0 (total row included in the scan);
        - dict: ("", column) -> row positions holding the column maximum (total row excluded);
        - dict: ("", column) -> row positions holding the smallest non-zero count (total row excluded).
    """
    # Initial pivot tables at 2 and at 1 level(s) of detail (the general total is not included yet)
    pivotTableWith2DetailLevels = pd.pivot_table(dataframeSource, values = valuesColumn, index = indexColumn, columns = [secondaryColumnToGroup, mainColumnToGroup], aggfunc = 'count').fillna(0.0)
    pivotTableWith1DetailLevel = pd.pivot_table(dataframeSource, values = valuesColumn, index = indexColumn, columns = mainColumnToGroup, aggfunc = 'count').fillna(0.0)
    pivotTableWith1DetailLevel[labelGeneralTotal] = pivotTableWith1DetailLevel.sum(axis = 1) #Row-wise sum
    frequencyTableWith2DetailLevels = pivotTableWith1DetailLevel.copy()
    # Re-key the 1-level columns as ("", name) tuples so they line up with the 2-level column keys
    pivotTableWith1DetailLevel = pivotTableWith1DetailLevel.rename(columns = {tableColumn: ("", tableColumn) for tableColumn in pivotTableWith1DetailLevel.columns})
    fullPivotTable = pd.concat([pivotTableWith2DetailLevels, pivotTableWith1DetailLevel], axis = 1)

    # General total row (column-wise sum), then every count is stored as an integer
    fullPivotTable.loc[labelGeneralTotal] = fullPivotTable.sum().tolist()
    fullPivotTable = fullPivotTable.astype("int64")

    # Frequency table at the main level of detail, total row included
    frequencyTableWith2DetailLevels.loc[labelGeneralTotal] = [0.0] * len(frequencyTableWith2DetailLevels.columns)
    for column in frequencyTableWith2DetailLevels.columns:
        frequencyTableWith2DetailLevels[column] = fullPivotTable[("", column)]

    # Cells to highlight at the main level of detail. Each column is converted to a
    # list once and shared by the three scans; the maximum/minimum scans skip the
    # last (total) row.
    cellsIndexesWithZeroValuesBySelectedColumns = {}
    cellsIndexesWithMaximumValueBySelectedColumns = {}
    cellsIndexesWithMinimumValueBySelectedColumns = {}
    for column in pivotTableWith1DetailLevel.columns:
        columnValues = fullPivotTable[column].tolist()
        columnValuesWithoutTotal = columnValues[:-1]
        cellsIndexesWithZeroValuesBySelectedColumns[column] = _findRowIndexesWithZeroValue(columnValues)
        cellsIndexesWithMaximumValueBySelectedColumns[column] = _findRowIndexesWithMaximumValue(columnValuesWithoutTotal)
        cellsIndexesWithMinimumValueBySelectedColumns[column] = _findRowIndexesWithMinimumNonZeroValue(columnValuesWithoutTotal)
    return frequencyTableWith2DetailLevels, fullPivotTable, cellsIndexesWithZeroValuesBySelectedColumns, cellsIndexesWithMaximumValueBySelectedColumns, cellsIndexesWithMinimumValueBySelectedColumns

def applyHypothesisTestForVariablesIndependence(frequencyTableWith2DetailLevels, indexColumn, valuesColumn, confidenceRatio = 0.95):
    """Run a chi-squared independence test on a contingency table and print the reasoning.

    Parameters:
        frequencyTableWith2DetailLevels: Table of observed counts whose LAST row
            and LAST column hold the totals (bottom-right cell = number of observations).
        indexColumn: Variable name shown first in the printed messages.
        valuesColumn: Variable name shown second in the printed messages.
        confidenceRatio: Probability passed to the chi-squared quantile function
            to obtain the critical value.

    Returns:
        A pair (decisionConfirmation, expectedFrequencyTableWith2DetailLevels):
        - decisionConfirmation is False when the test statistic exceeds the
          critical value (independence rejected) and True otherwise;
        - the expected table is a float copy of the input whose inner cells are
          replaced by rowTotal * columnTotal / total (the totals are kept).
    """
    firstVariableNumberValues = len(frequencyTableWith2DetailLevels) - 1
    secondVariableNumberValues = len(frequencyTableWith2DetailLevels.columns) - 1
    numberTotalObservations = frequencyTableWith2DetailLevels.iloc[firstVariableNumberValues, secondVariableNumberValues]
    print("Se va a realizar una prueba de hipótesis de independencia de variables, para las variables", printColouredText(indexColumn, colorTextoVerde), "y", printColouredText(valuesColumn, colorTextoVerde), "con un valor de confianza de", printColouredText(confidenceRatio, colorTextoVerde))
    print("Se tiene un total de", printColouredText(numberTotalObservations, colorTextoVerde), "observaciones, la variable", printColouredText(indexColumn, colorTextoRojo), "toma", printColouredText(firstVariableNumberValues, colorTextoRojo), "valores posibles, y la variable", printColouredText(valuesColumn, colorTextoRojo), "toma", printColouredText(secondVariableNumberValues, colorTextoRojo), "valores posibles")

    # Expected frequencies table. The copy is cast to float up front: writing float
    # values into integer count columns relies on silent upcasting, which pandas
    # has deprecated.
    observedFrequencies = frequencyTableWith2DetailLevels.iloc[:firstVariableNumberValues, :secondVariableNumberValues].to_numpy(dtype = float)
    rowsTotals = frequencyTableWith2DetailLevels.iloc[:firstVariableNumberValues, secondVariableNumberValues].to_numpy(dtype = float)
    columnsTotals = frequencyTableWith2DetailLevels.iloc[firstVariableNumberValues, :secondVariableNumberValues].to_numpy(dtype = float)
    expectedFrequencies = np.outer(rowsTotals, columnsTotals) / numberTotalObservations
    expectedFrequencyTableWith2DetailLevels = frequencyTableWith2DetailLevels.astype(float)
    expectedFrequencyTableWith2DetailLevels.iloc[:firstVariableNumberValues, :secondVariableNumberValues] = expectedFrequencies

    # Chi-squared test value and chi-squared critical value.
    # NOTE: a zero row/column total gives a zero expected frequency and therefore a
    # non-finite test value; a variable with a single value gives 0 degrees of freedom.
    numberFreedomDegrees = (firstVariableNumberValues - 1) * (secondVariableNumberValues - 1)
    chiSquaredCriticalValue = scipy.stats.chi2.ppf(confidenceRatio, numberFreedomDegrees)
    chiSquaredTestValue = float(np.sum(np.square(observedFrequencies - expectedFrequencies) / expectedFrequencies))
    print("Se tiene(n)", printColouredText(numberFreedomDegrees, colorTextoVerde), "grado(s) de libertad, un valor estadístico crítico de", printColouredText(chiSquaredCriticalValue, colorTextoVerde), "y un valor estadístico de prueba de", printColouredText(chiSquaredTestValue, colorTextoVerde))

    # Decision: reject or keep the null hypothesis of independence between the 2 variables
    decisionConfirmation = True
    if chiSquaredCriticalValue < chiSquaredTestValue:
        print("Como el valor de prueba supera el valor crítico, por lo tanto se", printColouredText("rechaza", colorTextoRojo), "la hipótesis nula de independencia de las variables, es decir, se tiene dependencia entre las variables", printColouredText(indexColumn, colorTextoRojo), "y", printColouredText(valuesColumn, colorTextoRojo))
        decisionConfirmation = False
    else:
        print("Como el valor de prueba no supera el valor crítico, por lo tanto se", printColouredText("acepta", colorTextoRojo), "la hipótesis nula de independencia de las variables, es decir, se tiene independencia entre las variables", printColouredText(indexColumn, colorTextoRojo), "y", printColouredText(valuesColumn, colorTextoRojo))
    return decisionConfirmation, expectedFrequencyTableWith2DetailLevels

def plotFrequenciesProportionFor1DetailLevel(frequencyTableWith1DetailLevel, targetColumn, labelAbsoluteFrequency = "Absolute Frequency", labelRelativeFrequency = "Relative Frequency"):
    """Draw, on the current Matplotlib figure, a pie chart of a one-variable frequency table.

    Parameters:
        frequencyTableWith1DetailLevel: Frequency table whose last row is the
            total (it is excluded from the chart), whose first column holds the
            variable values, and whose index is 0..n so rows can be read by position label.
        targetColumn: Text used as the legend title.
        labelAbsoluteFrequency: Name of the column holding the counts.
        labelRelativeFrequency: Name of the column holding the proportions.

    Returns:
        None. The caller creates the figure beforehand and shows it afterwards.
    """
    variableNumberValues = len(frequencyTableWith1DetailLevel) - 1

    # Colour range for the circular sectors: the Pastel1 palette when it has enough
    # entries, otherwise evenly spaced samples of the 'turbo' colormap
    if variableNumberValues <= len(px.colors.qualitative.Pastel1):
        barsColorsListInRGB = [color.replace("rgb(", "").replace(")", "") for color in px.colors.qualitative.Pastel1]
        barsColorsListInRGB = [color.split(",") for color in barsColorsListInRGB]
        barsColorsListInHexadecimal = ["#{:02x}{:02x}{:02x}".format(int(component[0]), int(component[1]), int(component[2])) for component in barsColorsListInRGB]
    else:
        # mpl.cm.get_cmap no longer exists in recent Matplotlib releases; prefer the
        # colormap registry and fall back to the old accessor on older versions.
        colorsMap = mpl.colormaps["turbo"] if hasattr(mpl, "colormaps") else mpl.cm.get_cmap('turbo')
        barsColorsListInRGB = [colorsMap(number) for number in list(np.linspace(0.0, 1.0, num = variableNumberValues))]
        barsColorsListInHexadecimal = ["#{:02x}{:02x}{:02x}".format(int(255 * component[0]), int(255 * component[1]), int(255 * component[2])) for component in barsColorsListInRGB]

    # Each sector is labelled "<count> (<percentage> %)"
    labelsList = [str(frequencyTableWith1DetailLevel.loc[index, labelAbsoluteFrequency]) + " (" + str(np.round(frequencyTableWith1DetailLevel.loc[index, labelRelativeFrequency] * 100, decimals = 3)) + " %)" for index in range(variableNumberValues)]
    plt.pie(frequencyTableWith1DetailLevel[labelAbsoluteFrequency].tolist()[:variableNumberValues], labels = labelsList, colors = barsColorsListInHexadecimal, wedgeprops = {"linewidth": 1.0, "edgecolor": "black"})
    plt.legend(frequencyTableWith1DetailLevel.iloc[:variableNumberValues, 0], title = targetColumn, bbox_to_anchor = (1.35, 0.5), loc = "center left")
    plt.tight_layout()

def _buildBarsColorsListInHexadecimal(numberOfColors):
    """Return hexadecimal colour strings able to paint `numberOfColors` bar series.

    The Pastel1 palette is returned in full when it has enough entries; otherwise
    `numberOfColors` evenly spaced samples of the 'turbo' colormap are returned.
    """
    if numberOfColors <= len(px.colors.qualitative.Pastel1):
        barsColorsListInRGB = [color.replace("rgb(", "").replace(")", "").split(",") for color in px.colors.qualitative.Pastel1]
        return ["#{:02x}{:02x}{:02x}".format(int(component[0]), int(component[1]), int(component[2])) for component in barsColorsListInRGB]
    # mpl.cm.get_cmap no longer exists in recent Matplotlib releases; prefer the
    # colormap registry and fall back to the old accessor on older versions.
    colorsMap = mpl.colormaps["turbo"] if hasattr(mpl, "colormaps") else mpl.cm.get_cmap('turbo')
    barsColorsListInRGB = [colorsMap(number) for number in list(np.linspace(0.0, 1.0, num = numberOfColors))]
    return ["#{:02x}{:02x}{:02x}".format(int(255 * component[0]), int(255 * component[1]), int(255 * component[2])) for component in barsColorsListInRGB]

def plotFrequenciesDistributionFor2DetailLevels(frequencyTableWith2DetailLevels, indexColumn, valuesColumn, barWidth = 1.0, barOffset = 0.0, barPlotOriginalOrientation = True, labelFrequency = "Frequency"):
    """Draw, on the current Matplotlib figure, a grouped bar chart of a two-variable frequency table.

    In this function the "first variable" is the one on the table ROWS and the
    "second variable" is the one on the table COLUMNS; the last row and the last
    column are the totals and are not plotted.

    Parameters:
        frequencyTableWith2DetailLevels: Contingency table with a total row and a total column.
        indexColumn: Variable name used in the title and as axis/legend title.
        valuesColumn: Variable name used in the title and as axis/legend title.
        barWidth: Width of every bar.
        barOffset: Extra gap added between consecutive groups of bars.
        barPlotOriginalOrientation: When True, one bar series per column value;
            otherwise one bar series per row value.
        labelFrequency: Text of the y axis label.

    Returns:
        None. The caller creates the figure beforehand and shows it afterwards.
    """
    firstVariableNumberValues = len(frequencyTableWith2DetailLevels) - 1
    secondVariableNumberValues = len(frequencyTableWith2DetailLevels.columns) - 1
    firstVariableValuesList = list(frequencyTableWith2DetailLevels.index)[:firstVariableNumberValues]
    secondVariableValuesList = list(frequencyTableWith2DetailLevels.columns)[:secondVariableNumberValues]

    # Bar heights with respect to each variable
    barsHeightsListsGroupedByFirstVariable = {value: list(frequencyTableWith2DetailLevels.loc[value, frequencyTableWith2DetailLevels.columns[:secondVariableNumberValues]]) for value in firstVariableValuesList}
    barsHeightsListsGroupedBySecondVariable = {value: list(frequencyTableWith2DetailLevels.loc[frequencyTableWith2DetailLevels.index[:firstVariableNumberValues], value]) for value in secondVariableValuesList}

    # Bar positions with respect to each variable (one empty slot is left after every group)
    barsPositionsListsGroupedByFirstVariable = {value: [(index * (len(firstVariableValuesList) + 1) + firstVariableValuesList.index(value)) * barWidth + index * barOffset for index in list(np.arange(secondVariableNumberValues))] for value in firstVariableValuesList}
    barsPositionsListsGroupedBySecondVariable = {value: [(index * (len(secondVariableValuesList) + 1) + secondVariableValuesList.index(value)) * barWidth + index * barOffset for index in list(np.arange(firstVariableNumberValues))] for value in secondVariableValuesList}

    # Tick label positions: the midpoint between the first and the last slot of every group
    firstAndLastBarsPositionsListGroupedBySecondVariable = {value: [(secondVariableValuesList.index(value) * (len(firstVariableValuesList) + 1)) * barWidth + secondVariableValuesList.index(value) * barOffset, ((secondVariableValuesList.index(value) + 1) * (len(firstVariableValuesList) + 1) - 1) * barWidth + secondVariableValuesList.index(value) * barOffset] for value in secondVariableValuesList}
    firstAndLastBarsPositionsListGroupedByFirstVariable = {value: [(firstVariableValuesList.index(value) * (len(secondVariableValuesList) + 1)) * barWidth + firstVariableValuesList.index(value) * barOffset, ((firstVariableValuesList.index(value) + 1) * (len(secondVariableValuesList) + 1) - 1) * barWidth + firstVariableValuesList.index(value) * barOffset] for value in firstVariableValuesList}
    barsLabelsListGroupedBySecondVariable = [np.mean(firstAndLastBarsPositionsListGroupedBySecondVariable[value]) for value in secondVariableValuesList]
    barsLabelsListGroupedByFirstVariable = [np.mean(firstAndLastBarsPositionsListGroupedByFirstVariable[value]) for value in firstVariableValuesList]

    # Parameters of the grouped bar chart, according to the requested orientation
    if barPlotOriginalOrientation == True:
        barsColorsListInHexadecimal = _buildBarsColorsListInHexadecimal(secondVariableNumberValues)
        barsHeightsListsGroupedByIndicatedVariable = barsHeightsListsGroupedBySecondVariable
        barsPositionsListsGroupedByIndicatedVariable = barsPositionsListsGroupedBySecondVariable
        legendLabelsValuesList = secondVariableValuesList
        plotXLabelsPositionsList = barsLabelsListGroupedByFirstVariable
        plotXLabelsValuesList = firstVariableValuesList
        plotTitle = "Distribución de Frecuencias de " + valuesColumn + "\ncon respecto a " + indexColumn
        plotXLabel = valuesColumn
        legendTitle = indexColumn
    else:
        barsColorsListInHexadecimal = _buildBarsColorsListInHexadecimal(firstVariableNumberValues)
        barsHeightsListsGroupedByIndicatedVariable = barsHeightsListsGroupedByFirstVariable
        barsPositionsListsGroupedByIndicatedVariable = barsPositionsListsGroupedByFirstVariable
        legendLabelsValuesList = firstVariableValuesList
        plotXLabelsPositionsList = barsLabelsListGroupedBySecondVariable
        plotXLabelsValuesList = secondVariableValuesList
        plotTitle = "Distribución de Frecuencias de " + indexColumn + "\ncon respecto a " + valuesColumn
        plotXLabel = indexColumn
        legendTitle = valuesColumn

    plt.grid(True, zorder = 0)
    for value in legendLabelsValuesList:
        plt.bar(barsPositionsListsGroupedByIndicatedVariable[value], barsHeightsListsGroupedByIndicatedVariable[value], color = barsColorsListInHexadecimal[legendLabelsValuesList.index(value)], edgecolor = "black", width = barWidth, label = value, zorder = 3)
    plt.title(plotTitle)
    plt.xlabel(plotXLabel)
    # The y axis label honours the labelFrequency parameter instead of a fixed text
    plt.ylabel(labelFrequency)
    plt.xticks(plotXLabelsPositionsList, plotXLabelsValuesList)
    plt.legend(title = legendTitle, bbox_to_anchor = (1.0, 0.5), loc = "center left", borderaxespad = 1.0)
    plt.tight_layout()

In [4]:
# Load the dataset structure description (semicolon-separated) and keep two
# parallel lists: position i of one list describes the same field as position i of the other.
structureDatasetFile = pd.read_csv(nombreArchivoEstructuraDataset, sep = ";", decimal = ".")
fieldNamesList = structureDatasetFile["Field Name"].tolist()
respectiveVariableKindsList = structureDatasetFile["Variable Kind"].tolist()

In [5]:
# Load the unlabeled dataset (semicolon-separated) and report how many rows it has.
notLabeledDatasetFile = pd.read_csv(nombreArchivoDatasetSinEtiquetar, sep = ";", decimal = ".")

print("Se ha extraído la información del archivo", printColouredText(nombreArchivoDatasetSinEtiquetar, colorTextoRojo))
print("Se tiene(n)", printColouredText(str(len(notLabeledDatasetFile.index)) + " dato(s)", colorTextoVerde))

Se ha extraído la información del archivo [48;2;255;255;255m[38;2;176;58;46m./Dataset sin Etiquetar.csv[38;2;0;0;0m[48;2;255;255;255m
Se tiene(n) [48;2;255;255;255m[38;2;35;155;86m9822 dato(s)[38;2;0;0;0m[48;2;255;255;255m


In [6]:
# Print the column names, non-null counts and dtypes of the unlabeled dataset.
notLabeledDatasetFile.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9822 entries, 0 to 9821
Data columns (total 87 columns):
 #   Column                                            Non-Null Count  Dtype 
---  ------                                            --------------  ----- 
 0   Customer Subtype                                  9822 non-null   int64 
 1   Number of houses                                  9822 non-null   int64 
 2   Avg size househol                                 9822 non-null   int64 
 3   Avg age                                           9822 non-null   int64 
 4   Customer main type                                9822 non-null   int64 
 5   Roman catholic                                    9822 non-null   int64 
 6   Protestant                                        9822 non-null   int64 
 7   Other religion                                    9822 non-null   int64 
 8   No religion                                       9822 non-null   int64 
 9   Married                       

In [7]:
# Load the labeled dataset and add a copy of the output column in which the
# values 0 and 1 are replaced by "No" and "Yes".
labeledDatasetFile = pd.read_csv(nombreArchivoDatasetEtiquetado, sep = ";", decimal = ".")
labeledDatasetFile[nombreColumnaDuplicadaSalida] = labeledDatasetFile[nombreColumnaSalida].tolist()
labeledDatasetFile.replace({nombreColumnaDuplicadaSalida: {0: "No", 1: "Yes"}}, inplace = True)

# Filter the full dataset to separate the training rows from the test rows
# (a row is selected when its phase text contains the corresponding label).
trainingDatasetFile = labeledDatasetFile[labeledDatasetFile[nombreColumnaFase].str.contains(etiquetaDatosEntrenamiento)]
testDatasetFile = labeledDatasetFile[labeledDatasetFile[nombreColumnaFase].str.contains(etiquetaDatosPrueba)]

print("Se ha extraído la información del archivo", printColouredText(nombreArchivoDatasetEtiquetado, colorTextoRojo))
print("Se tiene(n)", printColouredText(str(len(labeledDatasetFile.index)) + " dato(s)", colorTextoVerde))
print("Se tiene(n) de", printColouredText(str(len(trainingDatasetFile.index)) + " dato(s) de entrenamiento", colorTextoVerde))
print("Se tiene(n) de", printColouredText(str(len(testDatasetFile.index)) + " dato(s) de prueba", colorTextoVerde))

Se ha extraído la información del archivo [48;2;255;255;255m[38;2;176;58;46m./Dataset Etiquetado.csv[38;2;0;0;0m[48;2;255;255;255m
Se tiene(n) [48;2;255;255;255m[38;2;35;155;86m9822 dato(s)[38;2;0;0;0m[48;2;255;255;255m
Se tiene(n) de [48;2;255;255;255m[38;2;35;155;86m5822 dato(s) de entrenamiento[38;2;0;0;0m[48;2;255;255;255m
Se tiene(n) de [48;2;255;255;255m[38;2;35;155;86m4000 dato(s) de prueba[38;2;0;0;0m[48;2;255;255;255m


In [8]:
%matplotlib notebook

# Frequency table of the Yes/No output column over the whole labeled dataset.
tablaFrecuenciasColumnaSalida = buildFrequenciesTableFor1DetailLevel(labeledDatasetFile, nombreColumnaDuplicadaSalida, labelAbsoluteFrequency = etiquetaFrecuenciaAbsoluta, labelRelativeFrequency = etiquetaFrecuenciaRelativa, labelGeneralTotal = etiquetaTotalGeneral)
display(tablaFrecuenciasColumnaSalida)
# Number of distinct output values: the table length minus its total row.
numeroValoresColumnaSalida = len(tablaFrecuenciasColumnaSalida) - 1

# Pie chart of the same table, drawn on figure number 1.
plt.figure(num = 1, figsize = (9.5, 3.5))
plotFrequenciesProportionFor1DetailLevel(tablaFrecuenciasColumnaSalida, nombreColumnaDuplicadaSalida, labelAbsoluteFrequency = etiquetaFrecuenciaAbsoluta, labelRelativeFrequency = etiquetaFrecuenciaRelativa)
plt.show()

Unnamed: 0,Interested_in_buying_a_mobile_home_policy,Frecuencia Absoluta,Frecuencia Relativa
0,No,9236,0.940338
1,Yes,586,0.059662
2,Total General,9822,1.0


<IPython.core.display.Javascript object>

In [9]:
# Despite its name, pivotTablesList is a dict: it is keyed by column name in the generated snippets.
pivotTablesList = {}

# This cell does NOT run the per-variable analysis: for every input column it
# PRINTS the Python source of one analysis snippet (pivot table, independence
# test, grouped bar chart), which is then meant to be pasted into later cells.
prefix = "Label_"
variableIndex = 1
for nombreColumna in notLabeledDatasetFile.columns:
    # Skip the output column, its Yes/No copy and the phase column.
    if nombreColumna != nombreColumnaSalida.replace("_", " ").replace("-", " ") and nombreColumna != nombreColumnaDuplicadaSalida.replace("_", " ").replace("-", " ") and nombreColumna != nombreColumnaFase:
        indexCol = fieldNamesList.index(nombreColumna)
        variableKind = respectiveVariableKindsList[indexCol]
        
        # Categorical variables are looked up under the "Label_"-prefixed column name;
        # in both cases spaces and hyphens of the name are replaced by underscores.
        if variableKind == etiquetaVariablesCategoricas:
            print("frequencyTableWithBothDetailLevels, pivotTablesList[\"" + nombreColumna + "\"], cellsIndexesWithZeroValuesBySelectedColumns, cellsIndexesWithMaximumValueBySelectedColumns, cellsIndexesWithMinimumValueBySelectedColumns = buildCountingPivotTableFor2DetailLevels(labeledDatasetFile, nombreColumnaSalida, \"" + prefix + nombreColumna.replace(" ", "_").replace("-", "_") + "\", nombreColumnaDuplicadaSalida, nombreColumnaFase)")
        else:
            print("frequencyTableWithBothDetailLevels, pivotTablesList[\"" + nombreColumna + "\"], cellsIndexesWithZeroValuesBySelectedColumns, cellsIndexesWithMaximumValueBySelectedColumns, cellsIndexesWithMinimumValueBySelectedColumns = buildCountingPivotTableFor2DetailLevels(labeledDatasetFile, nombreColumnaSalida, \"" + nombreColumna.replace(" ", "_").replace("-", "_") + "\", nombreColumnaDuplicadaSalida, nombreColumnaFase)")
        print("display(pivotTablesList[\"" + nombreColumna + "\"].style.apply(highlightCountingPivotTableFor2DetailLevels, axis = None))\n")
        print("variablesIndependenceDecisionConfirmation, expectedFrequencyTableWithBothDetailLevels = applyHypothesisTestForVariablesIndependence(frequencyTableWithBothDetailLevels, \"" + nombreColumna + "\", nombreColumnaDuplicadaSalida, confidenceRatio = confidenceRatioForIndependeceHypothesisTest)")
        print("if variablesIndependenceDecisionConfirmation == True:")
        print("\toutputIndependentVariableslist.append(\"" + nombreColumna + "\")")
        print("else:")
        print("\toutputDependentVariableslist.append(\"" + nombreColumna + "\")\n")
        # Figure numbers start at 2 in the generated snippets.
        print("plt.figure(num = " + str(variableIndex + 1) + ", figsize = (9.5, 10))")
        print("plotFrequenciesDistributionFor2DetailLevels(frequencyTableWithBothDetailLevels, nombreColumnaDuplicadaSalida, \"" + nombreColumna + "\", barWidth = 1.0, barOffset = 3.5, barPlotOriginalOrientation = False)")
        print("plt.show()")
        print("\n" + printColouredText("-" * 100, colorTextoRojo) + "\n")
        variableIndex += 1

# Accumulators filled by the generated snippets when they are executed.
# Re-running this cell empties both lists.
outputIndependentVariableslist = []
outputDependentVariableslist = []

frequencyTableWithBothDetailLevels, pivotTablesList["Customer Subtype"], cellsIndexesWithZeroValuesBySelectedColumns, cellsIndexesWithMaximumValueBySelectedColumns, cellsIndexesWithMinimumValueBySelectedColumns = buildCountingPivotTableFor2DetailLevels(labeledDatasetFile, nombreColumnaSalida, "Label_Customer_Subtype", nombreColumnaDuplicadaSalida, nombreColumnaFase)
display(pivotTablesList["Customer Subtype"].style.apply(highlightCountingPivotTableFor2DetailLevels, axis = None))

variablesIndependenceDecisionConfirmation, expectedFrequencyTableWithBothDetailLevels = applyHypothesisTestForVariablesIndependence(frequencyTableWithBothDetailLevels, "Customer Subtype", nombreColumnaDuplicadaSalida, confidenceRatio = confidenceRatioForIndependeceHypothesisTest)
if variablesIndependenceDecisionConfirmation == True:
	outputIndependentVariableslist.append("Customer Subtype")
else:
	outputDependentVariableslist.append("Customer Subtype")

plt.figure(num = 2, figsize = (9.5, 10))
plotFrequenciesD

variablesIndependenceDecisionConfirmation, expectedFrequencyTableWithBothDetailLevels = applyHypothesisTestForVariablesIndependence(frequencyTableWithBothDetailLevels, "Number of surfboard policies", nombreColumnaDuplicadaSalida, confidenceRatio = confidenceRatioForIndependeceHypothesisTest)
if variablesIndependenceDecisionConfirmation == True:
	outputIndependentVariableslist.append("Number of surfboard policies")
else:
	outputDependentVariableslist.append("Number of surfboard policies")

plt.figure(num = 82, figsize = (9.5, 10))
plotFrequenciesDistributionFor2DetailLevels(frequencyTableWithBothDetailLevels, nombreColumnaDuplicadaSalida, "Number of surfboard policies", barWidth = 1.0, barOffset = 3.5, barPlotOriginalOrientation = False)
plt.show()

[48;2;255;255;255m[38;2;176;58;46m----------------------------------------------------------------------------------------------------[38;2;0;0;0m[48;2;255;255;255m

frequencyTableWithBothDetailLevels, pivotTablesList["Number of boat pol

In [10]:
# Counting pivot table of the output column per "Label_Customer_Subtype" value.
# The three cellsIndexes* names assigned here are the module-level dictionaries
# that highlightCountingPivotTableFor2DetailLevels reads when the table is styled.
frequencyTableWithBothDetailLevels, pivotTablesList["Customer Subtype"], cellsIndexesWithZeroValuesBySelectedColumns, cellsIndexesWithMaximumValueBySelectedColumns, cellsIndexesWithMinimumValueBySelectedColumns = buildCountingPivotTableFor2DetailLevels(labeledDatasetFile, nombreColumnaSalida, "Label_Customer_Subtype", nombreColumnaDuplicadaSalida, nombreColumnaFase)
display(pivotTablesList["Customer Subtype"].style.apply(highlightCountingPivotTableFor2DetailLevels, axis = None))

# Independence test between the variable and the output column; the variable name
# is appended to the list matching the decision (re-running this cell appends it again).
variablesIndependenceDecisionConfirmation, expectedFrequencyTableWithBothDetailLevels = applyHypothesisTestForVariablesIndependence(frequencyTableWithBothDetailLevels, "Customer Subtype", nombreColumnaDuplicadaSalida, confidenceRatio = confidenceRatioForIndependeceHypothesisTest)
if variablesIndependenceDecisionConfirmation == True:
    outputIndependentVariableslist.append("Customer Subtype")
else:
    outputDependentVariableslist.append("Customer Subtype")

# Grouped bar chart of the same frequency table, drawn on figure number 2.
plt.figure(num = 2, figsize = (9.5, 10))
plotFrequenciesDistributionFor2DetailLevels(frequencyTableWithBothDetailLevels, nombreColumnaDuplicadaSalida, "Customer Subtype", barWidth = 1.0, barOffset = 3.5, barPlotOriginalOrientation = False, labelFrequency = etiquetaFrecuencia)
plt.show()

Phase,Phase 1 [Training],Phase 1 [Training],Phase 2 [Test],Phase 2 [Test],Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0
Interested_in_buying_a_mobile_home_policy,No,Yes,No,Yes,No,Yes,General Total
Label_Customer_Subtype,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Affluent senior apartments,50,2,37,1,87,3,90
Affluent young families,95,16,71,12,166,28,194
Career and childcare,107,12,76,14,183,26,209
Couples with teens 'Married with children',209,16,137,11,346,27,373
Dinki's (double income no kids),41,3,26,2,67,5,72
Etnically diverse,23,2,17,0,40,2,42
Family starters,144,9,126,7,270,16,286
Fresh masters in the city,9,0,4,0,13,0,13
"High Income, expensive child",111,13,81,13,192,26,218
High status seniors,224,25,176,8,400,33,433


Se va a realizar una prueba de hipótesis de independencia de variables, para las variables [48;2;255;255;255m[38;2;35;155;86mCustomer Subtype[38;2;0;0;0m[48;2;255;255;255m y [48;2;255;255;255m[38;2;35;155;86mInterested_in_buying_a_mobile_home_policy[38;2;0;0;0m[48;2;255;255;255m con un valor de confianza de [48;2;255;255;255m[38;2;35;155;86m0.99[38;2;0;0;0m[48;2;255;255;255m
Se tiene un total de [48;2;255;255;255m[38;2;35;155;86m9822[38;2;0;0;0m[48;2;255;255;255m observaciones, la variable [48;2;255;255;255m[38;2;176;58;46mCustomer Subtype[38;2;0;0;0m[48;2;255;255;255m toma [48;2;255;255;255m[38;2;176;58;46m39[38;2;0;0;0m[48;2;255;255;255m valores posibles, y la variable [48;2;255;255;255m[38;2;176;58;46mInterested_in_buying_a_mobile_home_policy[38;2;0;0;0m[48;2;255;255;255m toma [48;2;255;255;255m[38;2;176;58;46m2[38;2;0;0;0m[48;2;255;255;255m valores posibles
Se tiene(n) [48;2;255;255;255m[38;2;35;155;86m38[38;2;0;0;0m[48;2;255;255;255m grado(s) de

<IPython.core.display.Javascript object>