In [1]:
import scipy.stats
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.express as px
from IPython.display import display, HTML
from enum import *

In [2]:
#Run parameters for the notebook
#Input files, given as paths relative to the working directory
nombreArchivoEstructuraDataset = "./Estructura del Dataset.txt"
nombreArchivoDataset = "./IMDBMovies10ClassesIncompleto.csv"
#Names of the output column and of its duplicated counterpart
#NOTE(review): where these columns are created or read is not visible in this part of the notebook
nombreColumnaDuplicadaSalida = "correctMovieExpectationsDuplicated"
nombreColumnaSalida = "correctMovieExpectations"
#Spanish labels for frequency columns and for the totals row/column
etiquetaFrecuencia = "Frecuencia"
etiquetaFrecuenciaAbsoluta = "Frecuencia Absoluta"
etiquetaFrecuenciaRelativa = "Frecuencia Relativa"
etiquetaTotalGeneral = "Total General"
#Confidence level for the independence hypothesis test (the identifier keeps its original spelling "Independece")
confidenceRatioForIndependeceHypothesisTest = 0.99
#Separator character -- presumably a field delimiter for one of the input files; TODO confirm against the loading code
caracterSeparador = "|"

#RGB triples (red, green, blue) used as text colours with printColouredText
colorTextoVerde = (35, 155, 86)
colorTextoRojo = (176, 58, 46)

In [3]:
#Enumerates the kinds of variables handled: categorical and numerical (discrete or continuous)
class TipoVariable(Enum):
    """Kind of a dataset variable; each member's value is an English label string."""
    variableCategorica = "Categorical"
    variableCuantitativaDiscreta = "Discrete Numerical" #quantitative, discrete
    variableCuantitativaContinua = "Continuous Numerical" #quantitative, continuous

#Enumerates how many values a variable holds: a single value or multiple values
#(the previous comment was copied from TipoVariable and described variable kinds instead)
class CantidadValoresVariable(Enum):
    """Single-valued versus multi-valued variable; each member's value is an English label string."""
    variableUnivalor = "Single Value"
    variableMultivalor = "Multiple Value"

#Enumerates the additional tags a variable can carry (for categorical and numerical variables alike)
class EtiquetaAdicionalVariables(Enum):
    """Extra tags for a variable; each member's value is an English label string."""
    etiquetaMapa = "Geographical" #"mapa" = map
    variableDatosAgrupados = "Grouped" #"datos agrupados" = grouped data

#Enumerates the rule used to choose the number of intervals when grouping a numerical variable:
#the main rule (Sturges' rule) or the alternative rule (square-root rule)
class CantidadIntervalosAgrupacion(Enum):
    """Rule for the number of grouping intervals; the string values are what callers compare against."""
    reglaPrincipal = "Sturges"
    reglaAlternativa = "Root"

def printColouredText(text, foregroundColor = (0, 0, 0), backgroundColor = (255, 255, 255)):
    """Return ``text`` wrapped in ANSI 24-bit colour escape sequences.

    Despite its name the function prints nothing; it only builds the string.

    Parameters:
        text: value to colour; it is rendered with ``str.format``, so any object works.
        foregroundColor: indexable (red, green, blue) for the text colour.
        backgroundColor: indexable (red, green, blue) for the background colour.

    Returns:
        The background code, the foreground code and the text, followed by
        codes that set the foreground back to black and the background back
        to white (not a generic terminal reset).
    """
    escapeTemplate = "\033[48;2;{};{};{}m\033[38;2;{};{};{}m{}\033[38;2;0;0;0m\033[48;2;255;255;255m"
    backgroundRed, backgroundGreen, backgroundBlue = backgroundColor[0], backgroundColor[1], backgroundColor[2]
    foregroundRed, foregroundGreen, foregroundBlue = foregroundColor[0], foregroundColor[1], foregroundColor[2]
    return escapeTemplate.format(
        backgroundRed, backgroundGreen, backgroundBlue,
        foregroundRed, foregroundGreen, foregroundBlue,
        text,
    )

def highlightFrequenciesTableFor1DetailLevel(dataframeWithoutStyle):
    """Build the table of CSS declarations for a one-variable frequency table.

    The result has the same index and columns as ``dataframeWithoutStyle`` and
    holds one CSS string per cell. The last column and the last row are bold;
    cells holding a column maximum get a green background and cells holding a
    column minimum get a yellow one.
    NOTE(review): the shape of the return value suggests use with
    ``Styler.apply(..., axis = None)`` -- confirm against the caller.

    The cells to highlight are NOT parameters. They are read from the
    module-level dictionaries ``cellsIndexesWithMaximumValueBySelectedColumns``
    and ``cellsIndexesWithMinimumValueBySelectedColumns`` (column label ->
    list of row positions), which must exist before this function is called.

    Parameters:
        dataframeWithoutStyle: table to style; only its index and columns are used.

    Returns:
        DataFrame of CSS strings aligned with ``dataframeWithoutStyle``.
    """
    #Start from a frame of strings instead of overwriting a copy of the numeric data:
    #writing text into numeric columns relies on silent upcasting, deprecated in recent pandas
    styledDataframe = pd.DataFrame("background-color: white!0", index = dataframeWithoutStyle.index, columns = dataframeWithoutStyle.columns)
    styledDataframe.iloc[:, -1] = "font-weight: bold" #Last column
    styledDataframe.iloc[-1, :] = "font-weight: bold" #Last row
    
    #Maximum cells are styled first, then minimum cells, so a cell listed in both ends up with the minimum style
    highlightRules = (
        (cellsIndexesWithMaximumValueBySelectedColumns, "background-color: rgb(171, 235, 198); font-weight: bold"),
        (cellsIndexesWithMinimumValueBySelectedColumns, "background-color: rgb(249, 231, 159); font-weight: bold"),
    )
    for cellsIndexesByColumn, cellStyle in highlightRules:
        for column, rowsNumbers in cellsIndexesByColumn.items():
            for numberRow in rowsNumbers:
                styledDataframe.loc[styledDataframe.index[numberRow], column] = cellStyle
    return styledDataframe

def highlightFrequenciesTableFor2DetailLevels(dataframeWithoutStyle):
    """Build the table of CSS declarations for a two-level frequency table.

    The result has the same index and columns as ``dataframeWithoutStyle`` and
    holds one CSS string per cell. The last ``numeroValoresColumnaSalida + 1``
    columns and the last row are bold. Cells equal to zero get a red
    background, column maxima a green one and column minima a yellow one,
    applied in that order (a later rule overrides an earlier one on the same cell).
    NOTE(review): the shape of the return value suggests use with
    ``Styler.apply(..., axis = None)`` -- confirm against the caller.

    The function reads these module-level names, which must exist before it is called:
    ``numeroValoresColumnaSalida`` (number of values taken by the classification
    variable) and the dictionaries ``cellsIndexesWithZeroValuesBySelectedColumns``,
    ``cellsIndexesWithMaximumValueBySelectedColumns`` and
    ``cellsIndexesWithMinimumValueBySelectedColumns`` (column label -> list of row positions).

    Parameters:
        dataframeWithoutStyle: table to style; only its index and columns are used.

    Returns:
        DataFrame of CSS strings aligned with ``dataframeWithoutStyle``.
    """
    #Start from a frame of strings instead of overwriting a copy of the numeric data:
    #writing text into numeric columns relies on silent upcasting, deprecated in recent pandas
    styledDataframe = pd.DataFrame("background-color: white!0", index = dataframeWithoutStyle.index, columns = dataframeWithoutStyle.columns)
    #Bold for the last n + 1 columns, n being the number of values of the classification variable.
    #A positional slice selects the same columns as the former doubly nested label list,
    #whose handling depended on the installed pandas version
    styledDataframe.iloc[:, -(numeroValoresColumnaSalida + 1):] = "font-weight: bold"
    styledDataframe.iloc[-1, :] = "font-weight: bold" #Last row
    
    #Order matters: zero cells first, then maxima, then minima (different from 0)
    highlightRules = (
        (cellsIndexesWithZeroValuesBySelectedColumns, "background-color: rgb(245, 183, 177); font-weight: bold"),
        (cellsIndexesWithMaximumValueBySelectedColumns, "background-color: rgb(171, 235, 198); font-weight: bold"),
        (cellsIndexesWithMinimumValueBySelectedColumns, "background-color: rgb(249, 231, 159); font-weight: bold"),
    )
    for cellsIndexesByColumn, cellStyle in highlightRules:
        for column, rowsNumbers in cellsIndexesByColumn.items():
            for numberRow in rowsNumbers:
                styledDataframe.loc[styledDataframe.index[numberRow], column] = cellStyle
    return styledDataframe

def buildFrequenciesTableFor1Variable(dataframeSource, indexColumn, labelAbsoluteFrequency = "Absolute Frequency", labelRelativeFrequency = "Relative Frequency", labelGeneralTotal = "General Total"):
    """Build the absolute/relative frequency table of one column, with a totals row.

    Parameters:
        dataframeSource: DataFrame holding the observations.
        indexColumn: name of the column whose distinct values are counted.
        labelAbsoluteFrequency: name of the absolute-frequency column of the result.
        labelRelativeFrequency: name of the relative-frequency column of the result.
        labelGeneralTotal: index label of the totals row appended at the bottom.

    Returns:
        A tuple ``(table, maximumCells, minimumCells)``:
        - ``table``: DataFrame indexed by the distinct values of ``indexColumn``
          (index name ``indexColumn``) plus a final ``labelGeneralTotal`` row,
          with the two frequency columns. The totals row holds the number of
          counted observations and a relative frequency of 1.0.
        - ``maximumCells`` / ``minimumCells``: dictionaries mapping both column
          labels to the list of row positions (totals row excluded) where the
          absolute frequency is largest / smallest. Ties are all listed.

    With no observations the table contains only the totals row (count 0,
    relative frequency NaN) and both position lists are empty.
    """
    #One entry per distinct value of the column, in the order produced by groupby
    groupSizes = dataframeSource.groupby([indexColumn]).size()
    absoluteFrequencies = [int(groupSize) for groupSize in groupSizes.tolist()]
    valuesTotalCount = sum(absoluteFrequencies)
    
    #Absolute frequencies followed by the totals row
    absoluteFrequenciesWithTotal = absoluteFrequencies + [valuesTotalCount]
    fullFrequenciesTable = pd.DataFrame(
        {labelAbsoluteFrequency: absoluteFrequenciesWithTotal},
        index = pd.Index(list(groupSizes.index) + [labelGeneralTotal], name = indexColumn),
    )
    
    #Relative frequencies, computed from the column named by labelAbsoluteFrequency
    #(a hard-coded Spanish label used to be read here, which failed for any other label)
    if valuesTotalCount > 0:
        relativeFrequenciesList = [absoluteFrequency / valuesTotalCount for absoluteFrequency in absoluteFrequenciesWithTotal]
    else:
        relativeFrequenciesList = [float("nan")] * len(absoluteFrequenciesWithTotal)
    fullFrequenciesTable[labelRelativeFrequency] = relativeFrequenciesList
    
    #Row positions where the absolute frequency (and therefore the relative one) is maximal
    maximumValue = max(absoluteFrequencies, default = None)
    indexesListWithMaximumValue = [rowIndex for rowIndex, absoluteFrequency in enumerate(absoluteFrequencies) if absoluteFrequency == maximumValue]
    cellsIndexesWithMaximumValueBySelectedColumns = {labelAbsoluteFrequency: indexesListWithMaximumValue, labelRelativeFrequency: indexesListWithMaximumValue}
    
    #Row positions where the absolute frequency (and therefore the relative one) is minimal
    minimumValue = min(absoluteFrequencies, default = None)
    indexesListWithMinimumValue = [rowIndex for rowIndex, absoluteFrequency in enumerate(absoluteFrequencies) if absoluteFrequency == minimumValue]
    cellsIndexesWithMinimumValueBySelectedColumns = {labelAbsoluteFrequency: indexesListWithMinimumValue, labelRelativeFrequency: indexesListWithMinimumValue}
    return fullFrequenciesTable, cellsIndexesWithMaximumValueBySelectedColumns, cellsIndexesWithMinimumValueBySelectedColumns

def buildFrequenciesTableFor2DiscreteVariables(dataframeSource, valuesColumn, indexColumn, grouppingColumn, labelGeneralTotal = "General Total"):
    """Build the contingency (two-way frequency) table of two discrete variables, with totals.

    Parameters:
        dataframeSource: DataFrame holding the observations.
        valuesColumn: column whose non-null entries are counted in each cell.
        indexColumn: column whose values become the rows of the table.
        grouppingColumn: column whose values become the columns of the table.
        labelGeneralTotal: label of the totals column and of the totals row.

    Returns:
        A tuple ``(table, zeroCells, maximumCells, minimumCells)``:
        - ``table``: integer counts with a final totals column and a final totals row.
        - ``zeroCells``, ``maximumCells``, ``minimumCells``: dictionaries mapping
          every column label (totals column included) to the list of row
          positions, totals row excluded, where the count is 0, is the column
          maximum, or is the smallest non-zero count of the column. Ties are all listed.
    """
    #Counts per (row value, column value); combinations never observed become 0
    fullPivotTable = pd.pivot_table(dataframeSource, values = valuesColumn, index = indexColumn, columns = grouppingColumn, aggfunc = 'count')
    fullPivotTable = fullPivotTable.fillna(0.0)
    fullPivotTable[labelGeneralTotal] = fullPivotTable.sum(axis = 1) #Row-wise totals
    
    #Column-wise totals, appended as one extra row that keeps the index name
    columnsTotalsList = fullPivotTable.sum().tolist()
    totalsRowTable = pd.DataFrame({}, columns = fullPivotTable.columns, index = pd.Index([], name = indexColumn))
    totalsRowTable.loc[labelGeneralTotal] = columnsTotalsList
    fullPivotTable = pd.concat([fullPivotTable, totalsRowTable], axis = 0)
    
    #Every cell is a count, so store plain integers
    for column in fullPivotTable.columns:
        fullPivotTable[column] = [int(value) for value in fullPivotTable[column].tolist()]
    
    #Single pass over the columns; the totals row (last one) is left out of the three searches
    detailRowsCount = len(fullPivotTable) - 1
    cellsIndexesWithZeroValuesBySelectedColumns = {}
    cellsIndexesWithMaximumValueBySelectedColumns = {}
    cellsIndexesWithMinimumValueBySelectedColumns = {}
    for column in fullPivotTable.columns:
        detailValuesList = fullPivotTable[column].tolist()[:detailRowsCount]
        
        cellsIndexesWithZeroValuesBySelectedColumns[column] = [rowIndex for rowIndex, value in enumerate(detailValuesList) if value == 0]
        
        #The search for the maximum starts from -1, as counts are never negative
        maximumValue = max([-1] + detailValuesList)
        cellsIndexesWithMaximumValueBySelectedColumns[column] = [rowIndex for rowIndex, value in enumerate(detailValuesList) if value == maximumValue]
        
        #The minimum ignores zeros; a column made only of zeros yields an empty list
        minimumValue = min([float('inf')] + [value for value in detailValuesList if value != 0])
        cellsIndexesWithMinimumValueBySelectedColumns[column] = [rowIndex for rowIndex, value in enumerate(detailValuesList) if value == minimumValue]
    return fullPivotTable, cellsIndexesWithZeroValuesBySelectedColumns, cellsIndexesWithMaximumValueBySelectedColumns, cellsIndexesWithMinimumValueBySelectedColumns

def buildFrequenciesTableFor2VariablesWithMainGroupping(dataframeSource, valuesColumn, indexColumn, grouppingColumn, numberIntervalsRule = CantidadIntervalosAgrupacion.reglaPrincipal.value, integerGrouppingValues = False, labelGeneralTotal = "General Total"):
    """Build a two-way frequency table whose rows are intervals of a numerical variable.

    The values of ``indexColumn`` are split into equal-width intervals and the
    counts of ``valuesColumn`` are accumulated per interval and per value of
    ``grouppingColumn``. No totals row or column is appended, so every row of
    the returned table is an interval and every row is analysed.

    Parameters:
        dataframeSource: DataFrame holding the observations.
        valuesColumn: column whose non-null entries are counted in each cell.
        indexColumn: numerical column to group into intervals (missing values are ignored).
        grouppingColumn: column whose values become the columns of the table.
        numberIntervalsRule: ``"Sturges"`` (1 + log2(n)) or ``"Root"`` (sqrt(n)), n being
            the number of rows of ``dataframeSource``. The matching
            ``CantidadIntervalosAgrupacion`` member is accepted too. An even
            number of intervals is increased by one.
        integerGrouppingValues: when true, interval edges are rounded to integers.
        labelGeneralTotal: currently unused; kept for interface compatibility.

    Returns:
        A tuple ``(table, zeroCells, maximumCells, minimumCells)``:
        - ``table``: integer counts, one row per interval label such as ``"[a, b)"``;
          only the last interval is closed on the right.
        - ``zeroCells``, ``maximumCells``, ``minimumCells``: dictionaries mapping
          every column label to the list of row positions where the count is 0,
          is the column maximum, or is the smallest non-zero count. Ties are all listed.

    Raises:
        ValueError: if ``numberIntervalsRule`` is not a known rule, or if
            ``indexColumn`` holds no non-missing value.
    """
    #Accept the enum member as well as its string value, and reject anything else explicitly
    if isinstance(numberIntervalsRule, CantidadIntervalosAgrupacion):
        numberIntervalsRule = numberIntervalsRule.value
    knownRules = (CantidadIntervalosAgrupacion.reglaPrincipal.value, CantidadIntervalosAgrupacion.reglaAlternativa.value)
    if numberIntervalsRule not in knownRules:
        raise ValueError("Unknown numberIntervalsRule {!r}; expected one of {}".format(numberIntervalsRule, knownRules))
    
    #Missing values cannot be placed in any interval and would turn every edge into NaN
    observedIndexValues = dataframeSource[indexColumn].dropna()
    if len(observedIndexValues) == 0:
        raise ValueError("Column {!r} holds no value to group into intervals".format(indexColumn))
    
    #Counts per (distinct index value, column value); combinations never observed become 0
    fullPivotTable = pd.pivot_table(dataframeSource, values = valuesColumn, index = indexColumn, columns = grouppingColumn, aggfunc = 'count')
    fullPivotTable = fullPivotTable.fillna(0.0)
    
    #Range covered by the intervals
    if integerGrouppingValues:
        partitionStartValue = int(np.floor(observedIndexValues.min()))
        partitionFinalValue = int(np.ceil(observedIndexValues.max()))
    else:
        partitionStartValue = float(observedIndexValues.min())
        partitionFinalValue = float(observedIndexValues.max())
    
    #Number of intervals according to the selected rule, forced to be odd
    numberObservations = len(dataframeSource[valuesColumn])
    if numberIntervalsRule == CantidadIntervalosAgrupacion.reglaPrincipal.value:
        numberIntervals = int(1.0 + np.log2(numberObservations))
    else:
        numberIntervals = int(np.sqrt(numberObservations))
    if numberIntervals % 2 == 0:
        numberIntervals += 1
    
    #Interval edges (numberIntervals + 1 of them) and their labels
    partitionList = list(np.linspace(partitionStartValue, partitionFinalValue, numberIntervals + 1))
    if integerGrouppingValues:
        partitionList = [int(np.round(value, decimals = 0)) for value in partitionList]
    lastIntervalPosition = len(partitionList) - 2
    intervalsList = ["[" + str(partitionList[index]) + ", " + str(partitionList[index + 1]) + ")" for index in range(lastIntervalPosition)]
    intervalsList.append("[" + str(partitionList[lastIntervalPosition]) + ", " + str(partitionList[lastIntervalPosition + 1]) + "]")
    
    def findIntervalPosition(value):
        #Position of the interval containing value, or None when it lies outside every interval
        for position in range(lastIntervalPosition):
            if partitionList[position] <= value and value < partitionList[position + 1]:
                return position
        if partitionList[lastIntervalPosition] <= value and value <= partitionList[lastIntervalPosition + 1]:
            return lastIntervalPosition
        return None
    
    #Accumulate each pivot row into its interval; the interval is looked up once per row, not once per cell
    numberColumns = len(fullPivotTable.columns)
    countsByInterval = {interval: [0.0] * numberColumns for interval in intervalsList}
    for indexValue, rowCounts in zip(fullPivotTable.index, fullPivotTable.to_numpy().tolist()):
        intervalPosition = findIntervalPosition(indexValue)
        if intervalPosition is None:
            continue
        accumulatedCounts = countsByInterval[intervalsList[intervalPosition]]
        for columnPosition in range(numberColumns):
            accumulatedCounts[columnPosition] += rowCounts[columnPosition]
    
    fullPivotTable = pd.DataFrame(
        list(countsByInterval.values()),
        index = pd.Index(list(countsByInterval.keys()), name = fullPivotTable.index.name),
        columns = fullPivotTable.columns,
    )
    #NOTE(review): this looks like a leftover debugging display (callers already receive the table); kept so the notebook output does not change
    display(fullPivotTable)
    
    #Row and column totals are intentionally not appended here (the code that did so was disabled)
    fullPivotTable = fullPivotTable.astype(int)
    
    #Every row is an interval, so all rows are analysed (the last one used to be skipped as if it were a totals row)
    cellsIndexesWithZeroValuesBySelectedColumns = {}
    cellsIndexesWithMaximumValueBySelectedColumns = {}
    cellsIndexesWithMinimumValueBySelectedColumns = {}
    for column in fullPivotTable.columns:
        columnValuesList = fullPivotTable[column].tolist()
        
        cellsIndexesWithZeroValuesBySelectedColumns[column] = [rowIndex for rowIndex, value in enumerate(columnValuesList) if value == 0]
        
        maximumValue = max(columnValuesList, default = None)
        cellsIndexesWithMaximumValueBySelectedColumns[column] = [rowIndex for rowIndex, value in enumerate(columnValuesList) if value == maximumValue]
        
        #The minimum ignores zeros; a column made only of zeros yields an empty list
        minimumValue = min([value for value in columnValuesList if value != 0], default = None)
        cellsIndexesWithMinimumValueBySelectedColumns[column] = [rowIndex for rowIndex, value in enumerate(columnValuesList) if value == minimumValue]
    return fullPivotTable, cellsIndexesWithZeroValuesBySelectedColumns, cellsIndexesWithMaximumValueBySelectedColumns, cellsIndexesWithMinimumValueBySelectedColumns

def applyHypothesisTestForVariablesIndependence(frequencyTableWith2DetailLevels, indexColumn, valuesColumn, confidenceRatio = 0.95):
    """Run a chi-squared test of independence on a two-way frequency table.

    Parameters:
        frequencyTableWith2DetailLevels: table of observed counts whose last row
            and last column hold the totals; the bottom-right cell is the number
            of observations.
        indexColumn: name of the variable on the rows (used in the printed messages).
        valuesColumn: name of the variable on the columns (used in the printed messages).
        confidenceRatio: confidence level; the critical value is the chi-squared
            quantile at this probability.

    Returns:
        A tuple ``(decisionConfirmation, expectedTable)``:
        - ``decisionConfirmation``: ``False`` when the test statistic exceeds the
          critical value (independence rejected), ``True`` otherwise.
        - ``expectedTable``: copy of the input table in which the detail cells are
          replaced by the expected frequencies (row total * column total / total);
          the totals are kept.

    The function also prints, in Spanish, the test set-up, the statistics and the decision.
    """
    firstVariableNumberValues = len(frequencyTableWith2DetailLevels) - 1
    secondVariableNumberValues = len(frequencyTableWith2DetailLevels.columns) - 1
    numberTotalObservations = frequencyTableWith2DetailLevels.iloc[firstVariableNumberValues, secondVariableNumberValues]
    print("Se va a realizar una prueba de hipótesis de independencia de variables, para las variables", printColouredText(indexColumn, colorTextoVerde), "y", printColouredText(valuesColumn, colorTextoVerde), "con un valor de confianza de", printColouredText(confidenceRatio, colorTextoVerde))
    print("Se tiene un total de", printColouredText(numberTotalObservations, colorTextoVerde), "observaciones, la variable", printColouredText(indexColumn, colorTextoRojo), "toma", printColouredText(firstVariableNumberValues, colorTextoRojo), "valores posibles, y la variable", printColouredText(valuesColumn, colorTextoRojo), "toma", printColouredText(secondVariableNumberValues, colorTextoRojo), "valores posibles")
    
    #Observed counts and marginal totals as float arrays
    observedFrequencies = frequencyTableWith2DetailLevels.iloc[:firstVariableNumberValues, :secondVariableNumberValues].to_numpy(dtype = float)
    rowsTotals = frequencyTableWith2DetailLevels.iloc[:firstVariableNumberValues, secondVariableNumberValues].to_numpy(dtype = float)
    columnsTotals = frequencyTableWith2DetailLevels.iloc[firstVariableNumberValues, :secondVariableNumberValues].to_numpy(dtype = float)
    
    #Expected frequencies: (row total * column total) / number of observations
    expectedFrequencies = np.outer(rowsTotals, columnsTotals) / float(numberTotalObservations)
    
    #The detail columns are converted to float before the expected values are stored,
    #since writing fractions into integer columns relies on silent upcasting, deprecated in recent pandas
    detailColumnsLabels = frequencyTableWith2DetailLevels.columns[:secondVariableNumberValues]
    expectedFrequencyTableWith2DetailLevels = frequencyTableWith2DetailLevels.astype({columnLabel: float for columnLabel in detailColumnsLabels})
    expectedFrequencyTableWith2DetailLevels.iloc[:firstVariableNumberValues, :secondVariableNumberValues] = expectedFrequencies
    
    #Critical value and test statistic
    numberFreedomDegrees = (firstVariableNumberValues - 1) * (secondVariableNumberValues - 1)
    chiSquaredCriticalValue = scipy.stats.chi2.ppf(confidenceRatio, numberFreedomDegrees)
    #A cell with expected frequency 0 belongs to an empty row or column, so its observed count is 0 too;
    #it is left out of the sum, where 0 / 0 would make the statistic NaN and the comparison below meaningless
    cellsWithExpectedFrequency = expectedFrequencies > 0
    chiSquaredTestValue = np.sum(np.power(observedFrequencies[cellsWithExpectedFrequency] - expectedFrequencies[cellsWithExpectedFrequency], 2) / expectedFrequencies[cellsWithExpectedFrequency])
    print("Se tiene(n)", printColouredText(numberFreedomDegrees, colorTextoVerde), "grado(s) de libertad, un valor estadístico crítico de", printColouredText(chiSquaredCriticalValue, colorTextoVerde), "y un valor estadístico de prueba de", printColouredText(chiSquaredTestValue, colorTextoVerde))
    
    #Decision: independence is rejected only when the statistic exceeds the critical value
    decisionConfirmation = True
    if chiSquaredCriticalValue < chiSquaredTestValue:
        print("Como el valor de prueba supera el valor crítico, por lo tanto se", printColouredText("rechaza", colorTextoRojo), "la hipótesis nula de independencia de las variables, es decir, se tiene dependencia entre las variables", printColouredText(indexColumn, colorTextoRojo), "y", printColouredText(valuesColumn, colorTextoRojo))
        decisionConfirmation = False
    else:
        print("Como el valor de prueba no supera el valor crítico, por lo tanto se", printColouredText("acepta", colorTextoRojo), "la hipótesis nula de independencia de las variables, es decir, se tiene independencia entre las variables", printColouredText(indexColumn, colorTextoRojo), "y", printColouredText(valuesColumn, colorTextoRojo))
    return decisionConfirmation, expectedFrequencyTableWith2DetailLevels

def plotFrequenciesProportionFor1DetailLevel(frequencyTableWith1DetailLevel, indexColumn, labelAbsoluteFrequency = "Absolute Frequency", labelRelativeFrequency = "Relative Frequency"):
    """Draw a pie chart of a one-variable frequency table with pyplot.

    Each sector is labelled with its absolute frequency and its percentage, and
    a legend lists the values of the variable. The function neither creates a
    figure nor calls ``plt.show()``.

    Parameters:
        frequencyTableWith1DetailLevel: frequency table whose last row holds the
            totals; that row is not plotted.
        indexColumn: name of the variable, used in the title and as legend title.
        labelAbsoluteFrequency: name of the absolute-frequency column.
        labelRelativeFrequency: name of the relative-frequency column (fractions of 1).
    """
    variableNumberValues = len(frequencyTableWith1DetailLevel) - 1
    
    #Colours of the sectors, as hexadecimal strings
    if variableNumberValues <= len(px.colors.qualitative.Pastel1):
        #The plotly palette is made of "rgb(r, g, b)" strings
        barsColorsListInRGB = [color.replace("rgb(", "").replace(")", "").split(",") for color in px.colors.qualitative.Pastel1]
        barsColorsListInHexadecimal = ["#{:02x}{:02x}{:02x}".format(int(component[0]), int(component[1]), int(component[2])) for component in barsColorsListInRGB]
    else:
        #More values than palette entries: sample the 'turbo' colormap evenly.
        #plt.get_cmap is used because matplotlib.cm.get_cmap was removed in Matplotlib 3.9
        colorsMap = plt.get_cmap('turbo')
        barsColorsListInRGB = [colorsMap(number) for number in list(np.linspace(0.0, 1.0, num = variableNumberValues))]
        barsColorsListInHexadecimal = ["#{:02x}{:02x}{:02x}".format(int(255 * component[0]), int(255 * component[1]), int(255 * component[2])) for component in barsColorsListInRGB]
    
    plotTitle = "Proporción y Distribución de Frecuencias de\n" + indexColumn
    #Totals row excluded from both lists
    absoluteFrequenciesList = frequencyTableWith1DetailLevel[labelAbsoluteFrequency].tolist()[:variableNumberValues]
    relativeFrequenciesList = frequencyTableWith1DetailLevel[labelRelativeFrequency].tolist()[:variableNumberValues]
    labelsList = [str(absoluteFrequency) + " (" + str(np.round(relativeFrequency * 100, decimals = 3)) + " %)" for absoluteFrequency, relativeFrequency in zip(absoluteFrequenciesList, relativeFrequenciesList)]
    plt.pie(absoluteFrequenciesList, labels = labelsList, colors = barsColorsListInHexadecimal, wedgeprops = {"linewidth": 1.0, "edgecolor": "black"})
    plt.title(plotTitle)
    plt.legend(frequencyTableWith1DetailLevel.index[:variableNumberValues], title = indexColumn, bbox_to_anchor = (1.35, 0.5), loc = "center left")
    plt.tight_layout()

def plotFrequenciesDistributionFor2DetailLevels(frequencyTableWith2DetailLevels, indexColumn, valuesColumn, barWidth = 1.0, barOffset = 0.0, barPlotOriginalOrientation = True, labelFrequency = "Frequency"):
    """Draw a grouped bar chart from a two-variable frequency table.

    The chart is drawn on the current matplotlib figure; creating the figure
    and calling ``plt.show()`` is left to the caller.

    Parameters
    ----------
    frequencyTableWith2DetailLevels : pandas.DataFrame
        Frequency table with one variable on the rows and the other on the
        columns. The last row and the last column are left out of the chart
        (they hold the general totals in the tables displayed by this notebook).
    indexColumn : str
        Name used to label the variable spread over the table columns.
    valuesColumn : str
        Name used to label the variable spread over the table rows.
    barWidth : float
        Width of every bar; also the size of one slot on the x axis.
    barOffset : float
        Extra horizontal space added between consecutive groups of bars.
    barPlotOriginalOrientation : bool
        When equal to True, each group on the x axis is a row value and each
        colour is a column value. Otherwise each group is a column value and
        each colour is a row value.
    labelFrequency : str
        Currently unused: the y axis label is fixed to "Frecuencia". Kept so
        existing calls stay valid.

    Returns
    -------
    None
    """
    firstVariableNumberValues = len(frequencyTableWith2DetailLevels) - 1
    secondVariableNumberValues = len(frequencyTableWith2DetailLevels.columns) - 1
    firstVariableValuesList = list(frequencyTableWith2DetailLevels.index)[:firstVariableNumberValues]
    secondVariableValuesList = list(frequencyTableWith2DetailLevels.columns)[:secondVariableNumberValues]

    # Table without its last row and last column, addressed by position so that
    # repeated labels cannot change the shape of what is read.
    observedFrequencies = frequencyTableWith2DetailLevels.iloc[:firstVariableNumberValues, :secondVariableNumberValues]

    # Pick which variable is coloured (legend) and which one forms the groups on the x axis.
    if barPlotOriginalOrientation == True:
        legendLabelsValuesList = secondVariableValuesList
        plotXLabelsValuesList = firstVariableValuesList
        # One list of heights per column value: that column read down the rows.
        barsHeightsLists = [observedFrequencies.iloc[:, columnPosition].tolist() for columnPosition in range(len(secondVariableValuesList))]
        plotTitle = "Distribución de Frecuencias de " + valuesColumn + "\ncon respecto a " + indexColumn
        plotXLabel = valuesColumn
        legendTitle = indexColumn
    else:
        legendLabelsValuesList = firstVariableValuesList
        plotXLabelsValuesList = secondVariableValuesList
        # One list of heights per row value: that row read across the columns.
        barsHeightsLists = [observedFrequencies.iloc[rowPosition].tolist() for rowPosition in range(len(firstVariableValuesList))]
        plotTitle = "Distribución de Frecuencias de " + indexColumn + "\ncon respecto a " + valuesColumn
        plotXLabel = indexColumn
        legendTitle = valuesColumn

    legendNumberValues = len(legendLabelsValuesList)
    groupsNumber = len(plotXLabelsValuesList)
    # Every group takes one slot per bar plus one empty slot that separates it from the next group.
    slotsPerGroup = legendNumberValues + 1

    # Bar positions: bar `legendIndex` of group `groupIndex`.
    barsPositionsLists = [[(groupIndex * slotsPerGroup + legendIndex) * barWidth + groupIndex * barOffset for groupIndex in range(groupsNumber)] for legendIndex in range(legendNumberValues)]

    # Tick positions: midpoint between the first slot of the group and its separator slot.
    # NOTE(review): this places the tick half a bar to the right of the centre of
    # the drawn bars; kept as in the original so existing figures do not change.
    plotXLabelsPositionsList = [np.mean([(groupIndex * slotsPerGroup) * barWidth + groupIndex * barOffset, ((groupIndex + 1) * slotsPerGroup - 1) * barWidth + groupIndex * barOffset]) for groupIndex in range(groupsNumber)]

    # Colour range for the bars: the qualitative Pastel1 palette while it has
    # enough entries, otherwise evenly spaced samples of the "turbo" colormap.
    qualitativePalette = px.colors.qualitative.Pastel1
    if legendNumberValues <= len(qualitativePalette):
        # Palette entries are "rgb(r, g, b)" strings; strip the wrapper and split the components.
        paletteComponents = [color.replace("rgb(", "").replace(")", "").split(",") for color in qualitativePalette]
        barsColorsListInHexadecimal = ["#{:02x}{:02x}{:02x}".format(int(component[0]), int(component[1]), int(component[2])) for component in paletteComponents]
    else:
        # matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap is
        # available in both old and current releases.
        colorsMap = plt.get_cmap("turbo")
        sampledColors = [colorsMap(number) for number in np.linspace(0.0, 1.0, num = legendNumberValues)]
        # Colormap samples are floats in [0, 1]; scale them to 0-255 before formatting.
        barsColorsListInHexadecimal = ["#{:02x}{:02x}{:02x}".format(int(255 * component[0]), int(255 * component[1]), int(255 * component[2])) for component in sampledColors]

    # The grid is drawn below the bars (zorder 0 against 3).
    plt.grid(True, zorder = 0)
    for legendIndex, value in enumerate(legendLabelsValuesList):
        plt.bar(barsPositionsLists[legendIndex], barsHeightsLists[legendIndex], color = barsColorsListInHexadecimal[legendIndex], edgecolor = "black", width = barWidth, label = value, zorder = 3)
    plt.title(plotTitle)
    plt.xlabel(plotXLabel)
    plt.ylabel("Frecuencia")
    plt.xticks(plotXLabelsPositionsList, plotXLabelsValuesList)
    plt.legend(title = legendTitle, bbox_to_anchor = (1.0, 0.5), loc = "center left", borderaxespad = 1.0)
    plt.tight_layout()

In [4]:
# Read the file that describes the dataset structure (semicolon-separated).
structureDatasetFile = pd.read_csv(nombreArchivoEstructuraDataset, sep = ";", decimal = ".")
# Empty "Additional Tags" cells are read as NaN; turn them into empty strings so
# the tags can later be split as text. The result is assigned back explicitly:
# calling fillna(..., inplace = True) on the selected column is a chained
# assignment, which warns on pandas 2.x and does not update the frame under Copy-on-Write.
structureDatasetFile["Additional Tags"] = structureDatasetFile["Additional Tags"].fillna("")
fieldNamesList = structureDatasetFile["Field Name"].tolist()
additionalTagsList = structureDatasetFile["Additional Tags"].tolist()
respectiveVariableKindsList = structureDatasetFile["Variable Kind"].tolist()

# Report which file was read; the variable count printed is the number of rows minus one.
print("Se ha extraído la información del archivo", printColouredText(nombreArchivoEstructuraDataset, colorTextoRojo))
print("Se tiene(n)", printColouredText(str(len(structureDatasetFile) - 1) + " variable(s) predictora(s)", colorTextoVerde))

Se ha extraído la información del archivo [48;2;255;255;255m[38;2;176;58;46m./Estructura del Dataset.txt[38;2;0;0;0m[48;2;255;255;255m
Se tiene(n) [48;2;255;255;255m[38;2;35;155;86m21 variable(s) predictora(s)[38;2;0;0;0m[48;2;255;255;255m


In [5]:
# Render the dataset structure table read from the structure file.
display(structureDatasetFile)

Unnamed: 0,Field Number,Field Name,Variable Kind,Number of Values per Observation,Additional Tags
0,1,titleValue,Continuous Numerical,Single Value,
1,2,colorMode,Categorical,Single Value,
2,3,qualifyingSystem,Categorical,Single Value,
3,4,aspectRatio,Continuous Numerical,Single Value,
4,5,releaseYear,Discrete Numerical,Single Value,Grouped
5,6,budget,Continuous Numerical,Single Value,
6,7,movieFacebookNumberLikes,Discrete Numerical,Single Value,Grouped
7,8,numberCriticsForReview,Discrete Numerical,Single Value,Grouped
8,9,numberUsersReviews,Discrete Numerical,Single Value,Grouped
9,10,numberUsersVotes,Discrete Numerical,Single Value,Grouped


In [6]:
# Read the dataset itself (comma-separated, "." as decimal mark).
datasetFile = pd.read_csv(nombreArchivoDataset, sep = ",", decimal = ".")

# Report which file was read and how many records (rows) it holds.
print("Se ha extraído la información del archivo", printColouredText(nombreArchivoDataset, colorTextoRojo))
print("Se tiene(n)", printColouredText(str(datasetFile.shape[0]) + " registros(s)", colorTextoVerde))

Se ha extraído la información del archivo [48;2;255;255;255m[38;2;176;58;46m./IMDBMovies10ClassesIncompleto.csv[38;2;0;0;0m[48;2;255;255;255m
Se tiene(n) [48;2;255;255;255m[38;2;35;155;86m5043 registros(s)[38;2;0;0;0m[48;2;255;255;255m


In [7]:
# Column summary of the dataset as loaded (printed before the extra column below is added).
datasetFile.info()

# Add a copy of the output column under a second name; the frequency-table
# builders in the following cells receive both column names.
datasetFile[nombreColumnaDuplicadaSalida] = datasetFile[nombreColumnaSalida].to_list()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5043 entries, 0 to 5042
Data columns (total 22 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   titleValue                      5043 non-null   float64
 1   colorMode                       5043 non-null   object 
 2   qualifyingSystem                5043 non-null   object 
 3   aspectRatio                     5043 non-null   float64
 4   releaseYear                     5043 non-null   int64  
 5   budget                          5043 non-null   int64  
 6   movieFacebookNumberLikes        5043 non-null   int64  
 7   numberCriticsForReview          5043 non-null   int64  
 8   numberUsersReviews              5043 non-null   int64  
 9   numberUsersVotes                5043 non-null   int64  
 10  numberFacesInPoster             5043 non-null   int64  
 11  duration                        5043 non-null   int64  
 12  directorFacebookNumberLikes     50

In [8]:
%matplotlib notebook

tablaFrecuenciasColumnaSalida, cellsIndexesWithMaximumValueBySelectedColumns, cellsIndexesWithMinimumValueBySelectedColumns = buildFrequenciesTableFor1Variable(datasetFile, nombreColumnaSalida, labelAbsoluteFrequency = etiquetaFrecuenciaAbsoluta, labelRelativeFrequency = etiquetaFrecuenciaRelativa, labelGeneralTotal = etiquetaTotalGeneral)
display(tablaFrecuenciasColumnaSalida.style.apply(highlightFrequenciesTableFor1DetailLevel, axis = None))
numeroValoresColumnaSalida = len(tablaFrecuenciasColumnaSalida) - 1

plt.figure(num = 1, figsize = (9.5, 4.5))
plotFrequenciesProportionFor1DetailLevel(tablaFrecuenciasColumnaSalida, nombreColumnaSalida, labelAbsoluteFrequency = etiquetaFrecuenciaAbsoluta, labelRelativeFrequency = etiquetaFrecuenciaRelativa)
plt.show()

Unnamed: 0_level_0,Frecuencia Absoluta,Frecuencia Relativa
correctMovieExpectations,Unnamed: 1_level_1,Unnamed: 2_level_1
1 Punto de Rating,16,0.003173
10 Puntos de Rating,21,0.004164
2 Puntos de Rating,38,0.007535
3 Puntos de Rating,102,0.020226
4 Puntos de Rating,222,0.044021
5 Puntos de Rating,579,0.114813
6 Puntos de Rating,1158,0.229625
7 Puntos de Rating,1515,0.300416
8 Puntos de Rating,1085,0.21515
9 Puntos de Rating,307,0.060876


<IPython.core.display.Javascript object>

In [9]:
# Prints, for every categorical or discrete predictor, the source code of one
# analysis step: build the two-variable frequency table, display it, run the
# independence test against the output column and draw the grouped bar chart.
# Nothing is executed here; only the text is printed.
pivotTablesList = {}

variableIndex = 1
for nombreColumna in datasetFile.columns:
    # The output column and its duplicate are not predictors: skip them without
    # advancing the variable counter.
    if nombreColumna in (nombreColumnaSalida.replace("_", " ").replace("-", " "), nombreColumnaDuplicadaSalida.replace("_", " ").replace("-", " ")):
        continue

    columnIndex = fieldNamesList.index(nombreColumna)
    variableKind = respectiveVariableKindsList[columnIndex]
    variableAdditionalTags = additionalTagsList[columnIndex].split(caracterSeparador)

    # Continuous variables get no generated code, but they still advance the counter.
    if variableKind in (TipoVariable.variableCategorica.value, TipoVariable.variableCuantitativaDiscreta.value):
        # Only the table-building line depends on whether the variable is tagged as grouped.
        if EtiquetaAdicionalVariables.variableDatosAgrupados.value in variableAdditionalTags:
            print("pivotTablesList[\"" + nombreColumna + "\"], cellsIndexesWithZeroValuesBySelectedColumns, cellsIndexesWithMaximumValueBySelectedColumns, cellsIndexesWithMinimumValueBySelectedColumns = buildFrequenciesTableFor2VariablesWithMainGroupping(datasetFile, nombreColumnaDuplicadaSalida, \"" + nombreColumna.replace(" ", "_").replace("-", "_") + "\", nombreColumnaSalida, numberIntervalsRule = CantidadIntervalosAgrupacion.reglaPrincipal.value, integerGrouppingValues = True)")
        else:
            print("pivotTablesList[\"" + nombreColumna + "\"], cellsIndexesWithZeroValuesBySelectedColumns, cellsIndexesWithMaximumValueBySelectedColumns, cellsIndexesWithMinimumValueBySelectedColumns = buildFrequenciesTableFor2DiscreteVariables(datasetFile, nombreColumnaDuplicadaSalida, \"" + nombreColumna.replace(" ", "_").replace("-", "_") + "\", nombreColumnaSalida)")

        # The rest of the generated step is the same for both cases.
        print("display(pivotTablesList[\"" + nombreColumna + "\"].style.apply(highlightFrequenciesTableFor2DetailLevels, axis = None))\n")
        print("variablesIndependenceDecisionConfirmation, expectedFrequencyTableWithBothDetailLevels = applyHypothesisTestForVariablesIndependence(pivotTablesList[\"" + nombreColumna + "\"], \"" + nombreColumna + "\", nombreColumnaSalida, confidenceRatio = confidenceRatioForIndependeceHypothesisTest)")
        print("if variablesIndependenceDecisionConfirmation == True:")
        print("\toutputIndependentVariableslist.append(\"" + nombreColumna + "\")")
        print("else:")
        print("\toutputDependentVariableslist.append(\"" + nombreColumna + "\")\n")
        # The figure number is the counter plus one (figure 1 is used by the pie chart of the output column).
        print("plt.figure(num = " + str(variableIndex + 1) + ", figsize = (9.5, 10))")
        print("plotFrequenciesDistributionFor2DetailLevels(pivotTablesList[\"" + nombreColumna + "\"], nombreColumnaSalida, \"" + nombreColumna + "\", barWidth = 1.0, barOffset = 3.5, barPlotOriginalOrientation = False)")
        print("plt.show()")
        print("\n" + printColouredText("-" * 100, colorTextoRojo) + "\n")

    variableIndex += 1

# Result lists filled by the analysis cells that follow.
outputIndependentVariableslist = []
outputDependentVariableslist = []

pivotTablesList["colorMode"], cellsIndexesWithZeroValuesBySelectedColumns, cellsIndexesWithMaximumValueBySelectedColumns, cellsIndexesWithMinimumValueBySelectedColumns = buildFrequenciesTableFor2DiscreteVariables(datasetFile, nombreColumnaDuplicadaSalida, "colorMode", nombreColumnaSalida)
display(pivotTablesList["colorMode"].style.apply(highlightFrequenciesTableFor2DetailLevels, axis = None))

variablesIndependenceDecisionConfirmation, expectedFrequencyTableWithBothDetailLevels = applyHypothesisTestForVariablesIndependence(pivotTablesList["colorMode"], "colorMode", nombreColumnaSalida, confidenceRatio = confidenceRatioForIndependeceHypothesisTest)
if variablesIndependenceDecisionConfirmation == True:
	outputIndependentVariableslist.append("colorMode")
else:
	outputDependentVariableslist.append("colorMode")

plt.figure(num = 3, figsize = (9.5, 10))
plotFrequenciesDistributionFor2DetailLevels(pivotTablesList["colorMode"], nombreColumnaSalida, "colorMode", barWidth = 1.0, barOffset = 3.5,

In [10]:
# Two-variable frequency table of colorMode against the output column.
(
    pivotTablesList["colorMode"],
    cellsIndexesWithZeroValuesBySelectedColumns,
    cellsIndexesWithMaximumValueBySelectedColumns,
    cellsIndexesWithMinimumValueBySelectedColumns,
) = buildFrequenciesTableFor2DiscreteVariables(datasetFile, nombreColumnaDuplicadaSalida, "colorMode", nombreColumnaSalida)
display(pivotTablesList["colorMode"].style.apply(highlightFrequenciesTableFor2DetailLevels, axis = None))

# Independence test between colorMode and the output column.
variablesIndependenceDecisionConfirmation, expectedFrequencyTableWithBothDetailLevels = applyHypothesisTestForVariablesIndependence(
    pivotTablesList["colorMode"],
    "colorMode",
    nombreColumnaSalida,
    confidenceRatio = confidenceRatioForIndependeceHypothesisTest,
)
# File the variable under the list that matches the decision.
# NOTE(review): re-running this cell appends the name again.
(
    outputIndependentVariableslist
    if variablesIndependenceDecisionConfirmation == True
    else outputDependentVariableslist
).append("colorMode")

# Grouped bar chart of the table on figure 3.
plt.figure(num = 3, figsize = (9.5, 10))
plotFrequenciesDistributionFor2DetailLevels(
    pivotTablesList["colorMode"],
    nombreColumnaSalida,
    "colorMode",
    barWidth = 1.0,
    barOffset = 3.5,
    barPlotOriginalOrientation = False,
)
plt.show()

correctMovieExpectations,1 Punto de Rating,10 Puntos de Rating,2 Puntos de Rating,3 Puntos de Rating,4 Puntos de Rating,5 Puntos de Rating,6 Puntos de Rating,7 Puntos de Rating,8 Puntos de Rating,9 Puntos de Rating,General Total
colorMode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Black and White,0,3,0,0,3,7,27,52,67,50,209
Color,16,16,37,102,217,568,1126,1459,1017,257,4815
Not Reported,0,2,1,0,2,4,5,4,1,0,19
General Total,16,21,38,102,222,579,1158,1515,1085,307,5043


Se va a realizar una prueba de hipótesis de independencia de variables, para las variables [48;2;255;255;255m[38;2;35;155;86mcolorMode[38;2;0;0;0m[48;2;255;255;255m y [48;2;255;255;255m[38;2;35;155;86mcorrectMovieExpectations[38;2;0;0;0m[48;2;255;255;255m con un valor de confianza de [48;2;255;255;255m[38;2;35;155;86m0.99[38;2;0;0;0m[48;2;255;255;255m
Se tiene un total de [48;2;255;255;255m[38;2;35;155;86m5043[38;2;0;0;0m[48;2;255;255;255m observaciones, la variable [48;2;255;255;255m[38;2;176;58;46mcolorMode[38;2;0;0;0m[48;2;255;255;255m toma [48;2;255;255;255m[38;2;176;58;46m3[38;2;0;0;0m[48;2;255;255;255m valores posibles, y la variable [48;2;255;255;255m[38;2;176;58;46mcorrectMovieExpectations[38;2;0;0;0m[48;2;255;255;255m toma [48;2;255;255;255m[38;2;176;58;46m10[38;2;0;0;0m[48;2;255;255;255m valores posibles
Se tiene(n) [48;2;255;255;255m[38;2;35;155;86m18[38;2;0;0;0m[48;2;255;255;255m grado(s) de libertad, un valor estadístico crítico de [48;

<IPython.core.display.Javascript object>

In [11]:
# Two-variable frequency table of qualifyingSystem against the output column.
(
    pivotTablesList["qualifyingSystem"],
    cellsIndexesWithZeroValuesBySelectedColumns,
    cellsIndexesWithMaximumValueBySelectedColumns,
    cellsIndexesWithMinimumValueBySelectedColumns,
) = buildFrequenciesTableFor2DiscreteVariables(datasetFile, nombreColumnaDuplicadaSalida, "qualifyingSystem", nombreColumnaSalida)
display(pivotTablesList["qualifyingSystem"].style.apply(highlightFrequenciesTableFor2DetailLevels, axis = None))

# Independence test between qualifyingSystem and the output column.
variablesIndependenceDecisionConfirmation, expectedFrequencyTableWithBothDetailLevels = applyHypothesisTestForVariablesIndependence(
    pivotTablesList["qualifyingSystem"],
    "qualifyingSystem",
    nombreColumnaSalida,
    confidenceRatio = confidenceRatioForIndependeceHypothesisTest,
)
# File the variable under the list that matches the decision.
# NOTE(review): re-running this cell appends the name again.
(
    outputIndependentVariableslist
    if variablesIndependenceDecisionConfirmation == True
    else outputDependentVariableslist
).append("qualifyingSystem")

# Grouped bar chart of the table on figure 4.
plt.figure(num = 4, figsize = (9.5, 10))
plotFrequenciesDistributionFor2DetailLevels(
    pivotTablesList["qualifyingSystem"],
    nombreColumnaSalida,
    "qualifyingSystem",
    barWidth = 1.0,
    barOffset = 3.5,
    barPlotOriginalOrientation = False,
)
plt.show()

correctMovieExpectations,1 Punto de Rating,10 Puntos de Rating,2 Puntos de Rating,3 Puntos de Rating,4 Puntos de Rating,5 Puntos de Rating,6 Puntos de Rating,7 Puntos de Rating,8 Puntos de Rating,9 Puntos de Rating,General Total
qualifyingSystem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Approved,0,1,0,0,1,0,2,24,28,8,64
G,1,0,4,1,6,8,24,25,25,18,112
M,0,0,0,0,0,0,2,2,0,1,5
NC-17,0,0,0,0,1,0,2,1,3,0,7
Not Rated,1,3,5,17,25,56,77,115,136,46,481
PG,4,1,5,20,36,111,146,205,139,40,707
PG-13,7,5,11,36,73,194,408,434,242,51,1461
R,3,6,13,26,79,203,490,700,478,120,2118
TV-14,0,0,0,2,0,1,2,3,14,8,30
TV-G,0,0,0,0,0,2,2,1,2,3,10


Se va a realizar una prueba de hipótesis de independencia de variables, para las variables [48;2;255;255;255m[38;2;35;155;86mqualifyingSystem[38;2;0;0;0m[48;2;255;255;255m y [48;2;255;255;255m[38;2;35;155;86mcorrectMovieExpectations[38;2;0;0;0m[48;2;255;255;255m con un valor de confianza de [48;2;255;255;255m[38;2;35;155;86m0.99[38;2;0;0;0m[48;2;255;255;255m
Se tiene un total de [48;2;255;255;255m[38;2;35;155;86m5043[38;2;0;0;0m[48;2;255;255;255m observaciones, la variable [48;2;255;255;255m[38;2;176;58;46mqualifyingSystem[38;2;0;0;0m[48;2;255;255;255m toma [48;2;255;255;255m[38;2;176;58;46m15[38;2;0;0;0m[48;2;255;255;255m valores posibles, y la variable [48;2;255;255;255m[38;2;176;58;46mcorrectMovieExpectations[38;2;0;0;0m[48;2;255;255;255m toma [48;2;255;255;255m[38;2;176;58;46m10[38;2;0;0;0m[48;2;255;255;255m valores posibles
Se tiene(n) [48;2;255;255;255m[38;2;35;155;86m126[38;2;0;0;0m[48;2;255;255;255m grado(s) de libertad, un valor estadístico 

<IPython.core.display.Javascript object>

In [12]:
# Two-variable frequency table of releaseYear (grouped into integer intervals
# with the main number-of-intervals rule) against the output column.
pivotTablesList["releaseYear"], cellsIndexesWithZeroValuesBySelectedColumns, cellsIndexesWithMaximumValueBySelectedColumns, cellsIndexesWithMinimumValueBySelectedColumns = buildFrequenciesTableFor2VariablesWithMainGroupping(datasetFile, nombreColumnaDuplicadaSalida, "releaseYear", nombreColumnaSalida, numberIntervalsRule = CantidadIntervalosAgrupacion.reglaPrincipal.value, integerGrouppingValues = True)

display(pivotTablesList["releaseYear"].style.apply(highlightFrequenciesTableFor2DetailLevels, axis = None))

# NOTE(review): the independence test and the grouped bar chart for releaseYear
# are disabled. They were previously wrapped in a bare triple-quoted string,
# which made that string the last expression of the cell and echoed it as the
# cell output; they are kept as real comments instead. The reason for disabling
# them is not recorded here - TODO confirm before re-enabling.
#
# variablesIndependenceDecisionConfirmation, expectedFrequencyTableWithBothDetailLevels = applyHypothesisTestForVariablesIndependence(pivotTablesList["releaseYear"], "releaseYear", nombreColumnaSalida, confidenceRatio = confidenceRatioForIndependeceHypothesisTest)
# if variablesIndependenceDecisionConfirmation == True:
#     outputIndependentVariableslist.append("releaseYear")
# else:
#     outputDependentVariableslist.append("releaseYear")
#
# plt.figure(num = 6, figsize = (9.5, 10))
# plotFrequenciesDistributionFor2DetailLevels(pivotTablesList["releaseYear"], nombreColumnaSalida, "releaseYear", barWidth = 1.0, barOffset = 3.5, barPlotOriginalOrientation = False)
# plt.show()

correctMovieExpectations,1 Punto de Rating,10 Puntos de Rating,2 Puntos de Rating,3 Puntos de Rating,4 Puntos de Rating,5 Puntos de Rating,6 Puntos de Rating,7 Puntos de Rating,8 Puntos de Rating,9 Puntos de Rating
releaseYear,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"[1916, 1924)",0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
"[1924, 1932)",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,3.0
"[1932, 1939)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,4.0,3.0
"[1939, 1947)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,6.0,8.0
"[1947, 1955)",0.0,0.0,0.0,0.0,0.0,0.0,1.0,9.0,10.0,5.0
"[1955, 1963)",0.0,1.0,0.0,0.0,1.0,0.0,2.0,1.0,11.0,11.0
"[1963, 1970)",0.0,1.0,0.0,0.0,1.0,1.0,4.0,18.0,17.0,15.0
"[1970, 1978)",0.0,2.0,0.0,1.0,0.0,3.0,10.0,20.0,28.0,18.0
"[1978, 1986)",0.0,1.0,1.0,2.0,10.0,14.0,33.0,57.0,61.0,22.0
"[1986, 1994)",0.0,1.0,2.0,4.0,8.0,24.0,57.0,71.0,77.0,21.0


correctMovieExpectations,1 Punto de Rating,10 Puntos de Rating,2 Puntos de Rating,3 Puntos de Rating,4 Puntos de Rating,5 Puntos de Rating,6 Puntos de Rating,7 Puntos de Rating,8 Puntos de Rating,9 Puntos de Rating
releaseYear,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
"[1916, 1924)",0,0,0,0,0,1,0,0,0,1
"[1924, 1932)",0,0,0,0,0,0,1,0,1,3
"[1932, 1939)",0,0,0,0,0,0,0,4,4,3
"[1939, 1947)",0,0,0,0,0,0,0,6,6,8
"[1947, 1955)",0,0,0,0,0,0,1,9,10,5
"[1955, 1963)",0,1,0,0,1,0,2,1,11,11
"[1963, 1970)",0,1,0,0,1,1,4,18,17,15
"[1970, 1978)",0,2,0,1,0,3,10,20,28,18
"[1978, 1986)",0,1,1,2,10,14,33,57,61,22
"[1986, 1994)",0,1,2,4,8,24,57,71,77,21


'\nvariablesIndependenceDecisionConfirmation, expectedFrequencyTableWithBothDetailLevels = applyHypothesisTestForVariablesIndependence(pivotTablesList["releaseYear"], "releaseYear", nombreColumnaSalida, confidenceRatio = confidenceRatioForIndependeceHypothesisTest)\nif variablesIndependenceDecisionConfirmation == True:\n    outputIndependentVariableslist.append("releaseYear")\nelse:\n    outputDependentVariableslist.append("releaseYear")\n\nplt.figure(num = 6, figsize = (9.5, 10))\nplotFrequenciesDistributionFor2DetailLevels(pivotTablesList["releaseYear"], nombreColumnaSalida, "releaseYear", barWidth = 1.0, barOffset = 3.5, barPlotOriginalOrientation = False)\nplt.show()\n'

In [13]:
# Summary of the predictors classified as independent of the output variable.
if outputIndependentVariableslist:
    print("La(s) variable(s) independiente(s) respecto a la variable de salida", printColouredText(nombreColumnaSalida, colorTextoVerde), "son:")
    # Numbered list, starting at 1.
    for indice, nombreVariable in enumerate(outputIndependentVariableslist):
        print(str(indice + 1) + ":", printColouredText(nombreVariable, colorTextoVerde))
    print("\nSe tiene(n)", printColouredText(str(len(outputIndependentVariableslist)), colorTextoVerde), "variable(s) independiente(s) respecto a la variable de salida", printColouredText(nombreColumnaSalida, colorTextoVerde))
else:
    print("No se tienen variables independientes respecto a la variable de salida", printColouredText(nombreColumnaSalida, colorTextoVerde))

No se tienen variables independientes respecto a la variable de salida [48;2;255;255;255m[38;2;35;155;86mcorrectMovieExpectations[38;2;0;0;0m[48;2;255;255;255m


In [14]:
# Summary of the predictors classified as dependent on the output variable.
if outputDependentVariableslist:
    print("La(s) variable(s) dependiente(s) respecto a la variable de salida", printColouredText(nombreColumnaSalida, colorTextoRojo), "son:")
    # Numbered list, starting at 1.
    for indice, nombreVariable in enumerate(outputDependentVariableslist):
        print(str(indice + 1) + ":", printColouredText(nombreVariable, colorTextoRojo))
    print("\nSe tiene(n)", printColouredText(str(len(outputDependentVariableslist)), colorTextoRojo), "variable(s) dependiente(s) respecto a la variable de salida", printColouredText(nombreColumnaSalida, colorTextoRojo))
else:
    print("No se tienen variables dependientes respecto a la variable de salida", printColouredText(nombreColumnaSalida, colorTextoRojo))

La(s) variable(s) dependiente(s) respecto a la variable de salida [48;2;255;255;255m[38;2;176;58;46mcorrectMovieExpectations[38;2;0;0;0m[48;2;255;255;255m son:
1: [48;2;255;255;255m[38;2;176;58;46mcolorMode[38;2;0;0;0m[48;2;255;255;255m
2: [48;2;255;255;255m[38;2;176;58;46mqualifyingSystem[38;2;0;0;0m[48;2;255;255;255m

Se tiene(n) [48;2;255;255;255m[38;2;176;58;46m2[38;2;0;0;0m[48;2;255;255;255m variable(s) dependiente(s) respecto a la variable de salida [48;2;255;255;255m[38;2;176;58;46mcorrectMovieExpectations[38;2;0;0;0m[48;2;255;255;255m
