# TFG analysis

In [None]:
# Import functions
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import fisher_exact
import rpy2.robjects as ro
from rpy2.robjects import pandas2ri, default_converter
from rpy2.robjects.conversion import localconverter
from rpy2.robjects.packages import importr


## Read data and process frequencies

In [None]:
# Load the raw table from TFG_data_GitHub.csv; row index 1 of the file is
# used as the header row and blank lines are skipped.
df0 = pd.read_csv(
    'TFG_data_GitHub.csv',
    skip_blank_lines=True,
    header=1,
)

In [None]:
df0

## Clean the table (remove blanks, "/", numbers, etc)

In [None]:
# Names of the six quadrant columns that are analysed
quadrants_to_search = 'UL UM UR LL LM LR'.split()
# Single-letter codes that are searched for inside the quadrant cells
chars_to_search = list('ABCDEFGHIJKL')

In [None]:
# Work on a copy so the raw table in df0 stays untouched.
df = df0.copy()
for quadrant in quadrants_to_search:
    # Strip digits and whitespace first, so that only the letter codes
    # (and a possible lone "/") remain in each cell.
    cleaned = df[quadrant].str.replace(r'[\d\s]+', '', regex=True)
    # Only now mark "no data" cells as missing. Doing this after the
    # stripping also catches cells such as " / " or "3 ", which reduce to
    # "/" or "" and would otherwise be counted as cells with data.
    cleaned = cleaned.replace('', pd.NA)
    cleaned = cleaned.replace('/', pd.NA)
    df[quadrant] = cleaned
# Print the first rows of the cleaned table
print(df.iloc[0:20, :])

### Eliminate individuals with NO DATA (LG019)

In [None]:
# Keep only the individuals that have data in at least one quadrant
# (a row is dropped when every quadrant column is missing).
df = df.dropna(how='all', subset=quadrants_to_search)
print(df.head(30))

# Get statistics about age, gender and ethnicity

## Gender

In [None]:
# Gender split: count the rows labelled 'F' and 'M' in SEXO and express each
# group as a share of the F + M total.
sexo_column = df['SEXO']
total_f = len(df[sexo_column == 'F'])
total_m = len(df[sexo_column == 'M'])
print(f"Total number of women: {total_f}")
print(f"Total number of men: {total_m}")
total_f_and_m = total_f + total_m
percent_f = (total_f / total_f_and_m) * 100
percent_m = (total_m / total_f_and_m) * 100
# Report both shares as percentages (men first, then women)
print(f"Percentage of men: {percent_m:.2f}%")
print(f"Percentage of women: {percent_f:.2f}%")


## Age

In [None]:
# Mean of the EDAD (age) column
mean_age = df['EDAD'].mean()
print(f"Mean age: {mean_age:.2f}")
# Bin the ages into three groups. pd.cut uses right-closed intervals by
# default, so an age of exactly 30 lands in '<30' and exactly 50 in '30-50'.
age_bins = [-np.inf, 30, 50, np.inf]
age_labels = ['<30', '30-50', '>50']
age_distribution = pd.cut(df['EDAD'], bins=age_bins, labels=age_labels)
age_distribution_counts = age_distribution.value_counts()
percentage_of_total = age_distribution_counts / age_distribution_counts.sum() * 100
# One line per age group: label, count and share of the total
for header_line in (
    "Age distribution:",
    "Age group    Count       (Percentage)",
    "=====================================",
):
    print(header_line)
for age_group, count in age_distribution_counts.items():
    percentage = percentage_of_total[age_group]
    print(f"{age_group}           {count}      ({percentage:.2f}%)")

In [None]:
# Age distribution computed separately for women (SEXO == 'F') and men
# (SEXO == 'M'), binned into <30, 30-50 and >50.
age_bins = [-np.inf, 30, 50, np.inf]
age_labels = ['<30', '30-50', '>50']
ages_f = df.loc[df['SEXO'] == 'F', 'EDAD']
ages_m = df.loc[df['SEXO'] == 'M', 'EDAD']
age_distribution_f = pd.cut(ages_f, bins=age_bins, labels=age_labels)
age_distribution_m = pd.cut(ages_m, bins=age_bins, labels=age_labels)
age_distribution_f_counts = age_distribution_f.value_counts()
age_distribution_m_counts = age_distribution_m.value_counts()
age_distribution_f_percentage = age_distribution_f_counts / age_distribution_f_counts.sum() * 100
age_distribution_m_percentage = age_distribution_m_counts / age_distribution_m_counts.sum() * 100
# Same report layout for both genders: title, header, then one line per group
for title, counts, percentages in (
    ("\nAge distribution (SEXO=F):", age_distribution_f_counts, age_distribution_f_percentage),
    ("\nAge distribution (SEXO=M):", age_distribution_m_counts, age_distribution_m_percentage),
):
    print(title)
    print("Age group    Count       (Percentage)")
    print("=====================================")
    for age_group, count in counts.items():
        percentage = percentages[age_group]
        print(f"{age_group}           {count}      ({percentage:.2f}%)")

## A. Genética

In [None]:
# Distinct values of the "A. GENÉTICA" column and how often each one occurs
origin_values = df['A. GENÉTICA'].unique()
print("\nUnique values of A. GENÉTICA:")
# the counts do not depend on the loop variable, so compute them once
origin_counts = df['A. GENÉTICA'].value_counts()
total_origin = {}
for value in origin_values:
    # a value that value_counts does not list is stored with 0 occurrences
    total_origin[value] = origin_counts.get(value, 0)
    print(f"   {value} ({total_origin[value]} occurrences)")
# Relative frequency of every value, in percent
print("\nDistribution of A. GENÉTICA:")
origin_distribution = df['A. GENÉTICA'].value_counts(normalize=True) * 100
print(origin_distribution)

In [None]:
# "A. GENÉTICA" broken down by gender (SEXO == 'F' / SEXO == 'M'):
# absolute counts and percentages within each gender.
origin_f = df.loc[df['SEXO'] == 'F', 'A. GENÉTICA']
origin_m = df.loc[df['SEXO'] == 'M', 'A. GENÉTICA']
origin_distribution_f = origin_f.value_counts(normalize=True) * 100
origin_distribution_m = origin_m.value_counts(normalize=True) * 100
origin_distribution_f_counts = origin_f.value_counts()
origin_distribution_m_counts = origin_m.value_counts()
# Same report layout for both genders: title, header, then one line per value
for title, counts, percentages in (
    ("\nA. GENÉTICA distribution (SEXO=F):", origin_distribution_f_counts, origin_distribution_f),
    ("\nA. GENÉTICA distribution (SEXO=M):", origin_distribution_m_counts, origin_distribution_m),
):
    print(title)
    print("A. GENÉTICA    Count       (Percentage)")
    print("=====================================")
    for origin, count in counts.items():
        percentage = percentages[origin]
        print(f"{origin}           {count}      ({percentage:.2f}%)")

## Get number of repeated cells


In [None]:
# For every quadrant column, list the letter patterns that are shared by more
# than one individual. Letter order and repeated letters inside a cell are
# ignored when comparing cells.
for quadrant in quadrants_to_search:
    print(f"\nChecking quadrant {quadrant}:")

    # Canonical form of a cell: its distinct characters in alphabetical
    # order; missing cells are left missing.
    cells = df[quadrant].apply(
        lambda cell: cell if pd.isna(cell) else ''.join(sorted(set(cell)))
    )

    # How many individuals share each canonical pattern
    cells_counts = cells.value_counts()
    total_cells = len(cells.dropna())
    print(f"Total cells in quadrant {quadrant} (not counting '/' cells): {total_cells}")

    # Patterns that occur more than once
    repeated_cells = cells_counts[cells_counts > 1]
    if repeated_cells.empty:
        print(f"No repeated cells found in quadrant {quadrant}.")
        continue

    percent_repeated = (repeated_cells / total_cells) * 100
    print(" Cuadrante    Patron      Nº.Individuos     Porcentaje" )
    for cell, count in repeated_cells.items():
        # one tabulated line per repeated pattern
        print(f"    {quadrant}        {cell}               {count:<}          {percent_repeated[cell]:.2f}%")

In [None]:
# Look for individuals whose pattern is identical in every quadrant once each
# cell is reduced to its distinct letters in alphabetical order.

# Quadrant columns only, with every non-missing cell in canonical form
df_sorted = df[quadrants_to_search].map(
    lambda cell: cell if pd.isna(cell) else ''.join(sorted(set(cell)))
)

# keep=False flags every member of a group of identical rows
duplicates = df_sorted.duplicated(keep=False)

if not duplicates.any():
    print("\nNo rows with the same sorted values in all quadrants found.")
else:
    print("\nRows with the same sorted values in all quadrants:")
    print(df[duplicates])

## Get Table of character distribution (by SEXO)

In [None]:
# For each gender (SEXO 'F' / 'M') build a table with one row per character:
# the quadrant where the character occurs most often, the number of
# occurrences there, and that number as a percentage of the gender's
# non-missing cells in that quadrant.
for sexo in ['F', 'M']:
    lista_filas = []
    # rows of this gender only; does not depend on char/col, so filter once
    df_sexo = df[df['SEXO'] == sexo]
    for char in chars_to_search:
        max_times_char_quadrant = 0
        quadrant_for_char_max = ''
        # Reset for every character: otherwise a character that never occurs
        # would report the total of the previous character (or raise a
        # NameError when it is the very first character processed).
        total_sexo = 0
        for col in quadrants_to_search:
            times_char_quadrant = df_sexo[col].str.count(char).sum()
            if times_char_quadrant > max_times_char_quadrant:
                max_times_char_quadrant = times_char_quadrant
                quadrant_for_char_max = col
                # number of individuals of this gender with data in the quadrant
                total_sexo = len(df_sexo[col].dropna())
        # A character that never occurs gets 0% and is never divided by a total
        if max_times_char_quadrant == 0:
            percent_sexo = '0.00%'
        else:
            percent_sexo = f"{(max_times_char_quadrant / total_sexo) * 100:.2f}%"
        nueva_linea = [char, max_times_char_quadrant, percent_sexo, quadrant_for_char_max, total_sexo]
        lista_filas.append(nueva_linea)
    # Assemble the table and sort it by number of occurrences, largest first
    tabla_repeticiones = pd.DataFrame(columns=['Caracter', 'No.Individuals', 'Percent', 'Area', 'Total_SEXO_en_area'], data=lista_filas,index=None)
    tabla_repeticiones = tabla_repeticiones.sort_values(by='No.Individuals', ascending=False)
    print(f"\nCharacter repetition table for SEXO={sexo}:")
    print(tabla_repeticiones.to_string(index=False))

## Get Table of character distribution by A. GENETICA

In [None]:
# For each "A. GENÉTICA" value build a table with one row per character:
# the quadrant where the character occurs most often, the number of
# occurrences there, and that number as a percentage of the group's
# non-missing cells in that quadrant.
for origin in total_origin:
    lista_filas = []
    # rows of this group only; does not depend on char/col, so filter once
    df_origin = df[df['A. GENÉTICA'] == origin]
    for char in chars_to_search:
        max_times_char_quadrant = 0
        quadrant_for_char_max = ''
        # Reset for every character: otherwise a character that never occurs
        # would report the total of the previous character (or raise a
        # NameError when it is the very first character processed).
        total_origin_inquad = 0
        for col in quadrants_to_search:
            times_char_quadrant = df_origin[col].str.count(char).sum()
            if times_char_quadrant > max_times_char_quadrant:
                max_times_char_quadrant = times_char_quadrant
                quadrant_for_char_max = col
                # number of individuals of this group with data in the quadrant
                total_origin_inquad = len(df_origin[col].dropna())
        # A character that never occurs gets 0% and is never divided by a total
        if max_times_char_quadrant == 0:
            percent_origin = '0.00%'
        else:
            percent_origin = f"{(max_times_char_quadrant / total_origin_inquad) * 100:.2f}%"

        nueva_linea = [char, max_times_char_quadrant, percent_origin, quadrant_for_char_max, total_origin_inquad]
        lista_filas.append(nueva_linea)
    # Assemble the table and sort it by number of occurrences, largest first
    tabla_repeticiones = pd.DataFrame(columns=['Caracter', 'No.Individuals', 'Percentage', 'Area',''], data=lista_filas,index=None)
    tabla_repeticiones = tabla_repeticiones.sort_values(by='No.Individuals', ascending=False)
    print(f"\nCharacter repetition table for A. GENÉTICA={origin}:")
    print(tabla_repeticiones.to_string(index=False))

## Do Statistical analysis (independence fisher test) on SEXO

### Analysis for each quadrant

In [None]:
# For every quadrant and every character: print the occurrences per gender
# and run Fisher's exact test on the SEXO x per-cell-count contingency table.
for col in quadrants_to_search:
    print("=======================================================================================================")
    print(f"In Quadrant:{col}:")
    print("=======================================================================================================")
    # Reset the "most common character" record for every quadrant. Without
    # this, a quadrant with no occurrences for one gender would report the
    # previous quadrant's character (or raise a NameError in the first one).
    max_f = 0
    max_m = 0
    char_f = ''
    char_m = ''
    largest_percent_f = 0
    largest_percent_m = 0
    # number of women / men with data in this quadrant
    total_f_inquad = df[df['SEXO'] == 'F'][col].dropna().shape[0]
    total_m_inquad = df[df['SEXO'] == 'M'][col].dropna().shape[0]
    for char in chars_to_search:
        times_sexo_f = df[df['SEXO'] == 'F'][col].str.count(char).sum()
        percent_f = times_sexo_f / total_f_inquad * 100
        times_sexo_m = df[df['SEXO'] == 'M'][col].str.count(char).sum()
        percent_m = times_sexo_m / total_m_inquad * 100
        # remember the most frequent character seen so far for each gender
        if times_sexo_f > max_f:
            max_f = times_sexo_f
            char_f = char
            largest_percent_f = percent_f
        if times_sexo_m > max_m:
            max_m = times_sexo_m
            char_m = char
            largest_percent_m = percent_m
        print(" SEXO    CHARACTER   TIMES   PERCENT  TOTAL_IN_QUAD")
        print("======  =========== =======  =======  =============")
        print(f"  F         {char}          {times_sexo_f}     {percent_f:.2f}%     {total_f_inquad}")
        print(f"  M         {char}          {times_sexo_m}     {percent_m:.2f}%     {total_m_inquad}")

        # nothing to test when the character never occurs in this quadrant
        if times_sexo_f == 0 and times_sexo_m == 0:
            continue

        # Independence test (Fisher's exact test)
        # =======================================
        # NOTE(review): the table has one column per distinct per-cell count,
        # so it is 2x2 only when the counts take exactly the values 0 and 1
        # - confirm this holds for the data before relying on fisher_exact.
        table = pd.crosstab(df['SEXO'], df[col].str.count(char))
        print("   Observed values:")
        print("   ---------------")
        print(table)
        table_np = table.to_numpy()

        _, p_value = fisher_exact(table_np)
        print(f"   Fisher's exact test p-value: {p_value:.4f}")
        if p_value < 0.05:
            print("   --------------------------------------------------------------------------------------")
            print(f"   ****** In quadrant {col } the character {char} is _DEPENDENT_ on SEXO (p-value < 0.05)")
            print("   --------------------------------------------------------------------------------------")

        else:
            print("          ---------------------------------------------------------------------------------------------")
            print(f"          In quadrant {col} there is no evidence that character {char} depends on SEXO (p-value >= 0.05)")
            print("          -------------------------------------------------------------------------------------------")

    print(f"       ** Most common character for women is {char_f} with {max_f} times ({largest_percent_f:.2f}%)")
    print(f"       ** Most common character for men is {char_m} with {max_m} times ({largest_percent_m:.2f}%)")



### Analysis for all UPPER QUADRANTS (UL, UM, UR)

#### Create a new column (UPPER) for UL+UM+UR  
We are eliminating repetitions in characters:    
| UL    | UM | UR          | UPPER     |
| ----- | -- | ----------- | --------- |
| 3C 3H | G  | 1A 6C 1E 3H | C H G A E |

In [None]:
# Build the "UPPER" column: the letters found in UL, UM and UR for each
# individual, with every letter kept only once.
upper_parts = [df[name].replace(pd.NA, '') for name in ("UL", "UM", "UR")]
df["UPPER"] = upper_parts[0] + upper_parts[1] + upper_parts[2]
# individuals without any letter in the three quadrants become missing
df["UPPER"] = df["UPPER"].replace('', pd.NA)
# Successive clean-up passes, in this order: digits, whitespace, anything
# that is not an upper-case letter A-Z, and finally every character that
# occurs again later in the cell (so the last occurrence of each is kept).
for pattern in (r'\d+', r'\s+', r'[^A-Z]', r'(.)(?=.*\1)'):
    df["UPPER"] = df["UPPER"].str.replace(pattern, '', regex=True)
df

#### Do analysis

In [None]:

# Per-character gender comparison on the merged "UPPER" column: occurrence
# table for women and men, then Fisher's exact test on the
# SEXO x per-cell-count contingency table.

# running record of the most frequent character for each gender
max_f = 0
max_m = 0
char_f = ''
char_m = ''
largest_percent_f = 0
largest_percent_m = 0

col = "UPPER"
print("====================")
print(f"In Quadrant:{col}:")
print("====================")

# number of women / men with data in this column
total_f_inquad = df[df['SEXO'] == 'F'][col].dropna().shape[0]
total_m_inquad = df[df['SEXO'] == 'M'][col].dropna().shape[0]
# label the totals with the column actually analysed (was hard-coded "ALL")
print(f"Total number of M in {col}: {total_m_inquad}")
print(f"Total number of F in {col}: {total_f_inquad}")

for char in chars_to_search:
    times_sexo_f = df[df['SEXO'] == 'F'][col].str.count(char).sum()
    percent_f = times_sexo_f / total_f_inquad * 100
    times_sexo_m = df[df['SEXO'] == 'M'][col].str.count(char).sum()
    percent_m = times_sexo_m / total_m_inquad * 100
    # remember the most frequent character seen so far for each gender
    if times_sexo_f > max_f:
        max_f = times_sexo_f
        char_f = char
        largest_percent_f = percent_f
    if times_sexo_m > max_m:
        max_m = times_sexo_m
        char_m = char
        largest_percent_m = percent_m
    print(" SEXO    CHARACTER   TIMES   PERCENT   TOTAL_IN_QUAD")
    print("======  =========== =======  =======   =============")
    print(f"  F         {char}          {times_sexo_f}     {percent_f:.2f}%      {total_f_inquad}")
    print(f"  M         {char}          {times_sexo_m}     {percent_m:.2f}%      { total_m_inquad}")
    # nothing to test when the character never occurs
    if times_sexo_f == 0 and times_sexo_m == 0:
        continue

    # Independence test (Fisher's exact test)
    # =======================================
    table = pd.crosstab(df['SEXO'], df[col].str.count(char))
    print("   Observed values:")
    print("   ---------------")
    print(table)
    table_np = table.to_numpy()
    _, p_value = fisher_exact(table_np)
    print(f"   Fisher's exact test p-value: {p_value:.4f}")
    if p_value < 0.05:
        print("   --------------------------------------------------------------------------------------")
        print(f"   ****** In quadrant {col } the character {char} is _DEPENDENT_ on SEXO (p-value < 0.05)")
        print("   --------------------------------------------------------------------------------------")

    else:
        print("          ---------------------------------------------------------------------------------------------")
        print(f"          In quadrant {col} there is no evidence that character {char} depends on SEXO (p-value >= 0.05)")
        print("          --------------------------------------------------------------------------------------------")

print(f"       ** Most common character for women is {char_f} with {max_f} times ({largest_percent_f:.2f}%)")
print(f"       ** Most common character for men is {char_m} with {max_m} times ({largest_percent_m:.2f}%)")



### Analysis for all LOWER QUADRANTS (LL, LM, LR)

#### Create a new column (LOWER) for LL+LM+LR  
We are eliminating repetitions in characters:    
| LL   | LM      | LR   | LOWER |
| ---- | ------- | ---- | ----- |
| 4C J | 7A 4C J | 4C J | A C J |

In [None]:
# Build the "LOWER" column: the letters found in LL, LM and LR for each
# individual, with every letter kept only once.
lower_parts = [df[name].replace(pd.NA, '') for name in ("LL", "LM", "LR")]
df["LOWER"] = lower_parts[0] + lower_parts[1] + lower_parts[2]
# individuals without any letter in the three quadrants become missing
df["LOWER"] = df["LOWER"].replace('', pd.NA)
# Successive clean-up passes, in this order: digits, whitespace, anything
# that is not an upper-case letter A-Z, and finally every character that
# occurs again later in the cell (so the last occurrence of each is kept).
for pattern in (r'\d+', r'\s+', r'[^A-Z]', r'(.)(?=.*\1)'):
    df["LOWER"] = df["LOWER"].str.replace(pattern, '', regex=True)
df

#### Do analysis

In [None]:
# Per-character gender comparison on the merged "LOWER" column: occurrence
# table for women and men, then Fisher's exact test on the
# SEXO x per-cell-count contingency table.

# running record of the most frequent character for each gender
max_f = 0
max_m = 0
char_f = ''
char_m = ''
largest_percent_f = 0
largest_percent_m = 0

col = "LOWER"
print("====================")
print(f"In Quadrant:{col}:")
print("====================")

# number of women / men with data in this column
total_f_inquad = df[df['SEXO'] == 'F'][col].dropna().shape[0]
total_m_inquad = df[df['SEXO'] == 'M'][col].dropna().shape[0]
# label the totals with the column actually analysed (was hard-coded "ALL")
print(f"Total number of M in {col}: {total_m_inquad}")
print(f"Total number of F in {col}: {total_f_inquad}")


for char in chars_to_search:
    times_sexo_f = df[df['SEXO'] == 'F'][col].str.count(char).sum()
    percent_f = times_sexo_f / total_f_inquad * 100
    times_sexo_m = df[df['SEXO'] == 'M'][col].str.count(char).sum()
    percent_m = times_sexo_m / total_m_inquad * 100
    # remember the most frequent character seen so far for each gender
    if times_sexo_f > max_f:
        max_f = times_sexo_f
        char_f = char
        largest_percent_f = percent_f
    if times_sexo_m > max_m:
        max_m = times_sexo_m
        char_m = char
        largest_percent_m = percent_m
    print(" SEXO    CHARACTER   TIMES   PERCENT  TOTAL_IN_QUAD")
    print("======  =========== =======  ======= =============")
    print(f"  F         {char}          {times_sexo_f}     {percent_f:.2f}%      {total_f_inquad}")
    print(f"  M         {char}          {times_sexo_m}     {percent_m:.2f}%      {total_m_inquad}")

    # nothing to test when the character never occurs
    if times_sexo_f == 0 and times_sexo_m == 0:
        continue

    # Independence test (Fisher's exact test)
    # =======================================
    table = pd.crosstab(df['SEXO'], df[col].str.count(char))
    print("   Observed values:")
    print("   ---------------")
    print(table)
    table_np = table.to_numpy()

    _, p_value = fisher_exact(table_np)
    print(f"   Fisher's exact test p-value: {p_value:.4f}")
    if p_value < 0.05:
        print("   --------------------------------------------------------------------------------------")
        print(f"   ****** In quadrant {col } the character {char} is _DEPENDENT_ on SEXO (p-value < 0.05)")
        print("   --------------------------------------------------------------------------------------")

    else:
        print("          --------------------------------------------------------------------------------------------")
        print(f"          In quadrant {col} there is no evidence that character {char} depends on SEXO (p-value >= 0.05)")
        print("          --------------------------------------------------------------------------------------------")


print(f"       ** Most common character for women is {char_f} with {max_f} times ({largest_percent_f:.2f}%)")
print(f"       ** Most common character for men is {char_m} with {max_m} times ({largest_percent_m:.2f}%)")


### Analysis for all QUADRANTS 

#### Create a new column (ALL) for UL+UM+UR+LL+LM+LR    
We are eliminating repetitions in characters   

In [None]:
# Build the "ALL" column by merging LOWER and UPPER: the letters found in any
# of the six quadrants for each individual, with every letter kept only once.
# A missing half is treated as an empty string before concatenating:
# str + NA yields NA, so an individual with data in only the upper or only
# the lower quadrants would otherwise be dropped from "ALL".
df["ALL"] = df["LOWER"].fillna('') + df["UPPER"].fillna('')
# individuals without any letter in either half become missing
df["ALL"] = df["ALL"].replace('', pd.NA)
# remove digits from the cells of ALL (keep only letters)
df["ALL"] = df["ALL"].str.replace(r'\d+', '', regex=True)
# remove whitespace from the cells of ALL
df["ALL"] = df["ALL"].str.replace(r'\s+', '', regex=True)
# remove anything that is not an upper-case letter A-Z
df["ALL"] = df["ALL"].str.replace(r'[^A-Z]', '', regex=True)
# drop every character that occurs again later in the cell, so each letter
# shared by LOWER and UPPER is kept only once
df["ALL"] = df["ALL"].str.replace(r'(.)(?=.*\1)', '', regex=True)


In [None]:
df.iloc[0:50, :]

#### Do analysis

In [None]:
# Per-character gender comparison on the merged "ALL" column: occurrence
# table for women and men, then Fisher's exact test on the
# SEXO x per-cell-count contingency table.
col = "ALL"

# running record of the most frequent character for each gender
max_f = 0
max_m = 0
char_f = ''
char_m = ''
largest_percent_f = 0
largest_percent_m = 0

print("====================")
print(f"In Quadrant:{col}:")
print("====================")
# boolean row selectors for women and men
is_f = df['SEXO'] == 'F'
is_m = df['SEXO'] == 'M'
total_f_inquad = df.loc[is_f, col].dropna().shape[0]
total_m_inquad = df.loc[is_m, col].dropna().shape[0]
print(f"Total number of M in ALL: {total_m_inquad}")
print(f"Total number of F in ALL: {total_f_inquad}")

for char in chars_to_search:
    # occurrences of char in every cell of the column (missing cells stay missing)
    char_counts = df[col].str.count(char)
    times = char_counts.sum()
    times_sexo_f = char_counts[is_f].sum()
    times_sexo_m = char_counts[is_m].sum()
    percent_f = times_sexo_f / total_f_inquad * 100
    percent_m = times_sexo_m / total_m_inquad * 100
    # update the per-gender record holders
    if times_sexo_f > max_f:
        max_f, char_f, largest_percent_f = times_sexo_f, char, percent_f
    if times_sexo_m > max_m:
        max_m, char_m, largest_percent_m = times_sexo_m, char, percent_m
    print(" SEXO    CHARACTER   TIMES   PERCENT     TOTAL_IN_QUAD")
    print("======  =========== =======  =======     =============")
    print(f"  F         {char}          {times_sexo_f}     {percent_f:.2f}%        {total_f_inquad}")
    print(f"  M         {char}          {times_sexo_m}     {percent_m:.2f}%        {total_m_inquad}")

    # skip the test when the character occurs for neither gender
    if times_sexo_f == 0 == times_sexo_m:
        continue

    # Independence test (Fisher's exact test) on the observed table
    table = pd.crosstab(df['SEXO'], char_counts)
    print("   Observed values:")
    print("   ---------------")
    print(table)
    table_np = table.to_numpy()

    _odds_ratio, p_value = fisher_exact(table_np)
    print(f"   Fisher's exact test p-value: {p_value:.4f}")
    if p_value < 0.05:
        print("   --------------------------------------------------------------------------------------")
        print(f"   ****** In quadrant {col } the character {char} is _DEPENDENT_ on SEXO (p-value < 0.05)")
        print("   --------------------------------------------------------------------------------------")

    else:
        print("          --------------------------------------------------------------------------------------------")
        print(f"          In quadrant {col} there is no evidence that character {char} depends on SEXO (p-value >= 0.05)")
        print("          --------------------------------------------------------------------------------------------")

print(f"       ** Most common character for women is {char_f} with {max_f} times ({largest_percent_f:.2f}%)")
print(f"       ** Most common character for men is {char_m} with {max_m} times ({largest_percent_m:.2f}%)")


## Do statistical analysis (fisher test) on  "A. GENÉTICA"   

In addition to Fisher's exact test, a standardized residuals analysis is performed.   
It is used after a test of independence to see in which cells the observed counts differ significantly from the expected ones.   

For each cell in the contingency table, the following is calculated:    

standardized residual = (observed − expected)/sqrt(expected)    

Residual values > 2 or < -2 indicate a significant deviation.    

It shows which group (ethnicity) has more (or fewer) occurrences of 'character' than expected under independence.    
* Residual > 2 → 'char' is more frequent than expected (positively contributes to dependence)    
* Residual < -2 → 'char' is less frequent than expected.    

### Analysis for each quadrant

In [None]:
# For every quadrant and every character: print the occurrences per
# "A. GENÉTICA" group, run Fisher's exact test (through R's fisher.test) on
# the A. GENÉTICA x per-cell-count table and, when the result is significant,
# show the standardized residuals.

# The R "stats" package does not depend on the loop variables: import it once
# instead of once per quadrant/character pair.
stats = importr("stats")

for col in quadrants_to_search:
    print("=======================================================================================================")
    print(f"In Quadrant:{col}:")
    print("=======================================================================================================")
    for char in chars_to_search:
        times = df[col].str.count(char).sum()
        # occurrences of the character for every genetic-ancestry group
        print(f" ORIGIN    CHARACTER   TIMES   PERCENT   TOTAL_IN_QUAD")
        print(f"======   =========== =======  =======   =============")    
        for origin in origin_values:
            # unique() also returns a missing value when the column has one;
            # it selects no rows and cannot be formatted with ":10s", so skip it
            if pd.isna(origin):
                continue
            times_origin = df[df['A. GENÉTICA'] == origin][col].str.count(char).sum()
            percent_origin = 0
            total_origin_inquad = df[df['A. GENÉTICA'] == origin][col].dropna().shape[0]
            if total_origin_inquad > 0:
                percent_origin = times_origin / total_origin_inquad * 100
            print(f"  {origin:10s}    {char}         {times_origin}     {percent_origin:.2f}%    {total_origin_inquad}")


        # nothing to test when the character never occurs in this quadrant
        if times == 0:
            continue

        # Independence test (Fisher's exact test)
        # =======================================
        table = pd.crosstab(df['A. GENÉTICA'], df[col].str.count(char))
        print(f"\n   Observed values:")
        print("   ---------------")
        print(table)

        # Convert the pandas table to an R object for fisher.test
        with localconverter(ro.default_converter + pandas2ri.converter):
            r_table = ro.conversion.py2rpy(table)

        res = stats.fisher_test(r_table)

        # Extract the p-value from the R result
        p_value = res.rx2('p.value')[0]
        print(f"   Fisher's exact test p-value: {p_value:.4f}")
        if p_value < 0.05:
            print("   --------------------------------------------------------------------------------------")
            print(f"   ****** In quadrant {col } the character {char} is _DEPENDENT_ on A. GENÉTICA (p-value < 0.05)")
            print("   --------------------------------------------------------------------------------------")

            # Expected table under independence:
            # (row totals as a column) @ (column totals as a row) / grand total
            obs = table.values
            row_totals = obs.sum(axis=1, keepdims=True)
            col_totals = obs.sum(axis=0, keepdims=True)
            grand_total = obs.sum()
            expected = row_totals @ col_totals / grand_total  # matrix product
            # Standardized (Pearson) residuals: (observed - expected) / sqrt(expected)
            # ========================================================================
            std_residuals = (obs - expected) / np.sqrt(expected)
            # Wrap in a DataFrame so the output keeps the row/column labels
            res_df = pd.DataFrame(std_residuals, index=table.index, columns=table.columns)
            print("Standardized residuals (simple):")
            print(res_df)

            # Adjusted standardized residuals: the denominator is additionally
            # scaled by (1 - row share) * (1 - column share)
            # ================================================================
            adjusted_residuals = (obs - expected) / np.sqrt(expected * (1 - row_totals / grand_total) * (1 - col_totals / grand_total))
            adj_resid_df = pd.DataFrame(adjusted_residuals, index=table.index, columns=table.columns)

            # Heatmap of the adjusted residuals
            plt.figure(figsize=(6, 4))
            sns.heatmap(adj_resid_df, annot=True, cmap='coolwarm', center=0, fmt=".2f")
            plt.title("adjusted standardized residuals (Pearson)")
            plt.xlabel(f"Quadrant {col} - Character {char}")
            plt.show()

        else:
            print("          --------------------------------------------------------------------------------------------")
            print(f"          In quadrant {col} there is no evidence that character {char} depends on A. GENÉTICA (p-value >= 0.05)")
            print("          --------------------------------------------------------------------------------------------")

        

### Analysis for all UPPER

In [None]:
# Per-character frequency table and Fisher's exact test of independence
# between 'A. GENÉTICA' and the per-row count of each character in the
# combined UPPER column. Characters that never occur in the column are
# skipped entirely.
col = "UPPER"
print("=======================================================================================================")
print(f"In Quadrant:{col}:")
print("=======================================================================================================")

# Loading R's `stats` package does not depend on the character, so do it
# once instead of on every loop iteration.
stats = importr("stats")

for char in chars_to_search:
    # Per-row number of occurrences of `char`; rows with a missing cell give NaN.
    char_counts = df[col].str.count(char)
    times = char_counts.sum()
    if times == 0:
        continue
    print(" ORIGIN    CHARACTER   TIMES   PERCENT   TOTAL_IN_QUAD")
    print("======   =========== =======  =======   =============")
    # count how many times the character appears for each origin
    for origin in origin_values:
        # Filter the rows of this origin once and reuse the result.
        origin_cells = df.loc[df['A. GENÉTICA'] == origin, col]
        times_origin = origin_cells.str.count(char).sum()
        # Denominator: number of rows of this origin with a non-missing cell.
        total_origin_inquad = origin_cells.dropna().shape[0]
        percent_origin = 0
        if total_origin_inquad > 0:
            percent_origin = times_origin / total_origin_inquad * 100
        print(f"  {origin:10s}    {char}         {times_origin}     {percent_origin:.2f}%     {total_origin_inquad}")

    # do an independence fisher test
    # ==============================
    # Rows: A. GENÉTICA values; columns: distinct per-row counts of `char`.
    table = pd.crosstab(df['A. GENÉTICA'], char_counts)
    print("\n   Observed values:")
    print("   ---------------")
    print(table)

    # R's fisher.test stops with an error unless the table has at least
    # 2 rows and 2 columns, so skip degenerate tables instead of crashing.
    if min(table.shape) < 2:
        print("   --------------------------------------------------------------------------------------")
        print(f"   ****** In quadrant {col} the character {char}: Fisher's exact test skipped (table needs at least 2 rows and 2 columns)")
        print("   --------------------------------------------------------------------------------------")
        continue

    # Prepare for R's fisher.test: convert the pandas table to an R object
    with localconverter(ro.default_converter + pandas2ri.converter):
        r_table = ro.conversion.py2rpy(table)

    # Apply fisher.test
    res = stats.fisher_test(r_table)

    # Extract p-value
    p_value = res.rx2('p.value')[0]
    print(f"   Fisher's exact test p-value: {p_value:.4f}")
    if p_value < 0.05:
        print("   --------------------------------------------------------------------------------------")
        print(f"   ****** In quadrant {col } the character {char} is _DEPENDENT_ on A. GENÉTICA (p-value < 0.05)")
        print("   --------------------------------------------------------------------------------------")
        # Calculate table expected under independence
        obs = table.values
        row_totals = obs.sum(axis=1, keepdims=True)
        col_totals = obs.sum(axis=0, keepdims=True)
        grand_total = obs.sum()
        # (r x 1) @ (1 x c) matrix product: row_total * col_total for every cell
        expected = row_totals @ col_totals / grand_total
        # Calculate standardized residuals: (observed - expected) / sqrt(expected)
        # ========================================================================
        std_residuals = (obs - expected) / np.sqrt(expected)
        # Convert to DataFrame for better visualization
        res_df = pd.DataFrame(std_residuals, index=table.index, columns=table.columns)

        # Calculate adjusted standardized residuals (Pearson)
        # =====================================================
        # Same numerator, additionally scaled by the row and column proportions.
        adjusted_residuals = (obs - expected) / np.sqrt(expected * (1 - row_totals / grand_total) * (1 - col_totals / grand_total))
        adj_resid_df = pd.DataFrame(adjusted_residuals, index=table.index, columns=table.columns)

        # Show standardized residuals
        print("Standardized residuals (simple):")
        print(res_df)
        # Visualization: the heatmap shows the adjusted residuals
        plt.figure(figsize=(6, 4))
        sns.heatmap(adj_resid_df, annot=True, cmap='coolwarm', center=0, fmt=".2f")
        plt.title("adjusted standardized residuals (Pearson)")
        plt.xlabel(f"Quadrant {col} - Character {char}")
        plt.show()
    else:
        print("   ---------------------------------------------------------------------------------------------------")
        print(f"          In quadrant {col} there is no evidence that character {char} depends on A. GENÉTICA (p-value >= 0.05)")
        print("   ---------------------------------------------------------------------------------------------------")

    

### Analysis for all LOWER

In [None]:
# Per-character frequency table and Fisher's exact test of independence
# between 'A. GENÉTICA' and the per-row count of each character in the
# combined LOWER column. The frequency table is printed for every character;
# the test is only run for characters that occur at least once.
col = "LOWER"
print("=======================================================================================================")
print(f"In Quadrant:{col}:")
print("=======================================================================================================")

# Loading R's `stats` package does not depend on the character, so do it
# once instead of on every loop iteration.
stats = importr("stats")

for char in chars_to_search:
    # Per-row number of occurrences of `char`; rows with a missing cell give NaN.
    char_counts = df[col].str.count(char)
    times = char_counts.sum()
    print(" ORIGIN    CHARACTER   TIMES   PERCENT   TOTAL_IN_QUAD")
    print("======   =========== =======  =======   =============")
    # count how many times the character appears for each origin
    for origin in origin_values:
        # Filter the rows of this origin once and reuse the result.
        origin_cells = df.loc[df['A. GENÉTICA'] == origin, col]
        times_origin = origin_cells.str.count(char).sum()
        # Denominator: number of rows of this origin with a non-missing cell.
        total_origin_inquad = origin_cells.dropna().shape[0]
        percent_origin = 0
        if total_origin_inquad > 0:
            percent_origin = times_origin / total_origin_inquad * 100
        print(f"  {origin:10s}    {char}         {times_origin}     {percent_origin:.2f}%    {total_origin_inquad}")
    # Nothing to test when the character never occurs in this column.
    if times == 0:
        continue
    # do an independence fisher test
    # ===============================
    # Rows: A. GENÉTICA values; columns: distinct per-row counts of `char`.
    table = pd.crosstab(df['A. GENÉTICA'], char_counts)
    print("\n   Observed values:")
    print("   ---------------")
    print(table)

    # R's fisher.test stops with an error unless the table has at least
    # 2 rows and 2 columns, so skip degenerate tables instead of crashing.
    if min(table.shape) < 2:
        print("   --------------------------------------------------------------------------------------")
        print(f"   ****** In quadrant {col} the character {char}: Fisher's exact test skipped (table needs at least 2 rows and 2 columns)")
        print("   --------------------------------------------------------------------------------------")
        continue

    # prepare for R's fisher.test: convert the pandas table to an R object
    with localconverter(ro.default_converter + pandas2ri.converter):
        r_table = ro.conversion.py2rpy(table)

    # apply fisher.test
    res = stats.fisher_test(r_table)

    # extract p-value
    p_value = res.rx2('p.value')[0]
    print(f"   Fisher's exact test p-value: {p_value:.4f}")
    if p_value < 0.05:
        print("   --------------------------------------------------------------------------------------")
        print(f"   ****** In quadrant {col } the character {char} is _DEPENDENT_ on A. GENÉTICA (p-value < 0.05)")
        print("   --------------------------------------------------------------------------------------")
        # Calculate table expected under independence
        obs = table.values
        row_totals = obs.sum(axis=1, keepdims=True)
        col_totals = obs.sum(axis=0, keepdims=True)
        grand_total = obs.sum()
        # (r x 1) @ (1 x c) matrix product: row_total * col_total for every cell
        expected = row_totals @ col_totals / grand_total
        # Calculate standardized residuals: (observed - expected) / sqrt(expected)
        # ========================================================================
        std_residuals = (obs - expected) / np.sqrt(expected)
        # Convert to DataFrame for better visualization
        res_df = pd.DataFrame(std_residuals, index=table.index, columns=table.columns)

        # Calculate adjusted standardized residuals (Pearson)
        # =====================================================
        # Same numerator, additionally scaled by the row and column proportions.
        adjusted_residuals = (obs - expected) / np.sqrt(expected * (1 - row_totals / grand_total) * (1 - col_totals / grand_total))
        adj_resid_df = pd.DataFrame(adjusted_residuals, index=table.index, columns=table.columns)

        # Show standardized residuals
        print("Standardized residuals (simple):")
        print(res_df)
        # Visualization: the heatmap shows the adjusted residuals
        plt.figure(figsize=(6, 4))
        sns.heatmap(adj_resid_df, annot=True, cmap='coolwarm', center=0, fmt=".2f")
        plt.title("adjusted standardized residuals (Pearson)")
        plt.xlabel(f"Quadrant {col} - Character {char}")
        plt.show()
    else:
        print("   ---------------------------------------------------------------------------------------------------")
        print(f"          In quadrant {col} there is no evidence that character {char} depends on A. GENÉTICA (p-value >= 0.05)")
        print("   ---------------------------------------------------------------------------------------------------")

    

### Analysis for ALL

In [None]:
# Per-character frequency table and Fisher's exact test of independence
# between 'A. GENÉTICA' and the per-row count of each character in the
# combined ALL column. The frequency table is printed for every character;
# the test is only run for characters that occur at least once and whose
# contingency table has more than one column.
col = "ALL"
print("=======================================================================================================")
print(f"In Quadrant:{col}:")
print("=======================================================================================================")

# Loading R's `stats` package does not depend on the character, so do it
# once instead of on every loop iteration.
stats = importr("stats")

for char in chars_to_search:
    # Per-row number of occurrences of `char`; rows with a missing cell give NaN.
    char_counts = df[col].str.count(char)
    times = char_counts.sum()

    print(" ORIGIN    CHARACTER   TIMES   PERCENT   TOTAL_IN_QUAD")
    print("======   =========== =======  =======   =============")
    # count how many times the character appears for each origin
    for origin in origin_values:
        # Filter the rows of this origin once and reuse the result.
        origin_cells = df.loc[df['A. GENÉTICA'] == origin, col]
        times_origin = origin_cells.str.count(char).sum()
        # Denominator: number of rows of this origin with a non-missing cell.
        total_origin_inquad = origin_cells.dropna().shape[0]
        percent_origin = 0
        if total_origin_inquad > 0:
            percent_origin = times_origin / total_origin_inquad * 100
        print(f"  {origin:10s}    {char}         {times_origin}     {percent_origin:.2f}%     {total_origin_inquad}")
    # Nothing to test when the character never occurs in this column.
    if times == 0:
        continue
    # do an independence fisher test
    # ================================
    # Rows: A. GENÉTICA values; columns: distinct per-row counts of `char`.
    table = pd.crosstab(df['A. GENÉTICA'], char_counts)
    print("\n   Observed values:")
    print("   ---------------")
    print(table)
    print(table.values)
    # if table has only one column, skip the test
    # (every non-missing row has the same count, so there is nothing to compare)
    if table.shape[1] == 1:
        print("   --------------------------------------------------------------------------------------")
        print(f"   ****** In quadrant {col } the character {char} is in all A. GENéTICA (only one column)")
        print("   ****** Residuals cannot be calculated (only one column in the table)")
        print("   --------------------------------------------------------------------------------------")
        continue
    # Activate conversion and transfer the table to R
    with localconverter(ro.default_converter + pandas2ri.converter):
        r_table = ro.conversion.py2rpy(table)

    # Apply fisher.test
    res = stats.fisher_test(r_table)

    # Extract p-value
    p_value = res.rx2('p.value')[0]
    print(f"   Fisher's exact test p-value: {p_value:.4f}")
    if p_value < 0.05:
        print("   --------------------------------------------------------------------------------------")
        print(f"   ****** In quadrant {col } the character {char} is _DEPENDENT_ on A. GENÉTICA (p-value < 0.05)")
        print("   --------------------------------------------------------------------------------------")
        # Calculate table expected under independence
        obs = table.values
        row_totals = obs.sum(axis=1, keepdims=True)
        col_totals = obs.sum(axis=0, keepdims=True)
        grand_total = obs.sum()
        # (r x 1) @ (1 x c) matrix product: row_total * col_total for every cell
        expected = row_totals @ col_totals / grand_total
        # Calculate standardized residuals: (observed - expected) / sqrt(expected)
        # ========================================================================
        std_residuals = (obs - expected) / np.sqrt(expected)
        # Convert to DataFrame for better visualization
        res_df = pd.DataFrame(std_residuals, index=table.index, columns=table.columns)
        # Show standardized residuals
        print("Standardized residuals (simple):")
        print(res_df)

        # Calculate adjusted standardized residuals (Pearson)
        # =====================================================
        # Same numerator, additionally scaled by the row and column proportions.
        adjusted_residuals = (obs - expected) / np.sqrt(expected * (1 - row_totals / grand_total) * (1 - col_totals / grand_total))
        adj_resid_df = pd.DataFrame(adjusted_residuals, index=table.index, columns=table.columns)

        # Visualization: the heatmap shows the adjusted residuals
        plt.figure(figsize=(6, 4))
        sns.heatmap(adj_resid_df, annot=True, cmap='coolwarm', center=0, fmt=".2f")
        plt.title("adjusted standardized residuals (Pearson)")
        plt.xlabel(f"Quadrant {col} - Character {char}")
        plt.show()
    else:
        print("   ---------------------------------------------------------------------------------------------------")
        print(f"          In quadrant {col} there is no evidence that character {char} depends on A. GENÉTICA (p-value >= 0.05)")
        print("   ---------------------------------------------------------------------------------------------------")

    