In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Carga de los datasets
df_2016 = pd.read_csv("../datasets/OSMI_Mental_Health_in_Tech_Survey_2016.csv")
df_2017 = pd.read_csv("../datasets/OSMI_Mental_Health_in_Tech_Survey_2017.csv")
df_2018 = pd.read_csv("../datasets/OSMI_Mental_Health_in_Tech_Survey_2018.csv")
df_2019 = pd.read_csv("../datasets/OSMI_Mental_Health_in_Tech_Survey_2019.csv")
df_2020 = pd.read_csv("../datasets/OSMI_Mental_Health_in_Tech_Survey_2020.csv")

## Exploración de los datasets

In [5]:
print(f'Dataset 2016 - {df_2016.shape}')
print(f'Dataset 2017 - {df_2017.shape}')
print(f'Dataset 2018 - {df_2018.shape}')
print(f'Dataset 2019 - {df_2019.shape}')
print(f'Dataset 2020 - {df_2020.shape}')

Dataset 2016 - (1433, 63)
Dataset 2017 - (756, 123)
Dataset 2018 - (417, 123)
Dataset 2019 - (352, 82)
Dataset 2020 - (180, 120)


In [8]:
df_2016.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1433 entries, 0 to 1432
Data columns (total 63 columns):
 #   Column                                                                                                                                                                            Non-Null Count  Dtype  
---  ------                                                                                                                                                                            --------------  -----  
 0   Are you self-employed?                                                                                                                                                            1433 non-null   int64  
 1   How many employees does your company or organization have?                                                                                                                        1146 non-null   object 
 2   Is your employer primarily a tech company/organization?     

In [15]:
df_2017.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 756 entries, 0 to 755
Data columns (total 123 columns):
 #    Column                                                                                                                                                                                                                                   Non-Null Count  Dtype  
---   ------                                                                                                                                                                                                                                   --------------  -----  
 0    #                                                                                                                                                                                                                                        756 non-null    object 
 1    <strong>Are you self-employed?</strong>                                                            

In [16]:
df_2018.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 417 entries, 0 to 416
Data columns (total 123 columns):
 #    Column                                                                                                                                                                                                                                    Non-Null Count  Dtype  
---   ------                                                                                                                                                                                                                                    --------------  -----  
 0    #                                                                                                                                                                                                                                         417 non-null    object 
 1    <strong>Are you self-employed?</strong>                                                         

In [12]:
df_2019.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 352 entries, 0 to 351
Data columns (total 82 columns):
 #   Column                                                                                                                                                                                                                            Non-Null Count  Dtype  
---  ------                                                                                                                                                                                                                            --------------  -----  
 0   *Are you self-employed?*                                                                                                                                                                                                          352 non-null    bool   
 1   How many employees does your company or organization have?                                                                   

In [17]:
df_2020.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 120 columns):
 #    Column                                                                                                                                                                                                                            Non-Null Count  Dtype  
---   ------                                                                                                                                                                                                                            --------------  -----  
 0    #                                                                                                                                                                                                                                 180 non-null    object 
 1    *Are you self-employed?*                                                                                                

In [26]:
# Me doy cuenta que hay columnas que contienen etiquestas: <strong> y números ".1" o ".2"
# Intento limpiar el nombre de las columnas

def limpiar_nombres(df):    
    # Buscamos columnas que terminen en punto y números
    cols_duplicadas = df.columns[df.columns.str.contains(r'\.\d+$', regex=True)]
    
    # Si existen, las borramos del DataFrame directamente (inplace)
    if len(cols_duplicadas) > 0:
        df.drop(columns=cols_duplicadas, inplace=True)
    
    # Eliminar etiquetas HTML
    df.columns = df.columns.str.replace(r'<.*?>', '', regex=True)

    # 3. Eliminar asteriscos (*)
    df.columns = df.columns.str.replace(r'\*', '', regex=True)
    
    # Quitar espacios en blanco extra (trim)
    df.columns = df.columns.str.strip()
    
    return df

lista_dfs = [df_2016, df_2017, df_2018, df_2019, df_2020]

for df in lista_dfs:
    limpiar_nombres(df)


In [27]:
# Encontrar columnas coincidentes en todos los datasets
columnas_comunes = set(df_2016.columns) & set(df_2017.columns) & set(df_2018.columns) & set(df_2019.columns) & set(df_2020.columns)
print(f'Columnas coincidentes: {len(columnas_comunes)}')
print(columnas_comunes)

Columnas coincidentes: 28
{'What country do you work in?', 'Have you had a mental health disorder in the past?', 'Has your employer ever formally discussed mental health (for example, as part of a wellness campaign or other official communication)?', 'Did your previous employers ever formally discuss mental health (as part of a wellness campaign or other official communication)?', 'How many employees does your company or organization have?', 'What is your age?', 'Would you be willing to bring up a physical health issue with a potential employer in an interview?', 'Have you observed or experienced an unsupportive or badly handled response to a mental health issue in your current or previous workplace?', 'Have your previous employers provided mental health benefits?', 'What US state or territory do you work in?', 'If yes, what percentage of your work time (time performing primary or secondary job functions) is affected by a mental health issue?', 'Are you self-employed?', 'Is your anonym

In [28]:
# Analizamos las columnas diferentes de cada dataset
datasets = {
    "2016": df_2016,
    "2017": df_2017,
    "2018": df_2018,
    "2019": df_2019,
    "2020": df_2020
}
for anio, df in datasets.items():
    # Calculamos la diferencia: (Columnas de este año) - (Columnas comunes)
    cols_diferentes = set(df.columns) - columnas_comunes
    
    print(f"--- Dataset {anio} ---")
    print(f"Total columnas únicas de este año: {len(cols_diferentes)}")
    
    if len(cols_diferentes) > 0:
        print(sorted(list(cols_diferentes)))
    else:
        print("No tiene columnas extra (coincide exactamente con las comunes).")
    
    print("\n")

--- Dataset 2016 ---
Total columnas únicas de este año: 34
['Did you feel that your previous employers took mental health as seriously as physical health?', 'Did you hear of or observe negative consequences for co-workers with mental health issues in your previous workplaces?', 'Did your previous employers provide resources to learn more about mental health issues and how to seek help?', 'Do you feel that being identified as a person with a mental health issue would hurt your career?', 'Do you feel that your employer takes mental health as seriously as physical health?', 'Do you have medical coverage (private insurance or state-provided) which includes treatment of \xa0mental health issues?', 'Do you know local or online resources to seek help for a mental health disorder?', 'Do you know the options for mental health care available under your employer-provided coverage?', 'Do you think that discussing a mental health disorder with previous employers would have negative consequences?', 

In [29]:
# Si la encuesta (survey) ha sido matizada o actualizada en los años posteriores. Quiero saber las columnas que
# coinciden sin contar 2016.

nuevas_columnas_comunes = (set(df_2017.columns) - columnas_comunes) & (set(df_2018.columns) - columnas_comunes) & (set(df_2019.columns) - columnas_comunes) & (set(df_2020.columns) - columnas_comunes)
print(f'Nuevas columnas coincidentes: {len(nuevas_columnas_comunes)}')
print(nuevas_columnas_comunes)


Nuevas columnas coincidentes: 44
{'Describe the conversation you had with your employer about your mental health, including their reactions and what actions were taken to address your mental health issue/questions.', 'Did your previous employers provide resources to learn more about mental health disorders and how to seek help?', 'Overall, how much importance does your employer place on physical health?', 'Do you know local or online resources to seek help for a mental health issue?', 'Have you ever discussed your mental health with coworkers?', 'Overall, how much importance did your previous employer place on mental health?', 'Describe the circumstances of the badly handled or unsupportive response.', 'If you have revealed a mental health disorder to a client or business contact, how has this affected you or the relationship?', 'Have you ever been diagnosed with a mental health disorder?', 'Are you openly identified at work as a person with a mental health issue?', 'Would you have fel