In [1]:

# importamos las librerías que necesitamos

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualización
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluar linealidad de las relaciones entre las variables
# y la distribución de las variables
# ------------------------------------------------------------------------------
import scipy.stats as stats
from scipy.stats import chi2_contingency, ttest_ind

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

# Gestión de los warnings
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")


In [7]:
# cargamos el dataframe correspondiente 
df = pd.read_csv("archivos/Mental-health-Depression-disorder-Data.csv", index_col = 0)

display(df.head(10))


Unnamed: 0_level_0,Entity,Code,Year,Schizophrenia (%),Bipolar disorder (%),Eating disorders (%),Anxiety disorders (%),Drug use disorders (%),Depression (%),Alcohol use disorders (%)
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Afghanistan,AFG,1990,0.16056,0.697779,0.101855,4.82883,1.677082,4.071831,0.672404
1,Afghanistan,AFG,1991,0.160312,0.697961,0.099313,4.82974,1.684746,4.079531,0.671768
2,Afghanistan,AFG,1992,0.160135,0.698107,0.096692,4.831108,1.694334,4.088358,0.670644
3,Afghanistan,AFG,1993,0.160037,0.698257,0.094336,4.830864,1.70532,4.09619,0.669738
4,Afghanistan,AFG,1994,0.160022,0.698469,0.092439,4.829423,1.716069,4.099582,0.66926
5,Afghanistan,AFG,1995,0.160076,0.698695,0.09098,4.828337,1.728112,4.104207,0.668746
6,Afghanistan,AFG,1996,0.160249,0.698914,0.089709,4.828083,1.737643,4.1075,0.667727
7,Afghanistan,AFG,1997,0.160554,0.699127,0.088372,4.827726,1.746891,4.110834,0.66622
8,Afghanistan,AFG,1998,0.160931,0.699372,0.08733,4.826971,1.756963,4.114438,0.664676
9,Afghanistan,AFG,1999,0.161311,0.699674,0.086267,4.826413,1.770791,4.117633,0.663428


In [23]:
def exploracion(df):
    df_info = pd.DataFrame()
    df_info["% nulos"] = round(df.isna().sum()/df.shape[0]*100, 2).astype(str)+"%"
    df_info["% no_nulos"] = round(df.notna().sum()/df.shape[0]*100, 2).astype(str)+"%"
    df_info["tipo_dato"] = df.dtypes
    df_info["num_valores_unicos"] = df.nunique()
    print(f"""El DataFrame tiene {df.shape[0]} filas y {df.shape[1]} columnas.
Tiene {df.duplicated().sum()} datos duplicados, lo que supone un porcentaje de {round(df.duplicated().sum()/df.shape[0], 2)}% de los datos.
Hay {len(list(df_info[(df_info["% nulos"] != "0.0%")].index))} columnas con datos nulos, y son:
{list(df_info[(df_info["% nulos"] != "0.0%")].index)}
y sin nulos hay {len(list(df_info[(df_info["% nulos"] == "0.0%")].index))} columnas y son:
{list(df_info[(df_info["% nulos"] == "0.0%")].index)}
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:""")
    display(df_info.head())
    print("Principales estadísticos de las columnas categóricas:")
    display(df.describe(include="O").T)
    print("Principales estadísticos de las columnas numéricas:")
    display(df.describe(exclude="O").T)
    return df_info

In [24]:
exploracion(df)

El DataFrame tiene 108553 filas y 10 columnas.
Tiene 9718 datos duplicados, lo que supone un porcentaje de 0.09% de los datos.
Hay 8 columnas con datos nulos, y son:
['Code', 'Schizophrenia (%)', 'Bipolar disorder (%)', 'Eating disorders (%)', 'Anxiety disorders (%)', 'Drug use disorders (%)', 'Depression (%)', 'Alcohol use disorders (%)']
y sin nulos hay 2 columnas y son:
['Entity', 'Year']
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
Entity,0.0%,100.0%,object,276
Code,4.99%,95.01%,object,236
Year,0.0%,100.0%,object,259
Schizophrenia (%),76.16%,23.84%,object,25661
Bipolar disorder (%),82.12%,17.88%,object,19358


Principales estadísticos de las columnas categóricas:


Unnamed: 0,count,unique,top,freq
Entity,108553,276,United Kingdom,496
Code,103141,236,YEM,496
Year,108553,259,2005,1012
Schizophrenia (%),25875,25661,0.191897,3
Bipolar disorder (%),19406,19358,0.603732,3
Eating disorders (%),100236,72657,6000,165


Principales estadísticos de las columnas numéricas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Anxiety disorders (%),6468.0,3.989921,1.167526,2.023393,3.188824,3.554373,4.682163,8.96733
Drug use disorders (%),6468.0,0.862278,0.460679,0.38365,0.535064,0.72643,0.940157,3.452476
Depression (%),6468.0,3.497654,0.655859,2.139903,3.005529,3.499606,3.912381,6.602754
Alcohol use disorders (%),6468.0,1.585821,0.860283,0.44694,0.993685,1.479936,1.867834,5.474668


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
Entity,0.0%,100.0%,object,276
Code,4.99%,95.01%,object,236
Year,0.0%,100.0%,object,259
Schizophrenia (%),76.16%,23.84%,object,25661
Bipolar disorder (%),82.12%,17.88%,object,19358
Eating disorders (%),7.66%,92.34%,object,72657
Anxiety disorders (%),94.04%,5.96%,float64,6457
Drug use disorders (%),94.04%,5.96%,float64,6435
Depression (%),94.04%,5.96%,float64,6460
Alcohol use disorders (%),94.04%,5.96%,float64,6459


# Nombres columnas

In [10]:
print(f'Nombres columnas df: {df.columns}')

Nombres columnas df: Index(['Entity', 'Code', 'Year', 'Schizophrenia (%)', 'Bipolar disorder (%)',
       'Eating disorders (%)', 'Anxiety disorders (%)',
       'Drug use disorders (%)', 'Depression (%)',
       'Alcohol use disorders (%)'],
      dtype='object')


# Informacion df

In [11]:
# Informacion general 
print(f'Informacion df: {df.info()}')

<class 'pandas.core.frame.DataFrame'>
Index: 108553 entries, 0 to 108552
Data columns (total 10 columns):
 #   Column                     Non-Null Count   Dtype  
---  ------                     --------------   -----  
 0   Entity                     108553 non-null  object 
 1   Code                       103141 non-null  object 
 2   Year                       108553 non-null  object 
 3   Schizophrenia (%)          25875 non-null   object 
 4   Bipolar disorder (%)       19406 non-null   object 
 5   Eating disorders (%)       100236 non-null  object 
 6   Anxiety disorders (%)      6468 non-null    float64
 7   Drug use disorders (%)     6468 non-null    float64
 8   Depression (%)             6468 non-null    float64
 9   Alcohol use disorders (%)  6468 non-null    float64
dtypes: float64(4), object(6)
memory usage: 9.1+ MB
Informacion df: None


# PDTE: REVISAR TIPO OBJECT 🚩

# % Nulos

In [12]:
nulos = df.isnull().sum()

# % Nulos
print(f'% Nulos df: {(nulos/df.shape[0]*100).round(2)}')

% Nulos df: Entity                        0.00
Code                          4.99
Year                          0.00
Schizophrenia (%)            76.16
Bipolar disorder (%)         82.12
Eating disorders (%)          7.66
Anxiety disorders (%)        94.04
Drug use disorders (%)       94.04
Depression (%)               94.04
Alcohol use disorders (%)    94.04
dtype: float64


# PDTE: REVISAR LOS NULOS 🚩

# Duplicados 

In [13]:
print(f'Los duplicados que tenemos en df: {df.duplicated().sum()}')

Los duplicados que tenemos en df: 9718


In [14]:
# con keep=False cuenta todas las filas (originales y duplicadas)
df.duplicated(keep=False).sum()

19436

In [19]:
def get_duplicate_rows(df):
    """
    Encuentra y devuelve las filas completas duplicadas de un DataFrame.

    Args:
        df (pd.DataFrame): El DataFrame de entrada.

    Returns:
        pd.DataFrame: Un nuevo DataFrame con las filas duplicadas completas.
    """
    # Identificar duplicados basados en todas las columnas
    duplicate_rows = df[df.duplicated(keep=False)]
    return duplicate_rows

In [20]:
get_duplicate_rows(df)

Unnamed: 0_level_0,Entity,Code,Year,Schizophrenia (%),Bipolar disorder (%),Eating disorders (%),Anxiety disorders (%),Drug use disorders (%),Depression (%),Alcohol use disorders (%)
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6469,Afghanistan,AFG,1800,,,3280000,,,,
6470,Afghanistan,AFG,1801,,,3280000,,,,
6471,Afghanistan,AFG,1802,,,3280000,,,,
6472,Afghanistan,AFG,1803,,,3280000,,,,
6473,Afghanistan,AFG,1804,,,3280000,,,,
...,...,...,...,...,...,...,...,...,...,...
65530,Democratic Republic of Congo,COD,1985,,,29881000.000000,,,,
65531,Democratic Republic of Congo,COD,1986,,,30684000.000000,,,,
65532,Democratic Republic of Congo,COD,1987,,,31529000.000000,,,,
65533,Democratic Republic of Congo,COD,1988,,,32444000.000000,,,,


# PDTE: REVISAR LOS DUPLICADOS 🚩
- Hacerlo despues de cambiar el tipo de dato

# Valores negativos?

In [15]:
filas_negativas_df = df[(df < 0).any(axis=1)]
display(filas_negativas_df)

TypeError: '<' not supported between instances of 'str' and 'int'

In [22]:
exploracion(df)

El DataFrame tiene 108553 filas y 10 columnas.
Tiene 9718 datos duplicados, lo que supone un porcentaje de 0.09% de los datos.
Hay 8 columnas con datos nulos, y son:
['Code', 'Schizophrenia (%)', 'Bipolar disorder (%)', 'Eating disorders (%)', 'Anxiety disorders (%)', 'Drug use disorders (%)', 'Depression (%)', 'Alcohol use disorders (%)']
y sin nulos hay 2 columnas y son:
['Entity', 'Year']
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
Entity,0.0%,100.0%,object,276
Code,4.99%,95.01%,object,236
Year,0.0%,100.0%,object,259
Schizophrenia (%),76.16%,23.84%,object,25661
Bipolar disorder (%),82.12%,17.88%,object,19358


Principales estadísticos de las columnas categóricas:


Unnamed: 0,count,unique,top,freq
Entity,108553,276,United Kingdom,496
Code,103141,236,YEM,496
Year,108553,259,2005,1012
Schizophrenia (%),25875,25661,0.191897,3
Bipolar disorder (%),19406,19358,0.603732,3
Eating disorders (%),100236,72657,6000,165


Principales estadísticos de las columnas numéricas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Anxiety disorders (%),6468.0,3.989921,1.167526,2.023393,3.188824,3.554373,4.682163,8.96733
Drug use disorders (%),6468.0,0.862278,0.460679,0.38365,0.535064,0.72643,0.940157,3.452476
Depression (%),6468.0,3.497654,0.655859,2.139903,3.005529,3.499606,3.912381,6.602754
Alcohol use disorders (%),6468.0,1.585821,0.860283,0.44694,0.993685,1.479936,1.867834,5.474668


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
Entity,0.0%,100.0%,object,276
Code,4.99%,95.01%,object,236
Year,0.0%,100.0%,object,259
Schizophrenia (%),76.16%,23.84%,object,25661
Bipolar disorder (%),82.12%,17.88%,object,19358
Eating disorders (%),7.66%,92.34%,object,72657
Anxiety disorders (%),94.04%,5.96%,float64,6457
Drug use disorders (%),94.04%,5.96%,float64,6435
Depression (%),94.04%,5.96%,float64,6460
Alcohol use disorders (%),94.04%,5.96%,float64,6459
