# Fase 2: Limpieza del Dataset

In [2]:
# Importación de librerías necesarias
import pandas as pd
import numpy as np

# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

Comenzamos con la lectura del archivo .csv que contiene los datos de la investigación. Estos datos ya fueron explorados en la primera fase del proyecto: no existen valores nulos ni duplicados, pero sí vemos necesario realizar un par de modificaciones para estandarizarlos y facilitar la lectura de las variables a la hora de establecer relaciones.

In [3]:
# Load the raw dataset; the relative path is resolved against the current
# working directory.
df_limpieza = pd.read_csv(filepath_or_buffer="human_cognitive_performance.csv")

# Display three randomly chosen rows as a quick sanity check of the load.
df_limpieza.sample(n=3)

Unnamed: 0,User_ID,Age,Gender,Sleep_Duration,Stress_Level,Diet_Type,Daily_Screen_Time,Exercise_Frequency,Caffeine_Intake,Reaction_Time,Memory_Test_Score,Cognitive_Score,AI_Predicted_Score
55480,U55481,18,Female,4.7,9,Non-Vegetarian,4.1,Low,64,476.83,55,27.0,23.39
76775,U76776,19,Male,5.9,9,Non-Vegetarian,7.6,Medium,482,460.56,56,29.0,25.18
72415,U72416,41,Female,4.5,1,Non-Vegetarian,9.2,Medium,11,204.03,85,100.0,97.41


**1. Cambiar los nombres de las columnas de mayúsculas a minúsculas.**

In [4]:
# List the column labels exactly as they were read from the CSV (before any renaming).
df_limpieza.columns

Index(['User_ID', 'Age', 'Gender', 'Sleep_Duration', 'Stress_Level',
       'Diet_Type', 'Daily_Screen_Time', 'Exercise_Frequency',
       'Caffeine_Intake', 'Reaction_Time', 'Memory_Test_Score',
       'Cognitive_Score', 'AI_Predicted_Score'],
      dtype='object')

¿Qué columnas contienen variables categóricas nominales ("object")?

In [5]:
# Summarise the object-dtype columns (count, unique, top, freq), transposed so
# that each column of the dataset becomes one row of the summary.
df_limpieza.describe(include="O").transpose()

Unnamed: 0,count,unique,top,freq
User_ID,80000,80000,U1,1
Gender,80000,3,Female,38404
Diet_Type,80000,3,Non-Vegetarian,47986
Exercise_Frequency,80000,3,Medium,31990


¿Qué columnas contienen números discretos (enteros, "int")?

In [6]:
# Descriptive statistics for the integer columns, transposed so that each
# column of the dataset becomes one row of the summary.
df_limpieza.describe(include="int").transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,80000.0,38.525525,12.101876,18.0,28.0,39.0,49.0,59.0
Stress_Level,80000.0,5.493762,2.865308,1.0,3.0,5.0,8.0,10.0
Caffeine_Intake,80000.0,248.988213,144.54199,0.0,123.0,249.0,375.0,499.0
Memory_Test_Score,80000.0,69.49835,17.305659,40.0,55.0,70.0,85.0,99.0


¿Qué columnas contienen números continuos, con decimales ("float")?

In [7]:
# Descriptive statistics for the floating-point columns, transposed so that
# each column of the dataset becomes one row of the summary.
df_limpieza.describe(include="float").transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Sleep_Duration,80000.0,7.005332,1.734435,4.0,5.5,7.0,8.5,10.0
Daily_Screen_Time,80000.0,6.504646,3.167072,1.0,3.8,6.5,9.2,12.0
Reaction_Time,80000.0,399.973579,115.369329,200.0,300.15,400.36,499.25,599.99
Cognitive_Score,80000.0,58.172395,23.058522,0.0,40.91,58.36,75.83,100.0
AI_Predicted_Score,80000.0,58.121357,23.119598,0.0,40.85,58.36,75.89,100.0


In [8]:
# Lower-case every column label. `rename` accepts a callable mapper, so
# `str.lower` is applied to each label directly (no dict comprehension needed).
# The result is re-assigned instead of mutating the frame with `inplace=True`,
# which keeps the cell safe to re-run and compatible with method chaining.
df_limpieza = df_limpieza.rename(columns=str.lower)

In [9]:
df_limpieza.columns # Check: every column label should now be lower-case

Index(['user_id', 'age', 'gender', 'sleep_duration', 'stress_level',
       'diet_type', 'daily_screen_time', 'exercise_frequency',
       'caffeine_intake', 'reaction_time', 'memory_test_score',
       'cognitive_score', 'ai_predicted_score'],
      dtype='object')

**2. Eliminamos la "U" en "user_id"**

In [10]:
def limpiar_columna(serie, texto="U"):
    """Strip a literal marker from a column of IDs and convert it to integers.

    Parameters
    ----------
    serie : pandas.Series
        Values such as ``"U123"``. Every element is cast to ``str`` first, so
        input that is already numeric is accepted as well.
    texto : str, default "U"
        Literal text removed from each value (every occurrence, not only a
        leading one). It is never interpreted as a regular expression.

    Returns
    -------
    pandas.Series
        Integer series with the same index as ``serie``.

    Raises
    ------
    ValueError
        If any value is not a valid integer once ``texto`` has been removed.
    """
    return (
        serie
        .astype(str)                          # make sure every value is a string
        # regex=False: plain substring replacement. The default of `regex` has
        # differed between pandas releases, so it is pinned explicitly here.
        .str.replace(texto, "", regex=False)
        .astype(int)                          # e.g. "123" -> 123
    )

In [11]:
# Columns whose values carry the "U" marker and must become integers.
columnas_a_convertir = ["user_id"]

# Overwrite each listed column with its cleaned, integer version.
for nombre_columna in columnas_a_convertir:
    df_limpieza[nombre_columna] = df_limpieza[nombre_columna].pipe(limpiar_columna)

In [12]:
# Check the changes on three random rows: user_id should now be a plain integer.
df_limpieza.sample(n=3)

Unnamed: 0,user_id,age,gender,sleep_duration,stress_level,diet_type,daily_screen_time,exercise_frequency,caffeine_intake,reaction_time,memory_test_score,cognitive_score,ai_predicted_score
44868,44869,20,Female,9.1,4,Non-Vegetarian,5.4,High,450,274.36,88,100.0,100.0
68897,68898,46,Male,4.6,7,Vegetarian,8.3,Medium,89,256.14,48,67.28,69.87
32815,32816,26,Male,8.8,2,Non-Vegetarian,9.2,Low,296,543.55,99,47.79,46.59


**3. Convertimos 'user_id' en índice**

In [13]:
# Promote user_id to the row index.
# The guard makes the cell safe to re-run: once user_id is the index it is no
# longer a column, and calling set_index again would raise KeyError.
# verify_integrity=True makes pandas raise ValueError if the IDs are not unique,
# so a duplicated identifier cannot silently become a duplicated index label.
if df_limpieza.index.name != "user_id":
    df_limpieza = df_limpieza.set_index("user_id", verify_integrity=True)

In [14]:
# Display ten random rows; user_id is now shown as the index.
df_limpieza.sample(n=10)

Unnamed: 0_level_0,age,gender,sleep_duration,stress_level,diet_type,daily_screen_time,exercise_frequency,caffeine_intake,reaction_time,memory_test_score,cognitive_score,ai_predicted_score
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
66896,23,Male,4.6,7,Non-Vegetarian,8.9,Low,475,400.0,53,27.18,27.75
17384,43,Male,5.0,1,Vegetarian,2.2,Low,345,259.11,97,98.12,98.92
60706,45,Female,9.8,1,Vegan,10.6,Low,156,238.32,67,87.36,87.4
55764,18,Female,6.1,2,Non-Vegetarian,11.5,Medium,281,586.29,97,41.12,40.22
14796,28,Male,4.7,8,Non-Vegetarian,6.0,High,114,294.87,78,81.98,80.12
31434,37,Female,9.8,5,Non-Vegetarian,4.4,Medium,324,426.15,52,56.5,55.24
62801,25,Female,4.3,9,Non-Vegetarian,10.2,High,295,366.62,73,54.8,59.54
7413,22,Female,8.7,7,Non-Vegetarian,2.0,Medium,98,219.29,50,91.89,93.2
16026,19,Male,8.5,8,Vegetarian,8.3,Medium,202,334.15,79,73.32,71.13
35971,21,Male,6.5,8,Vegan,9.6,High,186,358.88,83,70.57,67.73


In [15]:
# Persist the cleaned frame. index=True (the default) writes the index, which
# holds user_id at this point, as the first column of the file.
df_limpieza.to_csv("df_test.csv", index=True)