In [None]:
import pandas as pd
import numpy as np

ModuleNotFoundError: No module named 'seaborn'

In [4]:
df = pd.read_csv('student_productivity_distraction_dataset_20000.csv')
print(f"Forma del dataset: {df.shape}")
print("\nTipos de datos:")
print(df.dtypes)
print("\nPrimeras 5 filas:")
print(df.head())
print("\nEstadísticos descriptivos:")
print(df.describe())

Forma del dataset: (20000, 18)

Tipos de datos:
student_id                 int64
age                        int64
gender                    object
study_hours_per_day      float64
sleep_hours              float64
phone_usage_hours        float64
social_media_hours       float64
youtube_hours            float64
gaming_hours             float64
breaks_per_day             int64
coffee_intake_mg           int64
exercise_minutes           int64
assignments_completed      int64
attendance_percentage    float64
stress_level               int64
focus_score                int64
final_grade              float64
productivity_score       float64
dtype: object

Primeras 5 filas:
   student_id  age  gender  study_hours_per_day  sleep_hours  \
0           1   23  Female                 4.35         3.63   
1           2   20    Male                 6.14         6.58   
2           3   29  Female                 4.98         3.26   
3           4   27  Female                 3.19         4.58   
4    

In [5]:
# Missing values
print("Valores faltantes por columna:")
print(df.isnull().sum())

# Duplicados
print(f"\nDuplicados totales: {df.duplicated().sum()}")

# Outliers en variables clave (precio → productivity_score)
def detectar_outliers(df, columna):
    Q1 = df[columna].quantile(0.25)
    Q3 = df[columna].quantile(0.75)
    IQR = Q3 - Q1
    outliers = df[(df[columna] < Q1 - 1.5*IQR) | (df[columna] > Q3 + 1.5*IQR)]
    return len(outliers)

print(f"\nOutliers en productivity_score: {detectar_outliers(df, 'productivity_score')}")
print(f"Outliers en final_grade: {detectar_outliers(df, 'final_grade')}")

# Inconsistencias
print(f"\nEdades inválidas (<15 o >35): {(df['age'] < 15) | (df['age'] > 35).sum()}")
print(f"Stress_level fuera de rango: {(df['stress_level'] < 1) | (df['stress_level'] > 10).sum()}")


Valores faltantes por columna:
student_id               0
age                      0
gender                   0
study_hours_per_day      0
sleep_hours              0
phone_usage_hours        0
social_media_hours       0
youtube_hours            0
gaming_hours             0
breaks_per_day           0
coffee_intake_mg         0
exercise_minutes         0
assignments_completed    0
attendance_percentage    0
stress_level             0
focus_score              0
final_grade              0
productivity_score       0
dtype: int64

Duplicados totales: 0

Outliers en productivity_score: 18
Outliers en final_grade: 0

Edades inválidas (<15 o >35): 0        False
1        False
2        False
3        False
4        False
         ...  
19995    False
19996    False
19997    False
19998    False
19999    False
Name: age, Length: 20000, dtype: bool
Stress_level fuera de rango: 0        False
1        False
2        False
3        False
4        False
         ...  
19995    False
19996    False
1

In [None]:
# Backup
df_clean = df.copy()

# 1. Eliminar duplicados
df_clean.drop_duplicates(inplace=True)
print(f"Post-duplicados: {df_clean.shape}")

# 2. Manejar outliers (productivity_score)
Q1 = df_clean['productivity_score'].quantile(0.25)
Q3 = df_clean['productivity_score'].quantile(0.75)
IQR = Q3 - Q1
df_clean = df_clean[(df_clean['productivity_score'] >= Q1 - 1.5*IQR) & 
                   (df_clean['productivity_score'] <= Q3 + 1.5*IQR)]

# 3. Filtrar edades razonables (17-30 años)
df_clean = df_clean[(df_clean['age'] >= 17) & (df_clean['age'] <= 30)]

# 4. Corregir tipos
df_clean['attendance_percentage'] = pd.to_numeric(df_clean['attendance_percentage'], errors='coerce')

print(f"Dataset limpio: {df_clean.shape}")


In [None]:
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
fig.suptitle('Distribuciones Univariadas', fontsize=16)

# Productividad, calificaciones, horas estudio
axes[0,0].hist(df_clean['productivity_score'], bins=30, alpha=0.7, color='skyblue')
axes[0,0].set_title('Productividad')
axes[0,0].set_xlabel('Puntuación')

axes[0,1].hist(df_clean['final_grade'], bins=30, alpha=0.7, color='lightgreen')
axes[0,1].set_title('Calificación Final')
axes[0,1].set_xlabel('Nota (%)')

axes[0,2].hist(df_clean['study_hours_per_day'], bins=30, alpha=0.7, color='orange')
axes[0,2].set_title('Horas Estudio/Día')
axes[0,2].set_xlabel('Horas')

# Variables categóricas
df_clean['gender'].value_counts().plot(kind='bar', ax=axes[1,0])
axes[1,0].set_title('Género')
axes[1,0].set_ylabel('Frecuencia')

sns.boxplot(data=df_clean, y='productivity_score', x='stress_level', ax=axes[1,1])
axes[1,1].set_title('Productividad por Stress')

df_clean['age'].hist(ax=axes[1,2], bins=14, alpha=0.7)
axes[1,2].set_title('Distribución Edad')
axes[1,2].set_xlabel('Años')

plt.tight_layout()
plt.show()
