# Curso de manejo de datos faltantes: Imputación

#### Importar las librerías de trabajo para este curso

In [11]:
import janitor
import matplotlib.pyplot as plt
import missingno
import nhanes.load
import numpy as np
import pandas as pd
import scipy.stats
import seaborn as sns
import session_info
import sklearn.compose
import sklearn.impute
import sklearn.preprocessing
import statsmodels.api as sm
import statsmodels.datasets
import statsmodels.formula.api as smf

from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_iterative_imputer
from sklearn.kernel_approximation import Nystroem
from sklearn.linear_model import BayesianRidge, Ridge
from sklearn.neighbors import KNeighborsRegressor
from statsmodels.graphics.mosaicplot import mosaic

## Importar funciones personalizadas

In [12]:
# Install nbformat into the running kernel's environment (%pip targets the
# kernel, unlike a shell-level pip). Presumably needed so that %run below can
# read an .ipynb file -- TODO confirm, and consider pinning a version.
%pip install nbformat

# Execute the companion notebook in this kernel's namespace so that whatever it
# defines becomes available here. Its contents are outside this view;
# presumably it registers the custom missing-data helpers used later -- verify.
%run pandas-missing-extension.ipynb

Note: you may need to restart the kernel to use updated packages.


## Configurar el aspecto general de las gráficas del proyecto

In [13]:
%matplotlib inline

sns.set(
    rc={
        "figure.figsize": (8, 6)
    }
)

sns.set_style("whitegrid")

## El problema de trabajar con valores faltantes

In [14]:
# Build the working frame: fetch the R `airquality` dataset through
# statsmodels, convert the column names to snake_case (pyjanitor's
# `clean_names`), and index the rows by a proper datetime.
airquality_df = (
    sm.datasets.get_rdataset("airquality")
    .data
    .clean_names(
        case_type="snake"
    )
    .assign(
        # Constant year column, added with plain pandas instead of pyjanitor's
        # deprecated `add_column`; it is needed to assemble a full date below.
        year=1973,
        # `assign` evaluates keyword arguments in order, so `year` already
        # exists when this callable runs. `pd.to_datetime` builds the
        # timestamp from the year/month/day columns.
        date=lambda df: pd.to_datetime(df[["year", "month", "day"]])
    )
    .sort_values(by="date")
    .set_index("date")
)

# Display the resulting frame as the cell output.
airquality_df

  return method(self._obj, *args, **kwargs)


Unnamed: 0_level_0,ozone,solar_r,wind,temp,month,day,year
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1973-05-01,41.0,190.0,7.4,67,5,1,1973
1973-05-02,36.0,118.0,8.0,72,5,2,1973
1973-05-03,12.0,149.0,12.6,74,5,3,1973
1973-05-04,18.0,313.0,11.5,62,5,4,1973
1973-05-05,,,14.3,56,5,5,1973
...,...,...,...,...,...,...,...
1973-09-26,30.0,193.0,6.9,70,9,26,1973
1973-09-27,,145.0,13.2,77,9,27,1973
1973-09-28,14.0,191.0,14.3,75,9,28,1973
1973-09-29,18.0,131.0,8.0,76,9,29,1973


In [15]:
# Print the frame's structural summary (index range, per-column non-null
# counts and dtypes, memory usage); non-null counts below the row total
# reveal which columns contain missing values.
airquality_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 153 entries, 1973-05-01 to 1973-09-30
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ozone    116 non-null    float64
 1   solar_r  146 non-null    float64
 2   wind     153 non-null    float64
 3   temp     153 non-null    int64  
 4   month    153 non-null    int64  
 5   day      153 non-null    int64  
 6   year     153 non-null    int64  
dtypes: float64(3), int64(4)
memory usage: 9.6 KB


In [16]:
# Flag every missing cell, then add the flags down each column to get the
# number of missing entries per variable.
missing_values = airquality_df.isna().sum(axis="index")
print(missing_values)

ozone      37
solar_r     7
wind        0
temp        0
month       0
day         0
year        0
dtype: int64


In [17]:
# Fit an ordinary-least-squares regression of temperature on ozone and show
# only the first summary table (model-level statistics). The reported
# "No. Observations" is 116 while the frame has 153 rows: the rows where
# ozone is missing are left out of the fit.
smf.ols("temp ~ ozone", airquality_df).fit().summary().tables[0]

0,1,2,3
Dep. Variable:,temp,R-squared:,0.488
Model:,OLS,Adj. R-squared:,0.483
Method:,Least Squares,F-statistic:,108.5
Date:,"Fri, 09 May 2025",Prob (F-statistic):,2.93e-18
Time:,19:18:34,Log-Likelihood:,-386.27
No. Observations:,116,AIC:,776.5
Df Residuals:,114,BIC:,782.1
Df Model:,1,,
Covariance Type:,nonrobust,,
