# 06.03 - Tipos de Datos y Limpieza de Texto

**Autor:** Miguel Angel Vazquez Varela  
**Nivel:** Intermedio  
**Tiempo estimado:** 25 min

---

## Que aprenderemos?

- Convertir tipos de datos
- Limpiar strings (espacios, mayusculas)
- Parsear fechas
- Optimizar memoria con categorias

In [1]:
import pandas as pd
import numpy as np

---

## 1. Convertir tipos numericos

In [2]:
# Every column starts out as text, including the numeric-looking ones
df = pd.DataFrame(
    {
        "id": ["1", "2", "3", "4"],
        "duration": ["12.5", "25", "8.0", "invalid"],
        # European format: comma used as the decimal separator
        "distance": ["2,5", "5,0", "1,8", "3,1"],
    }
)

print("Tipos originales:", df.dtypes, sep="\n")

Tipos originales:
id          str
duration    str
distance    str
dtype: object


In [3]:
# Cast the id column from text to integers, then inspect the dtypes
id_numbers = df["id"].astype(int)
df["id"] = id_numbers
df.dtypes

id          int64
duration      str
distance      str
dtype: object

In [4]:
# to_numeric with error handling:
# errors="coerce" turns values that cannot be parsed into NaN
df["duration"] = df["duration"].pipe(pd.to_numeric, errors="coerce")
df

Unnamed: 0,id,duration,distance
0,1,12.5,"2,5"
1,2,25.0,"5,0"
2,3,8.0,"1,8"
3,4,,"3,1"


In [5]:
# European format: swap the decimal comma for a dot, then cast to float
distance_text = df["distance"].str.replace(",", ".")
df["distance"] = distance_text.astype(float)
df

Unnamed: 0,id,duration,distance
0,1,12.5,2.5
1,2,25.0,5.0
2,3,8.0,1.8
3,4,,3.1


---

## 2. Limpiar strings

In [6]:
# Station names and zones with inconsistent whitespace and casing
stations = pd.DataFrame(
    {
        "name": ["  Sol  ", "ATOCHA", "cibeles", "  Retiro ", "Sol"],
        "zone": ["Centro", "centro", "CENTRO", "Parque", "centro"],
    }
)

stations

Unnamed: 0,name,zone
0,"  Sol  ",Centro
1,ATOCHA,centro
2,cibeles,CENTRO
3,"  Retiro ",Parque
4,Sol,centro


In [7]:
# Remove leading and trailing whitespace from the station names
trimmed_names = stations["name"].str.strip()
stations["name"] = trimmed_names
stations

Unnamed: 0,name,zone
0,Sol,Centro
1,ATOCHA,centro
2,cibeles,CENTRO
3,Retiro,Parque
4,Sol,centro


In [8]:
# Normalize both text columns to lowercase
for column in ("name", "zone"):
    stations[column] = stations[column].str.lower()
stations

Unnamed: 0,name,zone
0,sol,centro
1,atocha,centro
2,cibeles,centro
3,retiro,parque
4,sol,centro


In [9]:
# Capitalize (first letter uppercase) both text columns
for column in ("name", "zone"):
    stations[column] = stations[column].str.capitalize()
stations

Unnamed: 0,name,zone
0,Sol,Centro
1,Atocha,Centro
2,Cibeles,Centro
3,Retiro,Parque
4,Sol,Centro


In [10]:
# After normalization the unique values no longer contain near-duplicates
for column in ("name", "zone"):
    print(stations[column].unique())

<StringArray>
['Sol', 'Atocha', 'Cibeles', 'Retiro']
Length: 4, dtype: str
<StringArray>
['Centro', 'Parque']
Length: 2, dtype: str


### Otros metodos de string

In [11]:
names = pd.Series(["Station-Sol", "Station-Atocha", "Station-Retiro"])

# Replace: drop the common prefix
without_prefix = names.str.replace("Station-", "")
print("Replace:", without_prefix, sep="\n")

Replace:
0       Sol
1    Atocha
2    Retiro
dtype: str


In [12]:
# Contains: boolean mask of the names that include "Sol"
has_sol = names.str.contains("Sol")
print("Contains 'Sol':", has_sol, sep="\n")

Contains 'Sol':
0     True
1    False
2    False
dtype: bool


In [13]:
# Extract with a regex: the capture group becomes column 0 of the result
station_part = names.str.extract(r"Station-(\w+)")
print("Extract station name:", station_part, sep="\n")

Extract station name:
        0
0     Sol
1  Atocha
2  Retiro


In [14]:
# Split: expand=True spreads the pieces across separate columns
pieces = names.str.split("-", expand=True)
print("Split by '-':", pieces, sep="\n")

Split by '-':
         0       1
0  Station     Sol
1  Station  Atocha
2  Station  Retiro


---

## 3. Parsear fechas

In [15]:
# The same three dates written in four different textual formats
dates_df = pd.DataFrame(
    {
        "date_us": ["01/15/2024", "02/20/2024", "03/25/2024"],
        "date_eu": ["15-01-2024", "20-02-2024", "25-03-2024"],
        "date_iso": ["2024-01-15", "2024-02-20", "2024-03-25"],
        "datetime": [
            "2024-01-15 10:30:00",
            "2024-02-20 14:45:00",
            "2024-03-25 08:15:00",
        ],
    }
)

dates_df.dtypes

date_us     str
date_eu     str
date_iso    str
datetime    str
dtype: object

In [16]:
# ISO format: parsed without an explicit format string
dates_df["date_iso"] = dates_df["date_iso"].pipe(pd.to_datetime)
dates_df["date_iso"]

0   2024-01-15
1   2024-02-20
2   2024-03-25
Name: date_iso, dtype: datetime64[us]

In [17]:
# US format: month/day/year
dates_df["date_us"] = dates_df["date_us"].pipe(pd.to_datetime, format="%m/%d/%Y")
dates_df["date_us"]

0   2024-01-15
1   2024-02-20
2   2024-03-25
Name: date_us, dtype: datetime64[us]

In [18]:
# EU format: day-month-year
dates_df["date_eu"] = dates_df["date_eu"].pipe(pd.to_datetime, format="%d-%m-%Y")
dates_df["date_eu"]

0   2024-01-15
1   2024-02-20
2   2024-03-25
Name: date_eu, dtype: datetime64[us]

In [19]:
# With a time-of-day component
timestamps = pd.to_datetime(dates_df["datetime"])
dates_df["datetime"] = timestamps
dates_df.dtypes

date_us     datetime64[us]
date_eu     datetime64[us]
date_iso    datetime64[us]
datetime    datetime64[us]
dtype: object

In [20]:
# Extract components through the .dt accessor of the parsed column
stamp = dates_df["datetime"].dt
dates_df["year"] = stamp.year
dates_df["month"] = stamp.month
dates_df["day"] = stamp.day
dates_df["hour"] = stamp.hour
dates_df["weekday"] = stamp.day_name()

component_columns = ["datetime", "year", "month", "day", "hour", "weekday"]
dates_df[component_columns]

Unnamed: 0,datetime,year,month,day,hour,weekday
0,2024-01-15 10:30:00,2024,1,15,10,Monday
1,2024-02-20 14:45:00,2024,2,20,14,Tuesday
2,2024-03-25 08:15:00,2024,3,25,8,Monday


---

## 4. Tipo categoria

In [21]:
# Data with heavily repeated values.
# A seeded generator makes the random sample reproducible between runs;
# the unseeded global np.random state produced a different table each time.
rng = np.random.default_rng(42)
n_trips = 10_000
trips = pd.DataFrame({
    "trip_id": range(1, n_trips + 1),
    "station": rng.choice(["Sol", "Atocha", "Cibeles", "Retiro"], n_trips),
    "user_type": rng.choice(["subscriber", "casual"], n_trips),
})

print("Memoria original:")
# deep=True counts the bytes of the strings themselves, not just the pointers
print(trips.memory_usage(deep=True))

Memoria original:
Index           132
trip_id       80000
station      544999
user_type    570056
dtype: int64


In [22]:
# Convert the repetitive text columns to the category dtype
for column in ("station", "user_type"):
    trips[column] = trips[column].astype("category")

print("Memoria optimizada:", trips.memory_usage(deep=True), sep="\n")

Memoria optimizada:
Index          132
trip_id      80000
station      10218
user_type    10114
dtype: int64


In [23]:
# Show the distinct categories stored for the station column
station_categories = trips["station"].cat.categories
print(station_categories)

Index(['Atocha', 'Cibeles', 'Retiro', 'Sol'], dtype='str')


In [24]:
# Memory comparison.
# Measure the plain-text representation of the same columns instead of
# guessing a bytes-per-string constant, and leave the Index out of both
# figures so that only the column data is compared.
text_columns = ["station", "user_type"]
original = trips[text_columns].astype(str).memory_usage(index=False, deep=True).sum()
optimized = trips[text_columns].memory_usage(index=False, deep=True).sum()
print(f"Reduccion: {(1 - optimized/original)*100:.0f}%")

Reduccion: 98%


---

## 5. Pipeline de limpieza completo

In [25]:
# Dirty input: text ids, an unparseable date, messy station names and
# durations that mix decimal commas with an invalid value
raw_data = pd.DataFrame(
    {
        "id": ["1", "2", "3", "4", "5"],
        "date": ["15/01/2024", "20/01/2024", "invalid", "25/01/2024", "30/01/2024"],
        "station": ["  SOL  ", "atocha", "CIBELES", "retiro  ", "Sol"],
        "duration": ["12", "25,5", "8", "invalid", "15"],
    }
)

print("Datos originales:")
raw_data

Datos originales:


Unnamed: 0,id,date,station,duration
0,1,15/01/2024,"  SOL  ",12
1,2,20/01/2024,atocha,"25,5"
2,3,invalid,CIBELES,8
3,4,25/01/2024,"retiro  ",invalid
4,5,30/01/2024,Sol,15


In [26]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a raw trips table.

    The input frame is not modified. The pipeline can be re-run on its own
    output: a ``duration`` column that is already numeric skips the text
    replacement step instead of failing on the ``.str`` accessor.

    Steps:
        1. ``id``: cast to integer.
        2. ``date``: parse ``dd/mm/YYYY`` text; unparseable values become NaT.
        3. ``station``: trim whitespace and normalize casing
           (``"  SOL  "`` becomes ``"Sol"``).
        4. ``duration``: accept a decimal comma; unparseable values become NaN.
        5. ``station``: store as ``category``.

    Args:
        df: Table with ``id``, ``date``, ``station`` and ``duration`` columns.

    Returns:
        A new DataFrame with the same columns and cleaned dtypes.

    Raises:
        KeyError: If one of the expected columns is missing.
        ValueError: If an ``id`` value cannot be converted to an integer.
    """
    df = df.copy()

    # 1. ID to integer
    df["id"] = df["id"].astype(int)

    # 2. Dates: invalid entries are coerced to NaT rather than raising
    df["date"] = pd.to_datetime(df["date"], format="%d/%m/%Y", errors="coerce")

    # 3. Clean strings
    df["station"] = df["station"].str.strip().str.lower().str.capitalize()

    # 4. Numbers in European format (decimal comma).
    # Only text needs the comma swapped; numeric input goes straight to
    # to_numeric, which keeps the function re-runnable on cleaned data.
    duration = df["duration"]
    if not pd.api.types.is_numeric_dtype(duration):
        duration = duration.str.replace(",", ".", regex=False)
    df["duration"] = pd.to_numeric(duration, errors="coerce")

    # 5. Category
    df["station"] = df["station"].astype("category")

    return df

# Run the pipeline on the dirty table and display the result
clean_df = raw_data.pipe(clean_data)
print("\nDatos limpios:")
clean_df


Datos limpios:


Unnamed: 0,id,date,station,duration
0,1,2024-01-15,Sol,12.0
1,2,2024-01-20,Atocha,25.5
2,3,NaT,Cibeles,8.0
3,4,2024-01-25,Retiro,
4,5,2024-01-30,Sol,15.0


In [27]:
# Confirm the dtype of every cleaned column
print("\nTipos:", clean_df.dtypes, sep="\n")


Tipos:
id                   int64
date        datetime64[us]
station           category
duration           float64
dtype: object


---

## Resumen

| Tarea | Metodo |
|-------|--------|
| String a numero | `pd.to_numeric(errors='coerce')` |
| String a fecha | `pd.to_datetime(format=...)` |
| Quitar espacios | `.str.strip()` |
| Normalizar texto | `.str.lower()`, `.str.capitalize()` |
| Reemplazar | `.str.replace()` |
| Optimizar memoria | `.astype('category')` |

---

**Anterior:** [06.02 - Duplicados y Outliers](06_02_duplicates_outliers.ipynb)  
**Siguiente:** [07.01 - Matplotlib Basico](../07_visualization/07_01_matplotlib_basics.ipynb)