## 🎯 Infraestructura: caso práctico de limpieza y unificación de datasets

**📅 Fecha:** 2025

## **Librerias**

In [165]:
import pandas as pd
import numpy as np
import csv
import csv
import matplotlib.pyplot as plt
from IPython.display import display
from IPython.display import display, Markdown
from thefuzz import process
pd.set_option('future.no_silent_downcasting', True)

## **Carga de datos**

In [166]:
data = "C:/Users/mafab/Documents/Curso_Infraestructura/data/data.txt"
data1 = "C:/Users/mafab/Documents/Curso_Infraestructura/data/data1.txt"
data3 = "C:/Users/mafab/Documents/Curso_Infraestructura/data/data3.txt"

In [167]:
def procesar_archivo(data):
    rows = []
    with open(data, 'r', encoding='utf-8') as f:
        reader = csv.reader(f)
        for row in reader:
            rows.append(row)

    # Crear DataFrame aunque las filas tengan distinto largo, llenando con None
    max_cols = max(len(r) for r in rows)
    for r in rows:
        while len(r) < max_cols:
            r.append(None)

    df = pd.DataFrame(rows[1:], columns=rows[0])
    return df

## **Inspección de los dfs**

In [168]:
df = procesar_archivo(data)
print("Las dimensiones del archivo son :", df.shape)
print("="*40)
print(df.info())

Las dimensiones del archivo son : (20, 8)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          19 non-null     object
 1    Age          19 non-null     object
 2    Height (cm)  19 non-null     object
 3    Weight (kg)  19 non-null     object
 4    Salary ($)   19 non-null     object
 5    City         19 non-null     object
 6   None          3 non-null      object
 7   None          2 non-null      object
dtypes: object(8)
memory usage: 1.4+ KB
None


In [169]:
df1 = procesar_archivo(data1)
print("Las dimensiones del archivo 2 son :", df1.shape)
print("="*40)
print(df.info())

Las dimensiones del archivo 2 son : (10, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          19 non-null     object
 1    Age          19 non-null     object
 2    Height (cm)  19 non-null     object
 3    Weight (kg)  19 non-null     object
 4    Salary ($)   19 non-null     object
 5    City         19 non-null     object
 6   None          3 non-null      object
 7   None          2 non-null      object
dtypes: object(8)
memory usage: 1.4+ KB
None


In [170]:
df3 = procesar_archivo(data3)
print("Las dimensiones del archivo 3 son :", df3.shape)
print("="*40)
print(df.info())

Las dimensiones del archivo 3 son : (29, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Name          19 non-null     object
 1    Age          19 non-null     object
 2    Height (cm)  19 non-null     object
 3    Weight (kg)  19 non-null     object
 4    Salary ($)   19 non-null     object
 5    City         19 non-null     object
 6   None          3 non-null      object
 7   None          2 non-null      object
dtypes: object(8)
memory usage: 1.4+ KB
None


## **Prepración y limpieza de datos**

### **Primer archivo = DF**

#### Eliminar espacios en los nombres de las columnas

In [171]:
def eliminar_espacios_columnas(df):
    df.columns = df.columns.str.strip()
    return df
df = eliminar_espacios_columnas(df)

#### Eliminación de comillas dobles, comas, espacios e.t.c

In [172]:
def limpiar_valores(df):
    df = df.map(lambda x: str(x).replace('"', '').replace("'", '').strip() if pd.notnull(x) else x)
    df = df.replace(r'^\s*$', np.nan, regex=True)
    return df
df = limpiar_valores(df)

#### Reemplazar texto en columnas que deberían ser numéricas

In [173]:
# Lista para agregar texto a reemplazar
text_to_num = {'Twenty-Five': 25}

df['Age'] = df['Age'].replace(text_to_num)

In [174]:
display(df)

Unnamed: 0,Name,Age,Height (cm),Weight (kg),Salary ($),City,None,None.1
0,Alice,30.0,165.5,60.2,50000.0,New York,,
1,Bob,25.0,175.0,75.0,60000.0,Los Angeles,,
2,Charlie,32.0,170.5,,55000.0,Chicago,,
3,David,26.0,180.0,85.5,59000.0,San Francisco,,
4,Eve,25.0,160.0,58.0,52000.0,Bristol,,
5,Frank,35.0,175.5,70.0,,Washington,D.C.,
6,Grace,29.0,,65.0,51000.0,Huston,,
7,Heidi,28.0,168.0,0.0,55000.0,Houston,,
8,Ivan,34.0,185.0,95.0,68000.0,Miani,,
9,Jack,27.0,172.5,70.5,54000.0,Boston,,


#### Eliminación de columna irrelevantes

* **Nota:** se procede a eliminar tanto las filas como las columnas con valores iguales de NONE

In [175]:
df = df.dropna(axis=0, how='all')
df = df.iloc[:, :-2]
df.tail(5)

Unnamed: 0,Name,Age,Height (cm),Weight (kg),Salary ($),City
14,Oliver,30.0,173.0,74.0,58000.0,New York
15,Penny,28.0,169.5,63.5,56000.0,Los Angeles
16,Quinn,,172.0,77.5,57000.0,
17,Ryan,150.0,181.0,92.0,,Chicago
18,Sophia,33.0,167.5,62.0,62000.0,Miami


#### Identificación de valores faltantes

In [176]:
def resumen_nulos(df):
  Conteo = df.isnull().sum()
  Porcentaje = df.isnull().mean()*100
  resumen_nulos = pd.DataFrame({'Conteo': Conteo, 'Porcentaje': Porcentaje})
  resumen_nulos.sort_values(by='Porcentaje', ascending=False)
  return resumen_nulos

#### Imputación de valores faltantes

In [177]:
imputar_mediana = ['Age', 'Height (cm)', 'Weight (kg)', 'Salary ($)']
for columna in imputar_mediana:
  mediana =  df[columna].astype(float).median()
  df[columna] =  df[columna].fillna(mediana)

In [178]:
imputar_moda = ['City']
for columna in imputar_moda:
  moda = df[columna].mode()[0]
  df[columna] = df[columna].fillna(moda)

#### Cambiar tipo de formato variables

In [179]:
columnas_a_int = ['Age', 'Salary ($)']
for columna in columnas_a_int:
    df[columna] = pd.to_numeric(df[columna], errors='coerce').astype(int)
    print(f"Columna '{columna}' convertida a tipo numérico (con NaNs para errores).")

Columna 'Age' convertida a tipo numérico (con NaNs para errores).
Columna 'Salary ($)' convertida a tipo numérico (con NaNs para errores).


In [180]:
columnas_a_float = ['Height (cm)', 'Weight (kg)']
for columna in columnas_a_float:
    df[columna] = pd.to_numeric(df[columna], errors='coerce')
    print(f"Columna '{columna}' convertida a tipo flotante (con NaNs para errores).")

Columna 'Height (cm)' convertida a tipo flotante (con NaNs para errores).
Columna 'Weight (kg)' convertida a tipo flotante (con NaNs para errores).


In [181]:
df["Height (cm)"] = df["Height (cm)"].round(2)
df["Weight (kg)"] = df["Weight (kg)"].round(2)

In [182]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 19 entries, 0 to 18
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         19 non-null     object 
 1   Age          19 non-null     int64  
 2   Height (cm)  19 non-null     float64
 3   Weight (kg)  19 non-null     float64
 4   Salary ($)   19 non-null     int64  
 5   City         19 non-null     object 
dtypes: float64(2), int64(2), object(2)
memory usage: 1.0+ KB


#### Convertir a minusculas las variables tipo object

In [183]:
for col in df.select_dtypes(include='object').columns:
  df[col] = df[col].str.lower()

#### Normalización de datos

In [184]:
df['City'].value_counts()

City
chicago           4
new york          2
los angeles       2
san francisco     1
bristol           1
washington        1
huston            1
houston           1
miani             1
boston            1
seattle           1
san diego         1
san  francisco    1
miami             1
Name: count, dtype: int64

In [185]:
# Opción 1 para corregir los problemas de digitación en las ciudades
# correcciones = {"miani": "miami", "huston" : "houston"}
# df['City'] = df['City'].replace(correcciones)
# df['City'] = df['City'].apply(correct_city)

In [186]:
# Opción 2 para corregir los problemas de digitación en las ciudades: Versión basada en threshold del score, este se puede ajustar para aceptar más correciones automáticas. Sin embargo, es necesario estar pendiente porque
#entre más bajo el score más correciones acpeta y se puede tener “falsos positivos”

unique_cities = df['City'].unique()
standard_cities = ['chicago', 'new york', 'los angeles', 'san francisco', 'boston', 'houston', 'miami', 'seattle', 'san diego', 'washington', 'bristol']

corrections = {}
for city in df['City'].unique():
    match, score = process.extractOne(city, standard_cities)
    corrections[city] = match if score > 70 else city

# Revisa el diccionario corrections y ajusta manualmente solo lo que sea necesario,
# por ejemplo, cambia valores erróneos o deja sin cambiar

df['City'] = df['City'].replace(corrections)

In [187]:
df['City'].value_counts()

City
chicago          4
new york         2
los angeles      2
san francisco    2
houston          2
miami            2
bristol          1
washington       1
boston           1
seattle          1
san diego        1
Name: count, dtype: int64

### **Segundo archivo = DF1**

#### Información general dataset

In [188]:
print(df1.shape)
print ("-"*40)
df1.info()

(10, 7)
----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Name                10 non-null     object
 1    Age                10 non-null     object
 2    "Height (inches)"  10 non-null     object
 3    "Weight (pounds)"  10 non-null     object
 4    "Salary ($)"       10 non-null     object
 5    "City"             10 non-null     object
 6   None                1 non-null      object
dtypes: object(7)
memory usage: 692.0+ bytes


In [189]:
display(df1)

Unnamed: 0,Name,Age,"""Height (inches)""","""Weight (pounds)""","""Salary ($)""","""City""",None
0,Amy,30,65.5,132.3,50000,"""New York""",
1,Ben,25,68.0,165.3,60000,"""Los Angeles""",
2,Charlie,32,67.0,180.5,55000,"""Chicago""",
3,David,26,70.0,188.7,59000,"""San Francisco""",
4,Ella,28,61.0,126.0,52000,"""Bristol""",
5,Frank,35,69.0,154.3,58000,"""Washington","D.C."""
6,Grace,29,64.0,143.5,51000,"""Houston""",
7,Henry,28,66.1,110.2,55000,"""Houston""",
8,Ivy,34,72.0,209.0,68000,"""Miami""",
9,Jack,27,68.0,155.5,54000,"""Boston""",


#### Eliminar espacios en los nombres de las columnas

In [190]:
df1 = eliminar_espacios_columnas(df1)
df1.columns = df1.columns.str.replace('"', '')

#### Eliminación de comillas dobles, comas, espacios e.t.c

In [191]:
df1 = limpiar_valores(df1)

#### Eliminación de columna irrelevantes

In [192]:
df1 = df1.iloc[:, :-1]
df1.tail(5)

Unnamed: 0,Name,Age,Height (inches),Weight (pounds),Salary ($),City
5,Frank,35,69.0,154.3,58000,Washington
6,Grace,29,64.0,143.5,51000,Houston
7,Henry,28,66.1,110.2,55000,Houston
8,Ivy,34,72.0,209.0,68000,Miami
9,Jack,27,68.0,155.5,54000,Boston


#### Identificación de valores faltantes

In [193]:
resumen_nulos(df1)

Unnamed: 0,Conteo,Porcentaje
Name,0,0.0
Age,0,0.0
Height (inches),0,0.0
Weight (pounds),0,0.0
Salary ($),0,0.0
City,0,0.0


In [194]:
print(df1["Height (inches)"].dtype)
print(df1["Weight (pounds)"].dtype)

object
object


#### Cambiar tipo de formato variables

In [195]:
columnas_a_int = ['Age', 'Salary ($)']
for columna in columnas_a_int:
    df1[columna] = pd.to_numeric(df1[columna], errors='coerce').astype(int)
    print(f"Columna '{columna}' convertida a tipo numérico (con NaNs para errores).")

Columna 'Age' convertida a tipo numérico (con NaNs para errores).
Columna 'Salary ($)' convertida a tipo numérico (con NaNs para errores).


In [196]:
columnas_a_float = ['Height (inches)', 'Weight (pounds)']
for columna in columnas_a_float:
    df1[columna] = pd.to_numeric(df1[columna], errors='coerce')
    print(f"Columna '{columna}' convertida a tipo flotante (con NaNs para errores).")

Columna 'Height (inches)' convertida a tipo flotante (con NaNs para errores).
Columna 'Weight (pounds)' convertida a tipo flotante (con NaNs para errores).


In [197]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Name             10 non-null     object 
 1   Age              10 non-null     int64  
 2   Height (inches)  10 non-null     float64
 3   Weight (pounds)  10 non-null     float64
 4   Salary ($)       10 non-null     int64  
 5   City             10 non-null     object 
dtypes: float64(2), int64(2), object(2)
memory usage: 612.0+ bytes


#### Convertir las variables Height y Weight a (cm) y (kg) respectivamente

In [198]:
df1.columns

Index(['Name', 'Age', 'Height (inches)', 'Weight (pounds)', 'Salary ($)',
       'City'],
      dtype='object')

In [199]:
# Obtener los índices de las columnas originales
height_inches_index = df1.columns.get_loc('Height (inches)')
weight_pounds_index = df1.columns.get_loc('Weight (pounds)')

# Convertir altura de pulgadas a cm (1 pulgada = 2.54 cm)
df1['Height (cm)'] = df1['Height (inches)'] * 2.54

# Convertir peso de libras a kg (1 libra = 0.453592 kg)
df1['Weight (kg)'] = df1['Weight (pounds)'] * 0.453592

# Eliminar las columnas originales
df1 = df1.drop(columns=['Height (inches)', 'Weight (pounds)'])

# Insertar las nuevas columnas en las posiciones originales
df1.insert(height_inches_index, 'Height (cm)', df1.pop('Height (cm)'))
df1.insert(weight_pounds_index, 'Weight (kg)', df1.pop('Weight (kg)'))

# Redondear para evitar problemas de precisión
df1["Height (cm)"] = df1["Height (cm)"].round(2)
df1["Weight (kg)"] = df1["Weight (kg)"].round(2)

# Mostrar información actualizada del dataframe para ver los cambios
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Name         10 non-null     object 
 1   Age          10 non-null     int64  
 2   Height (cm)  10 non-null     float64
 3   Weight (kg)  10 non-null     float64
 4   Salary ($)   10 non-null     int64  
 5   City         10 non-null     object 
dtypes: float64(2), int64(2), object(2)
memory usage: 612.0+ bytes


#### Convertir a minusculas las variables tipo object

In [200]:
for col in df1.select_dtypes(include='object').columns:
  df1[col] = df1[col].str.lower()

In [201]:
display(df1)

Unnamed: 0,Name,Age,Height (cm),Weight (kg),Salary ($),City
0,amy,30,166.37,60.01,50000,new york
1,ben,25,172.72,74.98,60000,los angeles
2,charlie,32,170.18,81.87,55000,chicago
3,david,26,177.8,85.59,59000,san francisco
4,ella,28,154.94,57.15,52000,bristol
5,frank,35,175.26,69.99,58000,washington
6,grace,29,162.56,65.09,51000,houston
7,henry,28,167.89,49.99,55000,houston
8,ivy,34,182.88,94.8,68000,miami
9,jack,27,172.72,70.53,54000,boston


### **Tercer archivo = DF3**

#### Información general dataset

In [202]:
print(df3.shape)
print ("-"*40)
df3.info()

(29, 4)
----------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29 entries, 0 to 28
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Name               29 non-null     object
 1    Sex               29 non-null     object
 2    "Marital Status"  29 non-null     object
 3    Career            29 non-null     object
dtypes: object(4)
memory usage: 1.0+ KB


In [203]:
display(df3)

Unnamed: 0,Name,Sex,"""Marital Status""",Career
0,Alice,Female,Single,Data Scientist
1,Bob,Male,Single,Software Engineer
2,Charlie,Male,Married,Doctor
3,David,Male,Single,Lawyer
4,Eve,Female,Divorced,Artist
5,Frank,Male,Married,Engineer
6,Grace,Female,Single,Nurse
7,Heidi,Female,Married,Teacher
8,Ivan,Male,Married,Accountant
9,Jack,Male,Single,IT Specialist


#### Eliminar espacios en los nombres de las columnas

In [204]:
df3 = eliminar_espacios_columnas(df3)
df3.columns = df3.columns.str.replace('"', '')

#### Eliminación de comillas dobles, comas, espacios e.t.c

In [205]:
df3 = limpiar_valores(df3)

#### Identificación de valores faltantes

In [206]:
resumen_nulos(df3)

Unnamed: 0,Conteo,Porcentaje
Name,0,0.0
Sex,0,0.0
Marital Status,0,0.0
Career,0,0.0


#### Convertir a minusculas las variables tipo object

In [207]:
for col in df3.select_dtypes(include='object').columns:
  df3[col] = df3[col].str.lower()

#### Verificación de datos de las columnas tipo object para normalización

In [208]:
variables = ['Sex', 'Marital Status', 'Career']
for variable in variables:
    print(f"Conteos para {variable}:")
    print(df3[variable].value_counts())
    print("="*40)

Conteos para Sex:
Sex
male          15
female        13
non-binary     1
Name: count, dtype: int64
Conteos para Marital Status:
Marital Status
single      16
married     11
divorced     2
Name: count, dtype: int64
Conteos para Career:
Career
nurse                3
artist               2
lawyer               2
engineer             2
doctor               2
accountant           2
it specialist        2
software engineer    1
data scientist       1
teacher              1
marketing            1
researcher           1
data analyst         1
architect            1
graphic designer     1
writer               1
financial analyst    1
psychologist         1
sales manager        1
marketing manager    1
professor            1
Name: count, dtype: int64


In [209]:
display(df3)

Unnamed: 0,Name,Sex,Marital Status,Career
0,alice,female,single,data scientist
1,bob,male,single,software engineer
2,charlie,male,married,doctor
3,david,male,single,lawyer
4,eve,female,divorced,artist
5,frank,male,married,engineer
6,grace,female,single,nurse
7,heidi,female,married,teacher
8,ivan,male,married,accountant
9,jack,male,single,it specialist


## **Unificación de dataset**

#### **Nota**: Primero se unifica el df con el df1, ya que tienen la misma estructura

In [210]:
display(df)

Unnamed: 0,Name,Age,Height (cm),Weight (kg),Salary ($),City
0,alice,30,165.5,60.2,50000,new york
1,bob,25,175.0,75.0,60000,los angeles
2,charlie,32,170.5,67.5,55000,chicago
3,david,26,180.0,85.5,59000,san francisco
4,eve,25,160.0,58.0,52000,bristol
5,frank,35,175.5,70.0,56000,washington
6,grace,29,171.25,65.0,51000,houston
7,heidi,28,168.0,0.0,55000,houston
8,ivan,34,185.0,95.0,68000,miami
9,jack,27,172.5,70.5,54000,boston


In [211]:
display(df1)

Unnamed: 0,Name,Age,Height (cm),Weight (kg),Salary ($),City
0,amy,30,166.37,60.01,50000,new york
1,ben,25,172.72,74.98,60000,los angeles
2,charlie,32,170.18,81.87,55000,chicago
3,david,26,177.8,85.59,59000,san francisco
4,ella,28,154.94,57.15,52000,bristol
5,frank,35,175.26,69.99,58000,washington
6,grace,29,162.56,65.09,51000,houston
7,henry,28,167.89,49.99,55000,houston
8,ivy,34,182.88,94.8,68000,miami
9,jack,27,172.72,70.53,54000,boston


In [212]:
df_1 = pd.concat([df,df1], axis=0, join="outer", ignore_index=True)
print(df_1.shape)
print("="*40)
display(df_1)

(29, 6)


Unnamed: 0,Name,Age,Height (cm),Weight (kg),Salary ($),City
0,alice,30,165.5,60.2,50000,new york
1,bob,25,175.0,75.0,60000,los angeles
2,charlie,32,170.5,67.5,55000,chicago
3,david,26,180.0,85.5,59000,san francisco
4,eve,25,160.0,58.0,52000,bristol
5,frank,35,175.5,70.0,56000,washington
6,grace,29,171.25,65.0,51000,houston
7,heidi,28,168.0,0.0,55000,houston
8,ivan,34,185.0,95.0,68000,miami
9,jack,27,172.5,70.5,54000,boston


#### Verificación de duplicados en la columna Name

In [213]:
# Verificar filas duplicadas basadas en la columna 'Name'
duplicates_by_name = df_1[df_1.duplicated(subset=['Name'], keep=False)]

# Mostrar las filas duplicadas
print("Entradas duplicadas basadas en Nombre:")
display(duplicates_by_name.sort_values(by='Name'))

Entradas duplicadas basadas en Nombre:


Unnamed: 0,Name,Age,Height (cm),Weight (kg),Salary ($),City
2,charlie,32,170.5,67.5,55000,chicago
21,charlie,32,170.18,81.87,55000,chicago
3,david,26,180.0,85.5,59000,san francisco
22,david,26,177.8,85.59,59000,san francisco
5,frank,35,175.5,70.0,56000,washington
24,frank,35,175.26,69.99,58000,washington
6,grace,29,171.25,65.0,51000,houston
25,grace,29,162.56,65.09,51000,houston
9,jack,27,172.5,70.5,54000,boston
28,jack,27,172.72,70.53,54000,boston


#### Se realiza el merge entre df_1 y df3

In [214]:
df_1.columns

Index(['Name', 'Age', 'Height (cm)', 'Weight (kg)', 'Salary ($)', 'City'], dtype='object')

In [215]:
df3.columns

Index(['Name', 'Sex', 'Marital Status', 'Career'], dtype='object')

In [216]:
df_final = df3.merge(df_1, on ='Name', how = "left")
df_final

Unnamed: 0,Name,Sex,Marital Status,Career,Age,Height (cm),Weight (kg),Salary ($),City
0,alice,female,single,data scientist,30,165.5,60.2,50000,new york
1,bob,male,single,software engineer,25,175.0,75.0,60000,los angeles
2,charlie,male,married,doctor,32,170.5,67.5,55000,chicago
3,charlie,male,married,doctor,32,170.18,81.87,55000,chicago
4,david,male,single,lawyer,26,180.0,85.5,59000,san francisco
5,david,male,single,lawyer,26,177.8,85.59,59000,san francisco
6,eve,female,divorced,artist,25,160.0,58.0,52000,bristol
7,frank,male,married,engineer,35,175.5,70.0,56000,washington
8,frank,male,married,engineer,35,175.26,69.99,58000,washington
9,grace,female,single,nurse,29,171.25,65.0,51000,houston


#### Se eliminan duplicados del df_final

In [217]:
df_final = df_final.drop_duplicates()
df_final

Unnamed: 0,Name,Sex,Marital Status,Career,Age,Height (cm),Weight (kg),Salary ($),City
0,alice,female,single,data scientist,30,165.5,60.2,50000,new york
1,bob,male,single,software engineer,25,175.0,75.0,60000,los angeles
2,charlie,male,married,doctor,32,170.5,67.5,55000,chicago
3,charlie,male,married,doctor,32,170.18,81.87,55000,chicago
4,david,male,single,lawyer,26,180.0,85.5,59000,san francisco
5,david,male,single,lawyer,26,177.8,85.59,59000,san francisco
6,eve,female,divorced,artist,25,160.0,58.0,52000,bristol
7,frank,male,married,engineer,35,175.5,70.0,56000,washington
8,frank,male,married,engineer,35,175.26,69.99,58000,washington
9,grace,female,single,nurse,29,171.25,65.0,51000,houston


#### Se eliminan duplicados por Name y se ordenan alfabeticamente

In [218]:
df_final = df_final.drop_duplicates(["Name"])
df_final = df_final.sort_values(by="Name")
df_final

Unnamed: 0,Name,Sex,Marital Status,Career,Age,Height (cm),Weight (kg),Salary ($),City
0,alice,female,single,data scientist,30,165.5,60.2,50000,new york
24,amy,female,married,sales manager,30,166.37,60.01,50000,new york
25,ben,male,single,marketing manager,25,172.72,74.98,60000,los angeles
1,bob,male,single,software engineer,25,175.0,75.0,60000,los angeles
2,charlie,male,married,doctor,32,170.5,67.5,55000,chicago
4,david,male,single,lawyer,26,180.0,85.5,59000,san francisco
30,ella,female,divorced,artist,28,154.94,57.15,52000,bristol
6,eve,female,divorced,artist,25,160.0,58.0,52000,bristol
7,frank,male,married,engineer,35,175.5,70.0,56000,washington
9,grace,female,single,nurse,29,171.25,65.0,51000,houston
