In [73]:
# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualización
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluar linealidad de las relaciones entre las variables
# ------------------------------------------------------------------------------
from scipy.stats import shapiro, kstest

# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

# Gestión de los warnings
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")

In [74]:
df = pd.read_csv("raw_data_limpio.csv") 

In [75]:
df.head(3)

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeecount,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,over18,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearsincurrentrole,yearssincelastpromotion,yearswithcurrmanager,sameasmonthlyincome,datebirth,salary,roledepartament,numberchildren,remotework
0,51,no,non-travel,2015.72,,6,3,,1,1,1,0,,3,5,research director,3,,16280.83,42330.17,7,y,no,13,3.0,3,full time,0,,5,3.0,20,,15,15,16280.83,1972,195370.0,,,yes
1,52,no,non-travel,2063.39,,1,4,life sciences,1,2,3,0,,2,5,manager,3,,,43331.17,0,,,14,3.0,1,,1,34.0,5,3.0,33,,11,9,,1971,199990.0,,,1
2,42,no,travel_rarely,1984.25,research & development,4,2,technical degree,1,3,3,0,,3,5,manager,4,married,,41669.33,1,,no,11,3.0,4,,0,22.0,3,,22,,11,15,,1981,192320.0,manager - research & development,,1


**Transformamos: quitamos $ y cambiamos "," por "."**


`sameasmonthlyincome`, `salary`, `monthlyincome`, `monthlyrate`, `performancerating`, `worklifebalance`, `totalworkingyears`, `yearsincurrentrole`

In [40]:
# Lista de columnas a transformar
lista_col = [
    "salary", "monthlyincome", "monthlyrate",
    "performancerating", "worklifebalance", "totalworkingyears", 
]

# Lista para guardar errores
lista_errores = []

# Función para limpiar y convertir a float
def cambiar_comas(dato):
    if pd.isna(dato):
        return np.nan

    if not isinstance(dato, str):
        return float(dato)

    dato_limpio = dato.replace("$", "").replace(",", ".").strip()
    
    if dato_limpio == "":
        return np.nan

    try:
        return float(dato_limpio)
    except:
        lista_errores.append(dato)
        return np.nan




In [41]:
for col in lista_col:
    if col in df.columns:
        df[col] = df[col].apply(cambiar_comas)


In [42]:
df[["salary", "monthlyincome", "monthlyrate", "performancerating", "worklifebalance", "totalworkingyears"]]

Unnamed: 0,salary,monthlyincome,monthlyrate,performancerating,worklifebalance,totalworkingyears
0,195370.00,16280.83,42330.17,3.0,3.0,
1,199990.00,,43331.17,3.0,3.0,34.0
2,192320.00,,41669.33,3.0,,22.0
3,171690.00,14307.50,37199.50,3.0,,
4,,12783.92,33238.20,3.0,3.0,
...,...,...,...,...,...,...
1673,,3949.17,10267.83,3.0,3.0,
1674,191324.62,15943.72,41453.67,3.0,3.0,27.0
1675,28111.13,,6090.75,3.0,3.0,6.0
1676,100071.84,8339.32,21682.23,,3.0,


**Transformamos: redondeamos a 2 decimales `hourlyrate`, `dailyrate`**

In [43]:
# Lista de columnas a redondear
columnas_a_redondear = ["hourlyrate", "dailyrate"]

# Función segura para redondear
def redondear_dos_decimales(valor):
    if pd.isna(valor):
        return np.nan  # conserva NaN o None sin error
    
    try:
        return round(float(valor), 2)
    except:
        return np.nan

# Aplicar a todas las columnas de la lista
for col in columnas_a_redondear:
    if col in df.columns:
        df[col] = df[col].apply(redondear_dos_decimales)


In [44]:
df[["dailyrate", "hourlyrate"]]

Unnamed: 0,dailyrate,hourlyrate
0,2015.72,
1,2063.39,
2,1984.25,
3,1771.40,
4,1582.77,
...,...,...
1673,488.94,
1674,1973.98,
1675,290.04,
1676,1032.49,


**Calculamos y rellenamos las columnas ``salary``, ``hourlyrate`` y ``monthlyincome``**

In [45]:
# Rellenar hourlyrate donde esté vacío usando dailyrate / 8
df["hourlyrate"] = df["hourlyrate"].fillna(df["dailyrate"] / 8)


In [46]:
# Rellenar salary donde esté vacío usando monthlyincome * 12
df["salary"] = df["salary"].fillna(df["monthlyincome"] * 12)

In [47]:
# Rellenar monthlyincome donde esté vacío usando salary / 12
df["monthlyincome"] = df["monthlyincome"].fillna(df["salary"] / 12)

**Transformamos: modificamos columnas a minuscula**


`"department`, `educationfield`, `attrition`, `jobrole`, `maritalstatus`, `over18`, `overtime`, `standardhours`, `roledepartament`, `remotework`

In [48]:
cambios_a_minuscula = ["department", "educationfield", "attrition", "jobrole", "maritalstatus", "over18", "overtime", "standardhours", "roledepartament", "remotework"]

In [49]:
for col in cambios_a_minuscula:
    df[col] = df[col].apply(lambda dato: dato.lower() if type(dato) == str else dato)

In [50]:
df[["department", "educationfield", "attrition", "jobrole", "maritalstatus", "over18", "overtime", "standardhours", "roledepartament", "remotework"]]

Unnamed: 0,department,educationfield,attrition,jobrole,maritalstatus,over18,overtime,standardhours,roledepartament,remotework
0,,,no,research director,,y,no,full time,,yes
1,,life sciences,no,manager,,,,,,1
2,research & development,technical degree,no,manager,married,,no,,manager - research & development,1
3,,medical,no,research director,married,y,,full time,,false
4,,technical degree,no,sales executive,divorced,y,no,,,0
...,...,...,...,...,...,...,...,...,...,...
1673,,medical,no,research scientist,single,,,,,yes
1674,,,no,manager,married,y,no,full time,,false
1675,,,no,research scientist,,,no,part time,,false
1676,,life sciences,no,manufacturing director,divorced,,yes,part time,,yes


**Transformamos: espaciado sobrante en valores de la columna ``department``, ``jobrole`` y ``educationfield``**

In [51]:
print(df['department'].unique())
print('-------------')
print(df['jobrole'].unique())
print(df['educationfield'].unique())

[nan 'research & development' 'sales' 'human resources']
-------------
['research director' 'manager' 'sales executive' 'manufacturing director'
 'research scientist' 'healthcare representative' 'laboratory technician'
 'sales representative' 'human resources']
[nan 'life sciences' 'technical degree' 'medical' 'other' 'marketing'
 'human resources']


In [52]:
def sin_espaciado_extra(series):
    return series.str.strip().str.replace(r'\s+', ' ', regex=True)

columnas_a_limpiar = ['department', 'jobrole', 'educationfield']  # Ejemplo de columnas

for c in columnas_a_limpiar:

    if c in df.columns:  
        df[c] = sin_espaciado_extra(df[c])


In [53]:
print(df['department'].unique())
print('-------------')
print(df['jobrole'].unique())
print('-------------')
print(df['educationfield'].unique())

# no nos hace falta limpiar 'roledepartment' porque está destinada a eliminación

[nan 'research & development' 'sales' 'human resources']
-------------
['research director' 'manager' 'sales executive' 'manufacturing director'
 'research scientist' 'healthcare representative' 'laboratory technician'
 'sales representative' 'human resources']
-------------
[nan 'life sciences' 'technical degree' 'medical' 'other' 'marketing'
 'human resources']


**Transformamos: añadimos información a columna``department`` con información de ``jobrole``**

In [54]:
keywords = {
    "sales": ["sales"],
    "human resources": ["human"],
    "research & development": ["laboratory", "manufacturing", "research", "healthcare"]
}


In [55]:
def infer_department(jobrole, keywords_dict):
    if pd.isna(jobrole):
        return np.nan

    for dept, words in keywords_dict.items():
        for w in words:
            if w in jobrole:
                return dept

    return np.nan  # si no coincide con ninguna de las 3

In [56]:
df["department"] = df["jobrole"].apply(lambda x: infer_department(x, keywords))

In [57]:
df[["department", "jobrole", "roledepartament", "joblevel"]].sample(20)

Unnamed: 0,department,jobrole,roledepartament,joblevel
351,research & development,healthcare representative,,2
333,human resources,human resources,,3
94,research & development,healthcare representative,,3
857,research & development,research scientist,,1
1293,research & development,manufacturing director,,2
100,sales,sales representative,,1
1086,,manager,,5
169,,manager,,5
1075,research & development,research scientist,research scientist - research & development,2
1612,sales,sales executive,,2


**Transformamos: columna `maritalstatus` 1 dato mal escritos**

In [58]:
df["maritalstatus"].value_counts()

maritalstatus
married     455
single      343
divorced    205
Name: count, dtype: int64

In [59]:
df["maritalstatus"] = df["maritalstatus"].replace("marreid", "married")

In [60]:
df["maritalstatus"].value_counts()

maritalstatus
married     455
single      343
divorced    205
Name: count, dtype: int64

**REPLACE `maritalstatus`, `overtime`, `department` - NaN por "unknown"**

In [61]:
df = df.fillna({'maritalstatus': 'unknown', 'overtime': 'unknown', 'department': 'unknown'})

In [62]:
df[["maritalstatus", "overtime", "department"]].value_counts().reset_index()

Unnamed: 0,maritalstatus,overtime,department,count
0,unknown,no,research & development,187
1,unknown,unknown,research & development,165
2,married,no,research & development,123
3,married,unknown,research & development,120
4,single,no,research & development,89
5,unknown,unknown,sales,87
6,single,unknown,research & development,84
7,unknown,no,sales,65
8,married,unknown,sales,60
9,divorced,no,research & development,59


**Transformamos: valores NaN por 'non-travel' en columna ``businesstravel`` basados en la información que nos ha proporcionado nuestro enlace con el proyecto**

In [63]:
df["businesstravel"] = df["businesstravel"].fillna("non-travel")

In [64]:
df['businesstravel'].value_counts()

businesstravel
non-travel           894
travel_rarely        616
travel_frequently    168
Name: count, dtype: int64

**DROP Columns - `employeecount`, `sameasmonthlyincome`,`numberchildren`, `over18`, `yearsincurrentrole`**

In [65]:
df = df.drop(['employeecount', 'sameasmonthlyincome', 'numberchildren', 'over18', 'yearsincurrentrole','roledepartament'], axis=1)

In [66]:
df.head(2)

Unnamed: 0,age,attrition,businesstravel,dailyrate,department,distancefromhome,education,educationfield,employeenumber,environmentsatisfaction,gender,hourlyrate,jobinvolvement,joblevel,jobrole,jobsatisfaction,maritalstatus,monthlyincome,monthlyrate,numcompaniesworked,overtime,percentsalaryhike,performancerating,relationshipsatisfaction,standardhours,stockoptionlevel,totalworkingyears,trainingtimeslastyear,worklifebalance,yearsatcompany,yearssincelastpromotion,yearswithcurrmanager,datebirth,salary,remotework
0,51,no,non-travel,2015.72,research & development,6,3,,1,1,0,251.965,3,5,research director,3,unknown,16280.83,42330.17,7,no,13,3.0,3,full time,0,,5,3.0,20,15,15,1972,195370.0,yes
1,52,no,non-travel,2063.39,unknown,1,4,life sciences,2,3,0,257.92375,2,5,manager,3,unknown,16665.833333,43331.17,0,unknown,14,3.0,1,,1,34.0,5,3.0,33,11,9,1971,199990.0,1


**KEEP en BBDD pero no en CSV:**
- numberchildren (todos nan, pero puede ser interesante para la empresa en el futuro)

**DROP Duplicate Rows - `employeenumber` - keep first, reset index**

Duplicados borrados, quardamos solo el primero, reset indice del df.

In [67]:
df = df.drop_duplicates(subset='employeenumber', keep='first').reset_index(drop=True)

In [68]:
df['employeenumber'].duplicated().any()

np.False_

**REPLACE `genders` - 0 = m, 1 = f**

In [69]:
df['gender'] = df['gender'].replace({0: 'm', 1: 'f'})

**FILLNA `standardhours` - NaN = full time**

In [70]:
df['standardhours'] = df['standardhours'].fillna('full time')

**GUARDAR datos limpios a .csv nuevo "raw_data_limpio.csv"**

In [71]:
df.to_csv("raw_data_limpio_2.csv", index=False)

In [72]:
df.shape
# raw_data.csv original: 1678 rows, 41 columns
# raw_data_limpio_2.csv: 1614 rows, 35 columns

(1614, 35)