In [118]:
import pandas as pd

In [119]:
people = pd.read_csv("../data/raw/people.csv")
salary = pd.read_csv("../data/raw/salary.csv")
descriptions = pd.read_csv("../data/raw/descriptions.csv")

In [120]:
people.head()

Unnamed: 0,id,Age,Gender,Education Level,Job Title,Years of Experience
0,0,32.0,Male,Bachelor's,Software Engineer,5.0
1,1,28.0,Female,Master's,Data Analyst,3.0
2,2,45.0,Male,PhD,Senior Manager,15.0
3,3,36.0,Female,Bachelor's,Sales Associate,7.0
4,4,52.0,Male,Master's,Director,20.0


In [121]:
salary.head()

Unnamed: 0,id,Salary
0,0,90000.0
1,1,65000.0
2,2,150000.0
3,3,60000.0
4,4,200000.0


In [122]:
descriptions.head()

Unnamed: 0,id,Description
0,0,I am a 32-year-old male working as a Software ...
1,1,I am a 28-year-old data analyst with a Master'...
2,2,I am a 45-year-old Senior Manager with a PhD a...
3,3,I am a 36-year-old female Sales Associate with...
4,4,I am a 52-year-old male with over two decades ...


In [123]:
df = (
    people
    .merge(salary, on="id", how="left")
    .merge(descriptions, on="id", how="left")
)
df.head()

Unnamed: 0,id,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Description
0,0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0,I am a 32-year-old male working as a Software ...
1,1,28.0,Female,Master's,Data Analyst,3.0,65000.0,I am a 28-year-old data analyst with a Master'...
2,2,45.0,Male,PhD,Senior Manager,15.0,150000.0,I am a 45-year-old Senior Manager with a PhD a...
3,3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0,I am a 36-year-old female Sales Associate with...
4,4,52.0,Male,Master's,Director,20.0,200000.0,I am a 52-year-old male with over two decades ...


### Valores nulos

In [124]:
df.isnull().sum()

id                     0
Age                    5
Gender                 5
Education Level        5
Job Title              5
Years of Experience    2
Salary                 2
Description            3
dtype: int64

In [125]:
df["Description"].fillna("no-description", inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Description"].fillna("no-description", inplace=True)


In [126]:
df[df.isnull().any(axis=1)]

Unnamed: 0,id,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Description
51,51,33.0,Male,Master's,,7.0,85000.0,I am a 33-year-old Business Intelligence Analy...
60,60,51.0,Female,Master's,,23.0,170000.0,I am a 51-year-old female with a Master's degr...
139,139,43.0,Female,,Senior Product Marketing Manager,14.0,120000.0,I am a 43-year-old Senior Product Marketing Ma...
172,172,,,,,,,"As an employee, I bring a wealth of diverse ex..."
219,219,40.0,,Bachelor's,Senior Sales Representative,12.0,100000.0,I am a 40-year-old Senior Sales Representative...
221,221,,Female,Bachelor's,Junior Social Media Specialist,3.0,45000.0,I am a 31-year-old female currently working as...
225,225,40.0,,Bachelor's,Senior Marketing Manager,11.0,105000.0,I am a 40-year-old Senior Marketing Manager wi...
235,235,32.0,,Bachelor's,Junior Sales Representative,3.0,45000.0,As a 32-year-old Junior Sales Representative w...
260,260,,,,,,,"As an employee, I bring a unique blend of skil..."
261,261,37.0,Female,,Senior Financial Manager,10.0,120000.0,I am a 37-year-old Senior Financial Manager wi...


In [127]:
import re
def extract_capital_words(s):
    return " ".join(re.findall(r'\b[A-Z][a-zA-Z]*\b', s))

# 1️⃣ Eliminar primera palabra si empieza con mayúscula
df["cleaned"] = df["Description"].str.replace(r'^[A-Z][a-zA-Z]*\b\s*', '', regex=True)
# 2️⃣ Eliminar palabras después de un punto
df["cleaned"] = df["cleaned"].str.replace(r'\.\s*\b[A-Z][a-zA-Z]*\b', '.', regex=True)

# Extrar palabras con mayuscula
df["capital_words"] = df["cleaned"].apply(extract_capital_words)

df["capital_words"] = df["capital_words"].replace(r"\b(My|I)\b", "", regex=True)
df.head()

Unnamed: 0,id,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Description,cleaned,capital_words
0,0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0,I am a 32-year-old male working as a Software ...,am a 32-year-old male working as a Software En...,Software Engineer Bachelor Computer Science P...
1,1,28.0,Female,Master's,Data Analyst,3.0,65000.0,I am a 28-year-old data analyst with a Master'...,am a 28-year-old data analyst with a Master's ...,Master
2,2,45.0,Male,PhD,Senior Manager,15.0,150000.0,I am a 45-year-old Senior Manager with a PhD a...,am a 45-year-old Senior Manager with a PhD and...,Senior Manager PhD
3,3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0,I am a 36-year-old female Sales Associate with...,am a 36-year-old female Sales Associate with a...,Sales Associate Bachelor
4,4,52.0,Male,Master's,Director,20.0,200000.0,I am a 52-year-old male with over two decades ...,am a 52-year-old male with over two decades of...,Director Master


In [128]:
df["capital_words"].to_list()

['Software Engineer Bachelor Computer Science  Python Java C  Agile DevOps',
 'Master ',
 'Senior Manager PhD  ',
 'Sales Associate Bachelor',
 'Director Master ',
 'Marketing Analyst Bachelor',
 'Master Product Manager',
 'Sales Manager Bachelor  ',
 'Marketing Coordinator Bachelor ',
 'Senior Scientist PhD ',
 'Software Developer Master Computer Science Java Python JavaScript  ',
 'HR Manager Bachelor HR',
 'Financial Analyst Bachelor Finance ',
 'Master Project Manager',
 'Customer Service Representative Bachelor  ',
 'Bachelor Operations Manager',
 'Marketing Manager Master',
 'Senior Engineer PhD',
 'Data Entry Clerk Bachelor  ',
 'Sales Director Bachelor  ',
 'Business Analyst Master',
 'Master Vice President Operations ',
 'IT Support Specialist Bachelor  IT',
 'Recruiter Bachelor',
 'Master Finance Financial Manager',
 'Social Media Specialist Bachelor',
 'Master Computer Science Software Manager ',
 'Junior Developer Bachelor  ',
 'Senior Consultant PhD PhD ',
 'Master Design 

In [129]:
df["Age_New"] = df["Description"].str.extract(r'(\d+)(?:-?year-old| years old)')[0].astype(float)
df["Age_New"] = df["Age_New"].fillna(df["Age"])
df.isnull().sum()

id                     0
Age                    5
Gender                 5
Education Level        5
Job Title              5
Years of Experience    2
Salary                 2
Description            0
cleaned                0
capital_words          0
Age_New                3
dtype: int64

In [130]:
df.head()

Unnamed: 0,id,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Description,cleaned,capital_words,Age_New
0,0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0,I am a 32-year-old male working as a Software ...,am a 32-year-old male working as a Software En...,Software Engineer Bachelor Computer Science P...,32.0
1,1,28.0,Female,Master's,Data Analyst,3.0,65000.0,I am a 28-year-old data analyst with a Master'...,am a 28-year-old data analyst with a Master's ...,Master,28.0
2,2,45.0,Male,PhD,Senior Manager,15.0,150000.0,I am a 45-year-old Senior Manager with a PhD a...,am a 45-year-old Senior Manager with a PhD and...,Senior Manager PhD,45.0
3,3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0,I am a 36-year-old female Sales Associate with...,am a 36-year-old female Sales Associate with a...,Sales Associate Bachelor,36.0
4,4,52.0,Male,Master's,Director,20.0,200000.0,I am a 52-year-old male with over two decades ...,am a 52-year-old male with over two decades of...,Director Master,52.0


In [131]:
list(df["Education Level"].unique())[:-1]

["Bachelor's", "Master's", 'PhD']

In [132]:
# df[df["Gender"].isnull()]
df[df["Education Level"].isnull()]

Unnamed: 0,id,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Description,cleaned,capital_words,Age_New
139,139,43.0,Female,,Senior Product Marketing Manager,14.0,120000.0,I am a 43-year-old Senior Product Marketing Ma...,am a 43-year-old Senior Product Marketing Mana...,Senior Product Marketing Manager Master,43.0
172,172,,,,,,,"As an employee, I bring a wealth of diverse ex...","an employee, I bring a wealth of diverse exper...",,
260,260,,,,,,,"As an employee, I bring a unique blend of skil...","an employee, I bring a unique blend of skills ...",,
261,261,37.0,Female,,Senior Financial Manager,10.0,120000.0,I am a 37-year-old Senior Financial Manager wi...,am a 37-year-old Senior Financial Manager with...,Senior Financial Manager Bachelor,37.0
366,366,31.0,Female,,Junior Financial Analyst,3.0,50000.0,I am a 31-year-old female working as a Junior ...,am a 31-year-old female working as a Junior Fi...,Junior Financial Analyst Bachelor,31.0


In [133]:
levels = ["Bachelor's", "Master's", "PhD"]

# Crear un patrón regex a partir de la lista
pattern = r'(' + '|'.join(map(re.escape, levels)) + r')'

# Buscar y asignar el valor encontrado
df["New_Education_Level"] = df["Description"].str.extract(pattern, expand=False)
df["New_Education_Level"] = df["New_Education_Level"].fillna(df["Education Level"])
df.isnull().sum()

id                     0
Age                    5
Gender                 5
Education Level        5
Job Title              5
Years of Experience    2
Salary                 2
Description            0
cleaned                0
capital_words          0
Age_New                3
New_Education_Level    2
dtype: int64

In [134]:
# df[df["Job Title"].isnull()]
# # df.loc[60, "capital_words"]
levels = r"(Bachelor|Master|PhD)"

def split_job_edu(s):
    match = re.search(levels, s)
    if not match:
        return None, None  # sin educación → todo es Job_Title
    
    edu = match.group(1)
    start, end = match.span()
    
    # Si el nivel educativo está al inicio
    if start == 0:
        job = s[end:].strip()
    else:
        job = s[:start].strip()
    
    return edu, job

df[["New_Education_Level", "New_Job_Title"]] = df["capital_words"].apply(
    lambda x: pd.Series(split_job_edu(x))
)
df.head()

Unnamed: 0,id,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Description,cleaned,capital_words,Age_New,New_Education_Level,New_Job_Title
0,0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0,I am a 32-year-old male working as a Software ...,am a 32-year-old male working as a Software En...,Software Engineer Bachelor Computer Science P...,32.0,Bachelor,Software Engineer
1,1,28.0,Female,Master's,Data Analyst,3.0,65000.0,I am a 28-year-old data analyst with a Master'...,am a 28-year-old data analyst with a Master's ...,Master,28.0,Master,
2,2,45.0,Male,PhD,Senior Manager,15.0,150000.0,I am a 45-year-old Senior Manager with a PhD a...,am a 45-year-old Senior Manager with a PhD and...,Senior Manager PhD,45.0,PhD,Senior Manager
3,3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0,I am a 36-year-old female Sales Associate with...,am a 36-year-old female Sales Associate with a...,Sales Associate Bachelor,36.0,Bachelor,Sales Associate
4,4,52.0,Male,Master's,Director,20.0,200000.0,I am a 52-year-old male with over two decades ...,am a 52-year-old male with over two decades of...,Director Master,52.0,Master,Director


In [136]:
df.iloc[[51, 60, 172, 260, 332]]

Unnamed: 0,id,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Description,cleaned,capital_words,Age_New,New_Education_Level,New_Job_Title
51,51,33.0,Male,Master's,,7.0,85000.0,I am a 33-year-old Business Intelligence Analy...,am a 33-year-old Business Intelligence Analyst...,Business Intelligence Analyst Master BI SQL Ta...,33.0,Master,Business Intelligence Analyst
60,60,51.0,Female,Master's,,23.0,170000.0,I am a 51-year-old female with a Master's degr...,am a 51-year-old female with a Master's degree...,Master Director Operations,51.0,Master,Director Operations
172,172,,,,,,,"As an employee, I bring a wealth of diverse ex...","an employee, I bring a wealth of diverse exper...",,,,
260,260,,,,,,,"As an employee, I bring a unique blend of skil...","an employee, I bring a unique blend of skills ...",,,,
332,332,45.0,Female,PhD,,16.0,160000.0,I am a 45-year-old Senior UX Designer with a P...,am a 45-year-old Senior UX Designer with a PhD...,Senior UX Designer PhD,45.0,PhD,Senior UX Designer


In [143]:
import numpy as np

df["New_Job_Title"] = np.where(
    df["Job Title"].notna(),
    df["Job Title"],   # si difiere, lo dejamos
    df["New_Job_Title"]        # si es igual o NaN → usamos Job Title
)
df

Unnamed: 0,id,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Description,cleaned,capital_words,Age_New,New_Education_Level,New_Job_Title
0,0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0,I am a 32-year-old male working as a Software ...,am a 32-year-old male working as a Software En...,Software Engineer Bachelor Computer Science P...,32.0,Bachelor,Software Engineer
1,1,28.0,Female,Master's,Data Analyst,3.0,65000.0,I am a 28-year-old data analyst with a Master'...,am a 28-year-old data analyst with a Master's ...,Master,28.0,Master,Data Analyst
2,2,45.0,Male,PhD,Senior Manager,15.0,150000.0,I am a 45-year-old Senior Manager with a PhD a...,am a 45-year-old Senior Manager with a PhD and...,Senior Manager PhD,45.0,PhD,Senior Manager
3,3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0,I am a 36-year-old female Sales Associate with...,am a 36-year-old female Sales Associate with a...,Sales Associate Bachelor,36.0,Bachelor,Sales Associate
4,4,52.0,Male,Master's,Director,20.0,200000.0,I am a 52-year-old male with over two decades ...,am a 52-year-old male with over two decades of...,Director Master,52.0,Master,Director
...,...,...,...,...,...,...,...,...,...,...,...,...,...
370,370,35.0,Female,Bachelor's,Senior Marketing Analyst,8.0,85000.0,As a 35-year-old Senior Marketing Analyst with...,a 35-year-old Senior Marketing Analyst with a ...,Senior Marketing Analyst Bachelor,35.0,Bachelor,Senior Marketing Analyst
371,371,43.0,Male,Master's,Director of Operations,19.0,170000.0,I am a 43-year-old male with a Master's degree...,am a 43-year-old male with a Master's degree i...,Master Business Administration Director Operat...,43.0,Master,Director of Operations
372,372,29.0,Female,Bachelor's,Junior Project Manager,2.0,40000.0,As a 29-year-old female Junior Project Manager...,a 29-year-old female Junior Project Manager wi...,Junior Project Manager Bachelor,29.0,Bachelor,Junior Project Manager
373,373,34.0,Male,Bachelor's,Senior Operations Coordinator,7.0,90000.0,As a Senior Operations Coordinator with a Bach...,a Senior Operations Coordinator with a Bachelo...,Senior Operations Coordinator Bachelor,34.0,Bachelor,Senior Operations Coordinator


In [144]:
df[df["New_Job_Title"] != df["Job Title"]]

Unnamed: 0,id,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Description,cleaned,capital_words,Age_New,New_Education_Level,New_Job_Title
51,51,33.0,Male,Master's,,7.0,85000.0,I am a 33-year-old Business Intelligence Analy...,am a 33-year-old Business Intelligence Analyst...,Business Intelligence Analyst Master BI SQL Ta...,33.0,Master,Business Intelligence Analyst
60,60,51.0,Female,Master's,,23.0,170000.0,I am a 51-year-old female with a Master's degr...,am a 51-year-old female with a Master's degree...,Master Director Operations,51.0,Master,Director Operations
172,172,,,,,,,"As an employee, I bring a wealth of diverse ex...","an employee, I bring a wealth of diverse exper...",,,,
260,260,,,,,,,"As an employee, I bring a unique blend of skil...","an employee, I bring a unique blend of skills ...",,,,
332,332,45.0,Female,PhD,,16.0,160000.0,I am a 45-year-old Senior UX Designer with a P...,am a 45-year-old Senior UX Designer with a PhD...,Senior UX Designer PhD,45.0,PhD,Senior UX Designer
