In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

import pickle
import joblib

# Cargamos el dataset y modificamos la columna de Education Level

In [2]:
# Read the salary data and discard every row that has at least one missing value.
dataset = pd.read_csv('dataset/Salary_Data.csv').dropna()

In [3]:
# Collapse the spelling variants of "Education Level" into four Spanish labels.
# Values are lower-cased first so that differences in capitalisation do not matter.
dataset["Education Level"] = (
    dataset["Education Level"]
    .str.lower()
    .map({
        # Raw spellings handled by this notebook.
        "bachelor's degree": "Grado",
        "bachelor's": "Grado",
        "master's degree": "Master",
        "master's": "Master",
        "phd": "Doctorado",
        "high school": "Bachillerato",
        # Labels that are already translated. Without these entries, running this
        # cell a second time would lower-case "Grado" to "grado", miss the lookup
        # and turn the whole column into "nan".
        "grado": "Grado",
        "master": "Master",
        "doctorado": "Doctorado",
        "bachillerato": "Bachillerato",
    })
    # Any value absent from the mapping is NaN at this point; astype(str) stores it
    # as the literal string "nan", so the column holds only strings.
    .astype(str)
)

In [4]:
# Features: two text columns and two numeric columns. Target: the salary.
X = dataset.loc[:, ['Education Level', 'Job Title', 'Age', 'Years of Experience']]
y = dataset.loc[:, 'Salary']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

## OneHotEncoder
OneHotEncoder convierte variables categóricas en una representación numérica binaria (1 o 0) para su uso en algoritmos de aprendizaje automático.

In [5]:
# Regressor with a fixed seed so that refitting gives the same forest.
model = RandomForestRegressor(random_state=42, n_estimators=100)

# One-hot encode the two text columns and pass the remaining (numeric) columns
# through untouched. handle_unknown='ignore' lets prediction proceed when a
# category was not present during fit.
preprocessor = ColumnTransformer(
    remainder='passthrough',
    transformers=[
        (
            'edu_job',
            OneHotEncoder(handle_unknown='ignore'),
            ['Education Level', 'Job Title'],
        ),
    ],
)

# Bundle preprocessing and the model so both are fitted and persisted together.
pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('model', model)])
pipeline.fit(X_train, y_train)

# Predictions on the held-out rows, evaluated in the next cell.
y_pred = pipeline.predict(X_test)

# Persist the fitted pipeline; joblib.dump returns the list of files it wrote.
joblib.dump(pipeline, 'salary_prediction_model.joblib')

['salary_prediction_model.joblib']

## Revisar predicción

In [6]:
# Score the held-out predictions.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

# One print call, one metric per line.
print(f'Mean Squared Error: {mse}', f'R^2 Score: {r2}', sep='\n')


Mean Squared Error: 78014469.070767
R^2 Score: 0.9726405106051327


## Pruebas de Predicción

In [7]:
# Sanity check: predict the salary for the first training row.
# A list indexer ([[0]]) returns a one-row DataFrame that keeps each column's dtype.
# Building the frame from X_train.iloc[0] instead goes through an object-dtype
# Series, which would turn the numeric columns into object columns.
new_data = X_train.iloc[[0]]

# Show the row being predicted.
print(X_train.iloc[0])

prediction = pipeline.predict(new_data)

print(f'Predicted Salary: {prediction[0]}')

Education Level                      Master
Job Title              Senior HR Generalist
Age                                    39.0
Years of Experience                    10.0
Name: 3426, dtype: object
Predicted Salary: 105000.0


In [8]:

# A hand-written profile, given as a single record, in the same column layout
# the pipeline was trained on.
new_data = pd.DataFrame([
    {
        'Education Level': 'Bachillerato',
        'Job Title': 'Software Engineer',
        'Age': 29,
        'Years of Experience': 7,
    }
])

# Run the fitted pipeline on the sample to get the salary estimate.
predicted_salary = pipeline.predict(new_data)

# Show the result (a one-element array).
print(f'Predicted Salary: {predicted_salary}')


Predicted Salary: [164313.82183908]


# Crear Pipeline Pickle


In [9]:
# Serialize the fitted pipeline; the file is opened in binary write mode.
with open('pickle/pipeline.pickle', mode='wb') as pipeline_file:
    pickle.dump(pipeline, pipeline_file)

## Prueba del Pickle

In [10]:
# Read the pipeline back from disk. pickle.load can execute arbitrary code,
# so it should only be used on files produced by this notebook.
with open('pickle/pipeline.pickle', mode='rb') as pipeline_file:
    loaded_pipe = pickle.load(pipeline_file)

# Check the restored pipeline by predicting the same sample again.
prediction = loaded_pipe.predict(new_data)
prediction

array([164313.82183908])

# Exportar títulos de trabajo

In [11]:
# Number of rows per job title, displayed as the cell output.
dataset.groupby(by='Job Title').size()


Job Title
Account Manager               1
Accountant                    1
Administrative Assistant      2
Back end Developer          244
Business Analyst              2
                           ... 
UX Designer                   1
UX Researcher                 1
VP of Finance                 1
VP of Operations              1
Web Developer                87
Length: 191, dtype: int64

In [12]:
# Row count for each job title (a Series indexed by title).
result_groupby = dataset.groupby(by='Job Title').size()

# Store the counts in a pickle file.
with open('pickle/jobs.pickle', mode='wb') as jobs_file:
    pickle.dump(result_groupby, jobs_file)