### Importaciones de Bibliotecas

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import pylab as plt
import seaborn as sns

### Carga de los datos

In [2]:
salaries = pd.read_csv('data/salaries_data.csv')
testeo = pd.read_csv('data/testeo.csv')

In [3]:
salarios_usd = salaries['salary_in_usd']

In [4]:
df = salaries.drop(['salary','salary_currency','salary_in_usd'], axis=1)

In [5]:
total = pd.concat([df,testeo], axis=0) # Concatenamos los dos dataframes para que tengan los mismos cambios en los datos

## Proceso de Limpieza y transformacion

In [6]:
from sklearn.preprocessing import OrdinalEncoder

# Creamos el codificador indicandole el orden de la variables
encoder = OrdinalEncoder(categories=[['EN','MI','SE','EX']])

# Ajustamos el codificador con la variable experience_level y la transformamos
encoder.fit(total[["experience_level"]])
total["experience_level_encoded"] = encoder.transform(total[["experience_level"]])

In [7]:
total.drop(['experience_level'], axis=1, inplace=True)

In [8]:
# Creamos el codificador indicandole el orden de la variables
encoder = OrdinalEncoder(categories=[['PT','FL','FT','CT']])

# Ajustamos el codificador con la variable experience_level y la transformamos
encoder.fit(total[["employment_type"]])
total["employment_type_encoded"] = encoder.transform(total[["employment_type"]])
total.drop(['employment_type'], axis=1, inplace=True)

In [9]:
total = pd.get_dummies(total, columns=['company_size'])

In [42]:
dictio = {'US': 20934.6 , 'FR': 2582.5,  'GR': 185.5 ,  'LU': 70.9,  'SI': 57.3 , 'DE': 3847.2,  'IN': 2689.9 ,'GB': 2622.4 , 'PK' : 263.9,  'MD' : 4.3,  'JP' : 4872.4, 'CA': 1545.4, 'AS': 0.8,  'IE': 383.4, 'AE': 421.1, 'MX': 1046.6,  'VN' : 341.2, 'BE': 531.2, 'KE': 97.9, 'ES': 1236.3, 'CH': 703.1, 'CL': 224.6,  'CN': 14720.9 , 'DK': 306.3, 'TR': 717.9, 'NZ': 206.7, 'PL': 595.6, 'UA': 153.1, 'AU': 1358.3, 'NG': 448.1, 'EE': 29.2, 'CZ': 257.9, 'AT': 455.5, 'BR': 1363.6, 'DZ': 146.4, 'IR': 439.5, 'NL': 902.3, 'HU': 166.2, 'PT': 229.0, 'HN': 24.7, 'MT': 16.9, 'RO': 236.5, 'SG': 340.4, 'IT': 1595.8, 'HR': 54.9, 'IQ': 171.0, 'IL': 387.2, 'RU': 1481.4, 'CO': 314.4, 'MY': 336.4, 'JE': 5.569, 'BG': 84.06, 'HK' : 369.2, 'PR':106.5,'RS':63.08, 'BO':40.41, 'PH': 394.1,'AR': 487.2,'TN':46.69 }


In [11]:
total.company_location = total.company_location.apply(lambda x: dictio[x])

In [43]:
total.employee_residence = total.employee_residence.apply(lambda x: dictio[x])

In [44]:
def agrupar_titulos(titulo):
    if 'data engineer' in titulo.lower():
        return 'Data Engineer'
    elif 'data scientist' in titulo.lower():
        return 'Data Scientist'
    elif 'data analyst' in titulo.lower():
        return 'Data Analyst'
    elif 'machine learning' or 'ml' in titulo.lower():
        return 'Machine Learning'
    else:
        return titulo

In [45]:
total.job_title = total.job_title.apply(agrupar_titulos)

In [46]:
total = pd.get_dummies(total, columns=['job_title'])
total.head()

Unnamed: 0,work_year,employee_residence,remote_ratio,company_location,experience_level_encoded,employment_type_encoded,company_size_L,company_size_M,company_size_S,job_title_Data Analyst,job_title_Data Engineer,job_title_Data Scientist,job_title_Machine Learning
0,2022,20934.6,100,20934.6,2.0,2.0,0,1,0,0,1,0,0
1,2022,20934.6,100,20934.6,2.0,2.0,0,1,0,0,1,0,0
2,2021,20934.6,100,20934.6,1.0,2.0,0,1,0,1,0,0,0
3,2021,20934.6,100,20934.6,1.0,3.0,1,0,0,0,0,0,1
4,2021,236.5,0,20934.6,1.0,2.0,1,0,0,0,1,0,0


In [47]:
prueba = total.iloc[:500]
testeo = total.iloc[500:]

In [48]:
prueba = prueba.join(salarios_usd)  # uniendo el DataFrame df2 al DataFrame df1

## Separacion de variables para entrenar el modelo

In [49]:
X = prueba

y = salarios_usd 

In [51]:
# train_test_split

from sklearn.model_selection import train_test_split as tts  # el alias es cosa mia

X_train, X_test, y_train, y_test  = tts(X, y, train_size=0.8, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((400, 14), (100, 14), (400,), (100,))

### H20 como modelo de ML para entrenar y testear

In [52]:
import h2o

from h2o.automl import H2OAutoML

In [53]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.361-b09, mixed mode)
  Starting server from C:\Users\Carolina\AppData\Local\Programs\Python\Python310\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Carolina\AppData\Local\Temp\tmpbmx3w7y6
  JVM stdout: C:\Users\Carolina\AppData\Local\Temp\tmpbmx3w7y6\h2o_Carolina_started_from_python.out
  JVM stderr: C:\Users\Carolina\AppData\Local\Temp\tmpbmx3w7y6\h2o_Carolina_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,10 secs
H2O_cluster_timezone:,Europe/Paris
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.1
H2O_cluster_version_age:,26 days
H2O_cluster_name:,H2O_from_python_Carolina_obopkr
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,420.9 Mb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [54]:
h2train=h2o.H2OFrame(prueba)


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [55]:
h2test=h2o.H2OFrame(testeo)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [56]:
X=[c for c in h2train.columns if c!='salary_in_usd']
y = 'salary_in_usd'

In [57]:
# inicia auto-machine-learning

automl=H2OAutoML(max_models=20,
                 seed=42,   # random_state
                 max_runtime_secs=300,
                 sort_metric='RMSE')


In [58]:
# entrena

automl.train(x=X,
             y=y,
             training_frame=h2train)

AutoML progress: |
11:09:10.370: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,40.0,40.0,13794.0,7.0,7.0,7.0,19.0,27.0,22.725

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
mae,33259.598,3430.7266,35879.094,31026.629,28762.818,33503.164,37126.285
mean_residual_deviance,2378940420.0,856999680.0,3405194500.0,1560092420.0,1508929410.0,2365538820.0,3054947330.0
mse,2378940420.0,856999680.0,3405194500.0,1560092420.0,1508929410.0,2365538820.0,3054947330.0
r2,0.5126674,0.0943661,0.4059536,0.6291644,0.5326578,0.5678436,0.4277178
residual_deviance,2378940420.0,856999680.0,3405194500.0,1560092420.0,1508929410.0,2365538820.0,3054947330.0
rmse,48121.074,8895.411,58354.043,39498.004,38844.94,48636.805,55271.58
rmsle,0.5048686,0.0705851,0.4253705,0.5902686,0.4454094,0.5057022,0.5575925

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2023-03-07 11:09:20,0.958 sec,0.0,69544.1020082,52000.59792,4836382124.133342
,2023-03-07 11:09:20,0.978 sec,5.0,55523.7786046,39483.5811406,3082889990.5332613
,2023-03-07 11:09:20,1.003 sec,10.0,48592.2322949,33445.8396172,2361205039.404744
,2023-03-07 11:09:20,1.027 sec,15.0,45331.5804425,31020.0409531,2054952185.4191496
,2023-03-07 11:09:20,1.045 sec,20.0,43485.1493241,29491.860625,1890958211.7361453
,2023-03-07 11:09:20,1.061 sec,25.0,42403.8942458,28660.3283984,1798090247.2105892
,2023-03-07 11:09:20,1.082 sec,30.0,41455.0445361,27928.7258711,1718520717.4934838
,2023-03-07 11:09:20,1.100 sec,35.0,40801.793849,27458.2525684,1664786381.2968922
,2023-03-07 11:09:20,1.119 sec,40.0,40364.2702735,27028.4353691,1629274314.7104878

variable,relative_importance,scaled_importance,percentage
employee_residence,3189853913088.0,1.0,0.3820486
company_location,1630948163584.0,0.5112924,0.1953385
experience_level_encoded,1444751081472.0,0.4529208,0.1730377
job_title_Machine Learning,356635967488.0,0.1118032,0.0427143
company_size_L,309436252160.0,0.0970064,0.0370612
job_title_Data Analyst,306839126016.0,0.0961922,0.0367501
remote_ratio,305856217088.0,0.0958841,0.0366324
work_year,296106590208.0,0.0928276,0.0354647
job_title_Data Engineer,168821014528.0,0.0529244,0.0202197
job_title_Data Scientist,146188107776.0,0.0458291,0.0175089


In [59]:
# prediciones del lider

y_pred=automl.leader.predict(h2test)

gbm prediction progress: |███████████████████████████████████████████████████████| (done) 100%


In [60]:
muestra = pd.read_csv('data/muestra.csv')
muestra

Unnamed: 0,id,salary_in_usd
0,0,26352
1,1,40574
2,2,68759
3,3,74733
4,4,32611
...,...,...
102,102,83373
103,103,57827
104,104,31053
105,105,39725


### Subir los resultados a un .CSV

In [61]:
pred_df = y_pred.as_data_frame()

In [62]:
muestra['salary_in_usd'] = pred_df 

In [63]:
muestra.to_csv('data/hsegundointento.csv', index=False)