# Transformations

In [1]:
import pandas as pd
import numpy as np
import sys
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer, MinMaxScaler, OneHotEncoder
import pickle
import yaml

In [2]:
# Render up to 35 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 35)

# Load the project parameters (data paths, split settings, artifact folders).
with open("../params.yaml", "r") as params_file:
    config = yaml.safe_load(params_file)

# Put the project's source folder first on the import path; the local
# `functions` module can only be imported once this has been done.
src_dir = os.path.abspath(config['src'])
sys.path.insert(0, src_dir)
import functions as fn

In [3]:
# Read the cleaned and grouped dataset from the configured folder and display it.
cleaned_csv_path = config['data']['cleaned']+'data_cleaned_nd_and_grouped.csv'
data = pd.read_csv(cleaned_csv_path)
data

Unnamed: 0,company,job_title,location,job_description,company_size,company_type,company_sector,company_industry,company_revenue,job_simpl,seniority,state,salary_estimate,company_founded,hourly,rating,python_yn,spark_yn,azure_yn,aws_yn,excel_yn,machine_learning_yn,description_len,company_age,size,type,sector,revenue,zone
0,Microsoft,Data & Applied Scientist,"Redmond, WA",Microsoft 365 is a key part of the company’s c...,10000+ Employees,Company - Public,Information Technology,Computer Hardware Development,$10+ billion (USD),data scientist,junior,WA,123486.0,1975.0,0.0,4.40,1.0,0.0,0.0,1.0,0.0,1.0,359.0,47.0,Huge company,Company - Public,IT and Telecommunications,$10+ billion (USD),Western
1,UT Southwestern Medical Center,Data Scientist or Bioinformatician (remote),Remote,Center Information:\nThe Quantitative Biomedic...,10000+ Employees,Hospital,Healthcare,Health Care Services & Hospitals,$1 to $5 billion (USD),data scientist,mid,Remote,93500.0,1943.0,0.0,4.00,1.0,0.0,0.0,0.0,0.0,1.0,267.0,79.0,Huge company,Other institutions,Healthcare,$1 to $10 billion (USD),Remote
2,Notion,"Data Scientist, Growth","New York, NY",About Us:\nWe're on a mission to make it possi...,201 to 500 Employees,Company - Private,Information Technology,Enterprise Software & Network Solutions,Unknown / Non-Applicable,data scientist,Senior,NY,137853.0,2016.0,0.0,4.90,1.0,0.0,0.0,0.0,0.0,0.0,589.0,6.0,Medium-sized company,Company - Private,IT and Telecommunications,Unknown / Non-Applicable,Northeastern
3,Net2Aspire,Jr. Data Scientist,Remote,? Apply Statistical and Machine Learning metho...,Unknown,Company - Public,Unknown,Unknown,Unknown / Non-Applicable,data scientist,junior,Remote,72500.0,1950.0,0.0,3.94,0.0,0.0,0.0,0.0,0.0,1.0,132.0,72.0,Unknown size,Company - Public,Unknown,Unknown / Non-Applicable,Remote
4,Ntropy Network,Data Scientist,Remote,"Over the last few decades, technological innov...",1 to 50 Employees,Company - Private,Unknown,Unknown,Unknown / Non-Applicable,data scientist,mid,Remote,155000.0,2011.0,0.0,4.07,1.0,0.0,0.0,1.0,0.0,0.0,522.0,11.0,Small company,Company - Private,Unknown,Unknown / Non-Applicable,Remote
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,CVS Health,"Senior Machine Learning Engineer ( Python , ML...",Connecticut,Analytics & Behavior Change is an innovation e...,10000+ Employees,Company - Public,Healthcare,Health Care Services & Hospitals,$10+ billion (USD),machine learning engineer,Senior,CT,135000.0,1963.0,0.0,3.10,1.0,0.0,0.0,1.0,0.0,1.0,556.0,59.0,Huge company,Company - Public,Healthcare,$10+ billion (USD),Northeastern
743,Morgan Stanley,Machine Learning Researcher,"New York, NY",Machine Learning Researcher\nJob Number:\n3227...,10000+ Employees,Company - Public,Financial Services,Investment & Asset Management,$10+ billion (USD),machine learning engineer,Senior,NY,143796.0,1935.0,0.0,4.00,1.0,0.0,0.0,0.0,1.0,1.0,356.0,87.0,Huge company,Company - Public,Financial and housing services,$10+ billion (USD),Northeastern
744,MIT Lincoln Laboratory,Machine Learning Software Developer,"Lexington, MA",Laboratory Description\nMIT Lincoln Laboratory...,1001 to 5000 Employees,Nonprofit Organization,Aerospace & Defense,Aerospace & Defense,Unknown / Non-Applicable,machine learning engineer,mid,MA,117724.0,1951.0,0.0,4.30,1.0,0.0,0.0,0.0,0.0,1.0,288.0,71.0,Large company,Other institutions,Other sectors,Unknown / Non-Applicable,Northeastern
745,Morgan Stanley,Machine Learning Researcher,"New York, NY",Machine Learning Researcher\nJob Number:\n3227...,10000+ Employees,Company - Public,Financial Services,Investment & Asset Management,$10+ billion (USD),machine learning engineer,Senior,NY,143796.0,1935.0,0.0,4.00,1.0,0.0,0.0,0.0,1.0,1.0,356.0,87.0,Huge company,Company - Public,Financial and housing services,$10+ billion (USD),Northeastern


In [4]:
# List the column labels of the loaded frame; the features are picked from these below.
data.columns

Index(['company', 'job_title', 'location', 'job_description', 'company_size',
       'company_type', 'company_sector', 'company_industry', 'company_revenue',
       'job_simpl', 'seniority', 'state', 'salary_estimate', 'company_founded',
       'hourly', 'rating', 'python_yn', 'spark_yn', 'azure_yn', 'aws_yn',
       'excel_yn', 'machine_learning_yn', 'description_len', 'company_age',
       'size', 'type', 'sector', 'revenue', 'zone'],
      dtype='object')

**We select the columns whose cardinality we have reduced, that show no multicollinearity, and that make sense to use as features.**

In [5]:
# Target first, then the categorical descriptors, then the binary skill flags.
model_columns = [
    'salary_estimate',
    'job_simpl', 'seniority', 'company_age',
    'size', 'type', 'sector', 'revenue', 'zone',
    'python_yn', 'spark_yn', 'azure_yn', 'aws_yn', 'excel_yn', 'machine_learning_yn',
]
selected = data[model_columns]

## Remove outliers from numericals

In [6]:
# Split `selected` into two frames with the project helper. Going by the variable
# names and the shapes printed further down, the first frame holds the numerical
# columns and the second the categorical ones.
selected_num, selected_cat = fn.num_cat_splitter(selected)

In [7]:
# Pass the numerical frame through the project's outlier helper; the shape
# check further down shows that the result has fewer rows than `selected_cat`.
# NOTE(review): the later `pd.concat(..., axis=1)` pairs rows by index label, so
# this helper presumably keeps the original row labels — confirm in `functions`.
selected_num_no_out=fn.outlier_remover(selected_num)

In [8]:
# Compare row counts before and after outlier removal (one shape per line).
print(selected_cat.shape, selected_num_no_out.shape, sep='\n')

(747, 7)
(726, 8)


In [9]:
# `pd.concat(..., axis=1)` pairs rows by index label, not by position. If
# `fn.outlier_remover` hands back a frame whose index was reset, every row after
# the first removed outlier would silently receive another row's categorical
# values. Verify that the surviving numerical rows are an unchanged,
# label-preserving subset of `selected_num` before recombining.
kept_labels = selected_num_no_out.index
labels_preserved = bool(
    kept_labels.isin(selected_num.index).all()
    and selected_num.loc[kept_labels, selected_num_no_out.columns].equals(selected_num_no_out)
)
if not labels_preserved:
    raise ValueError(
        "fn.outlier_remover did not preserve the original row labels; "
        "concatenating on the index would pair numerical values with the "
        "categorical values of a different row."
    )

# Rows removed as outliers have no numerical values after the concat, so
# `dropna` discards them (together with any other row holding a missing value).
df_no_out = pd.concat([selected_cat, selected_num_no_out], axis=1)
df_no_out = df_no_out.dropna().reset_index(drop=True)
df_no_out

Unnamed: 0,job_simpl,seniority,size,type,sector,revenue,zone,salary_estimate,company_age,python_yn,spark_yn,azure_yn,aws_yn,excel_yn,machine_learning_yn
0,data scientist,junior,Huge company,Company - Public,IT and Telecommunications,$10+ billion (USD),Western,123486.0,47.0,1.0,0.0,0.0,1.0,0.0,1.0
1,data scientist,mid,Huge company,Other institutions,Healthcare,$1 to $10 billion (USD),Remote,93500.0,79.0,1.0,0.0,0.0,0.0,0.0,1.0
2,data scientist,Senior,Medium-sized company,Company - Private,IT and Telecommunications,Unknown / Non-Applicable,Northeastern,137853.0,6.0,1.0,0.0,0.0,0.0,0.0,0.0
3,data scientist,junior,Unknown size,Company - Public,Unknown,Unknown / Non-Applicable,Remote,72500.0,72.0,0.0,0.0,0.0,0.0,0.0,1.0
4,data scientist,mid,Small company,Company - Private,Unknown,Unknown / Non-Applicable,Remote,155000.0,11.0,1.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
721,machine learning engineer,Senior,Huge company,Company - Private,Industry,$10+ billion (USD),Northeastern,135000.0,59.0,1.0,0.0,0.0,1.0,0.0,1.0
722,data scientist,Senior,Huge company,Company - Public,Healthcare,$10+ billion (USD),Northeastern,143796.0,87.0,1.0,0.0,0.0,0.0,1.0,1.0
723,machine learning engineer,Senior,Medium-sized company,Company - Private,IT and Telecommunications,$5 to $100 million (USD),Western,117724.0,71.0,1.0,0.0,0.0,0.0,0.0,1.0
724,machine learning engineer,junior,Huge company,Other institutions,Financial and housing services,$10+ billion (USD),Southern,143796.0,87.0,1.0,0.0,0.0,0.0,1.0,1.0


In [10]:
# Persist the outlier-free selection next to the other cleaned data files.
no_outliers_csv_path = config['data']['cleaned']+'data_selected_no_outliers.csv'
df_no_out.to_csv(no_outliers_csv_path, index=False)

## X-Y split

In [11]:
# The salary estimate is the prediction target; every other column is a feature.
target_column = 'salary_estimate'
X = df_no_out.drop(columns=target_column)
y = df_no_out[target_column]

In [12]:
# Test fraction and seed both come from the shared parameter file.
split_params = config['x-y-split']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=split_params['test_size'],
    random_state=split_params['random_state'],
)

**We save y_train and y_test so that they can be used in the models notebook**

In [13]:
# Write each target split to the cleaned-data folder, without the index column.
target_outputs = {'y_test.csv': y_test, 'y_train.csv': y_train}
for target_filename, target_split in target_outputs.items():
    target_split.to_csv(config['data']['cleaned']+target_filename, index=False)

In [14]:
# Apply the same project splitter to the train and the test features, in that order.
(X_train_num, X_train_cat), (X_test_num, X_test_cat) = (
    fn.num_cat_splitter(feature_split) for feature_split in (X_train, X_test)
)

## Power Transformer

In [15]:
# Fit on the training numericals only; the test split is transformed with the
# parameters learned from the training data.
transformer = PowerTransformer()
transformer.fit(X_train_num)

# `transform` returns plain arrays, so the column labels are put back afterwards.
X_train_num_trans = transformer.transform(X_train_num)
X_train_num_trans_df = pd.DataFrame(X_train_num_trans, columns=X_train_num.columns)

X_test_num_trans = transformer.transform(X_test_num)
X_test_num_trans_df = pd.DataFrame(X_test_num_trans, columns=X_test_num.columns)

# Store the fitted transformer in the configured folder.
power_transformer_path = config['transformers']+'power_transformer.pkl'
with open(power_transformer_path, 'wb') as pkl_file:
    pickle.dump(transformer, pkl_file)

## MinMax Scaler

In [16]:
# Learn the min/max range from the power-transformed training data only.
scaler = MinMaxScaler()
scaler.fit(X_train_num_trans_df)

# Scale both splits with the fitted scaler and restore the column labels.
X_train_num_scal = scaler.transform(X_train_num_trans_df)
X_train_scaled = pd.DataFrame(X_train_num_scal, columns=X_train_num_trans_df.columns)

X_test_num_scal = scaler.transform(X_test_num_trans_df)
X_test_scaled = pd.DataFrame(X_test_num_scal, columns=X_test_num_trans_df.columns)

# Store the fitted scaler in the configured folder.
minmax_scaler_path = config['scalers']+'minmax_scaler.pkl'
with open(minmax_scaler_path, 'wb') as pkl_file:
    pickle.dump(scaler, pkl_file)

## One Hot Encoder

In [17]:
# Fit on the training categoricals only; `drop='first'` omits the first level of
# every feature from the encoded output.
encoder = OneHotEncoder(drop='first')
encoder.fit(X_train_cat)

# Both splits share the same encoded column labels, so compute them once.
encoded_columns = encoder.get_feature_names_out()

# The encoder output is densified with `.toarray()` before building the frames.
X_train_cat_np = encoder.transform(X_train_cat).toarray()
X_train_cat_encoded = pd.DataFrame(X_train_cat_np, columns=encoded_columns)

X_test_cat_np = encoder.transform(X_test_cat).toarray()
X_test_cat_encoded = pd.DataFrame(X_test_cat_np, columns=encoded_columns)

# Store the fitted encoder in the configured folder.
onehot_encoder_path = config['encoders']+'onehot_encoder.pkl'
with open(onehot_encoder_path, 'wb') as pkl_file:
    pickle.dump(encoder, pkl_file)

**We save our X_train and X_test after applying the transformations**

In [18]:
# Put the scaled numerical features and the one-hot encoded categorical features
# side by side. Both frames were built from plain arrays and therefore carry a
# fresh 0..n-1 index, so the rows pair up positionally.
X_train_transformed = pd.concat([X_train_scaled, X_train_cat_encoded], axis=1)

# The saved features must line up row-for-row with the saved training target.
assert len(X_train_transformed) == len(y_train), "X_train and y_train row counts differ"

X_train_transformed.to_csv(config['data']['cleaned']+'X_train_transformed.csv', index=False)

# The preview goes last: a notebook cell only renders its final expression, so a
# `.head()` placed before `to_csv` is evaluated and thrown away.
X_train_transformed.head()

In [19]:
# Put the scaled numerical features and the one-hot encoded categorical features
# side by side. Both frames were built from plain arrays and therefore carry a
# fresh 0..n-1 index, so the rows pair up positionally.
X_test_transformed = pd.concat([X_test_scaled, X_test_cat_encoded], axis=1)

# The saved features must line up row-for-row with the saved test target.
assert len(X_test_transformed) == len(y_test), "X_test and y_test row counts differ"

X_test_transformed.to_csv(config['data']['cleaned']+'X_test_transformed.csv', index=False)

# The preview goes last: a notebook cell only renders its final expression, so a
# `.head()` placed before `to_csv` is evaluated and thrown away.
X_test_transformed.head()