In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [2]:
# Load the salary dataset; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv("FoAI_A2_data_4.6k.csv")
# Preview the first rows as a quick sanity check of the parsed columns.
df.head()


Unnamed: 0,work_year,experience_level,employment_type,job_title,salary,salary_currency,salary_in_usd,employee_residence,remote_ratio,company_location,company_size
0,2020,EX,FT,Data Scientist,300000,USD,300000,US,100,US,L
1,2020,MI,FT,Machine Learning Infrastructure Engineer,44000,EUR,50180,PT,0,PT,M
2,2020,MI,FT,Lead Data Analyst,87000,USD,87000,US,100,US,L
3,2020,MI,FT,Data Engineer,88000,GBP,112872,GB,50,GB,L
4,2020,SE,FT,Lead Data Scientist,190000,USD,190000,US,100,US,S


In [3]:
# Summarize the frame: per-column dtype and non-null count, plus memory usage.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4644 entries, 0 to 4643
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   work_year           4644 non-null   int64 
 1   experience_level    4644 non-null   object
 2   employment_type     4644 non-null   object
 3   job_title           4644 non-null   object
 4   salary              4644 non-null   int64 
 5   salary_currency     4644 non-null   object
 6   salary_in_usd       4644 non-null   int64 
 7   employee_residence  4644 non-null   object
 8   remote_ratio        4644 non-null   int64 
 9   company_location    4644 non-null   object
 10  company_size        4644 non-null   object
dtypes: int64(4), object(7)
memory usage: 399.2+ KB


In [4]:
# Feature matrix: only these columns of df are exposed to the model.
# The list selector makes the result a DataFrame rather than a Series.
X = df.loc[
    :,
    ["experience_level", "employment_type", "job_title", "remote_ratio", "company_size"],
]

# Regression target: the salary_in_usd column.
y = df["salary_in_usd"]


In [5]:
# Column groups, split by how each one is prepared for the regressor.
numeric_features = ["remote_ratio"]
categorical_features = ["experience_level", "employment_type", "job_title", "company_size"]

# One-hot encode the categorical columns and forward the numeric column
# unchanged. handle_unknown="ignore" is set so that a category not seen
# during fit does not raise at transform time.
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)


In [6]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the
# partition identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)


In [7]:
# Chain preprocessing and the regressor so the same transformations are
# applied at fit time and at predict time.
model = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("regressor", LinearRegression()),
    ]
)

# Fit on the training split only; the fitted pipeline is the cell's output.
model.fit(X_train, y_train)


0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [8]:
# Score the fitted pipeline on the held-out rows.
y_pred = model.predict(X_test)

# Root mean squared error, expressed in the same units as the target.
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
rmse


np.float64(65946.64691600665)

In [9]:
# Three hand-written profiles, laid out column-wise. Position i in every
# list belongs to the same hypothetical employee.
sample_data = pd.DataFrame(
    {
        "experience_level": ["EN", "MI", "SE"],
        "employment_type": ["FT", "FT", "FT"],
        "job_title": ["Data Analyst", "Data Engineer", "Data Scientist"],
        "remote_ratio": [0, 50, 100],
        "company_size": ["S", "M", "L"],
    }
)

# Run the profiles through the fitted pipeline and show the predicted values.
predictions = model.predict(sample_data)
predictions


array([ 27582.75246027, 117812.2127311 , 141322.22319354])