In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

In [None]:
# Read the salary dataset; the relative path is resolved against the kernel's
# current working directory.
df = pd.read_csv(filepath_or_buffer="FoAI_A2_data_4.6k.csv")

# Show the first five rows as a quick check that the file parsed as expected.
df.head(n=5)


In [None]:
# Print a structural summary of the loaded frame (columns, dtypes, non-null
# counts) so missing values and unexpected types can be spotted before modelling.
df.info()


In [None]:
# Feature matrix: the five predictor columns, in the order listed here.
X = df.loc[
    :,
    ["experience_level", "employment_type", "job_title", "remote_ratio", "company_size"],
]

# Regression target: salary expressed in USD.
y = df["salary_in_usd"]


In [None]:
# Columns treated as categorical; each one is expanded into one-hot indicators.
categorical_features = ["experience_level", "employment_type", "job_title", "company_size"]

# Columns forwarded to the model unchanged.
numeric_features = ["remote_ratio"]

# Column-wise preprocessing:
#   "cat" - one-hot encode; handle_unknown="ignore" means a category not seen
#           during fit is encoded as all zeros instead of raising at predict time.
#   "num" - "passthrough" leaves the values as they are.
# Columns not named in either list are dropped (ColumnTransformer's default).
preprocessor = ColumnTransformer(
    [
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)


In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# Chain preprocessing and the estimator so the encoder is fitted on the
# training rows only and the identical transform is reused at predict time.
model = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("regressor", LinearRegression()),
    ]
)

# Fit on the training split; as the last expression, the fitted pipeline is
# also rendered as the cell output.
model.fit(X_train, y_train)


In [None]:
# Predict salaries for the held-out rows.
y_pred = model.predict(X_test)

# Root mean squared error: square root of the MSE, so the value is in the same
# units as the target (USD).
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))
rmse


In [None]:
# Three hand-written example profiles (entry, mid and senior level), laid out
# column-wise. The column order matches the feature order used elsewhere in the
# notebook; row i across the lists forms the i-th profile.
sample_data = pd.DataFrame(
    {
        "experience_level": ["EN", "MI", "SE"],
        "employment_type": ["FT", "FT", "FT"],
        "job_title": ["Data Analyst", "Data Engineer", "Data Scientist"],
        "remote_ratio": [0, 50, 100],
        "company_size": ["S", "M", "L"],
    }
)

# Score the hand-built rows with the pipeline; preprocessing and regression
# both run inside model.predict, so the raw feature frame is passed directly.
predictions = model.predict(sample_data)
# Display the predicted values, one per row of sample_data, in row order.
predictions
