# Salary Prediction with Linear Regression
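We model how **experience**, **education level**, and **skills** (the number of skills) influence IT salaries. Linear regression estimates one coefficient per feature; each coefficient shows how much the predicted salary changes when that feature increases by one unit while the other features are held fixed. We use **5-Fold Cross-Validation** to obtain a more reliable estimate of prediction error than a single train/test split would give: the data is divided into five folds, and each fold takes one turn as the validation set while the model is trained on the remaining four.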

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, cross_val_score

In [None]:
# Small hand-entered dataset: ten rows with three predictors and the salary target.
raw_columns = {
    "experience_years": [1, 2, 3, 4, 5, 6, 8, 10, 12, 15],
    "education": [
        "Bachelor", "Bachelor", "Master", "Master", "Master",
        "PhD", "Bachelor", "Master", "PhD", "PhD",
    ],
    "skills_count": [3, 3, 4, 5, 5, 6, 7, 7, 8, 9],
    "salary_k": [45, 50, 60, 68, 75, 90, 95, 110, 125, 140],
}
data = pd.DataFrame(raw_columns)

# Everything except the target column is a predictor; column order is kept.
target_column = "salary_k"
feature_columns = [name for name in data.columns if name != target_column]

X = data[feature_columns]
y = data[target_column]

In [None]:
# Column groups: "education" holds text labels and must be encoded; the other
# two columns are already numeric and can be used as-is.
categorical_features = ["education"]
numeric_features = ["experience_years", "skills_count"]

# One-hot encode the categorical column, dropping the first category so the
# dummy columns are not perfectly collinear with the model's intercept.
# The numeric columns are passed through by name (instead of relying on
# remainder="passthrough") so that any column later added to X is not silently
# fed into the regression. Output column order is unchanged: encoded
# education columns first, then the numeric columns.
preprocess = ColumnTransformer(
    transformers=[
        ("edu", OneHotEncoder(drop="first"), categorical_features),
        ("num", "passthrough", numeric_features),
    ],
    verbose_feature_names_out=False,
)

# Preprocessing lives inside the pipeline so the encoder is re-fitted on the
# training part of every cross-validation split.
pipeline = Pipeline([
    ("prep", preprocess),
    ("model", LinearRegression()),
])

# Shuffled 5-fold split with a fixed seed, so both scoring runs below see the
# same folds and the results are reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# scikit-learn reports MSE negated (higher is better); flip the sign back.
mse_scores = -cross_val_score(pipeline, X, y, cv=kf, scoring="neg_mean_squared_error")
# NOTE: with 10 rows and 5 folds each test fold holds only 2 rows, so the
# per-fold R^2 values are very noisy; read the average with caution.
r2_scores = cross_val_score(pipeline, X, y, cv=kf, scoring="r2")

print("Average MSE:", mse_scores.mean())
print("Average R^2:", r2_scores.mean())

In [None]:
# Refit on the full dataset so the reported coefficients use every row.
pipeline.fit(X, y)

# Pull the fitted steps out once instead of indexing named_steps repeatedly.
fitted_prep = pipeline.named_steps["prep"]
fitted_model = pipeline.named_steps["model"]

# Names of the columns the model actually saw after preprocessing; the
# coefficient array printed below is in this same column order.
feature_names = fitted_prep.get_feature_names_out()

print("Feature names after encoding:", feature_names)
print("Coefficients:", fitted_model.coef_)
print("Intercept:", fitted_model.intercept_)