In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

In [None]:
jobs_df = pd.read_csv('data.csv')

In [None]:
jobs_df.head(5)

In [None]:
len(jobs_df)

In [None]:
jobs_df.info()

In [None]:
jobs_df.describe().astype(int)

In [None]:
jobs_df.isna().sum()

In [None]:
jobs_df.corr(numeric_only=True)

In [None]:
jobs_df.hist(bins=100, figsize=(16, 8))
plt.show()

In [None]:
# Columns treated as categorical in the per-category plots below.
categorical_features = [
    'company size',
    'location',
    'technology',
    'seniority',
]

In [None]:
# Axis labels and the genitive forms used inside the plot titles
# ("... w zależności od <title>"), one entry per categorical feature.
x_labels = ['rozmiar firmy', 'lokalizacja', 'technologia', 'doświadczenie']
titles = ['rozmiaru firmy', 'lokalizacji', 'technologii', 'doświadczenia']

# One horizontal bar chart of offer counts per categorical feature.
# The figure is created first: calling plt.tight_layout() before any figure
# exists only produces a stray empty figure.
fig, ax = plt.subplots(2, 2, figsize=(14, 12))
ax = ax.flatten()
for i, feature_name in enumerate(categorical_features):
    # Count once and reuse; ascending order puts the largest bar at the top.
    counts = jobs_df[feature_name].value_counts().sort_values()
    bars1 = ax[i].barh([str(x) for x in counts.index], counts.values)
    ax[i].set_title(f'Liczba ofert w zależności od {titles[i]}')
    ax[i].set_ylabel(x_labels[i])
    ax[i].set_xlabel('Liczba ofert')
    ax[i].bar_label(bars1)

# Extra room between the panels so titles and tick labels do not collide.
fig.subplots_adjust(hspace=0.3, wspace=0.3)
plt.show()

In [None]:
jobs_df['company'].value_counts()

In [None]:
jobs_both_salary_df = jobs_df[~jobs_df[['salary b2b min', 'salary b2b max', 'salary employment min', 'salary employment max']].isna().any(axis=1)]

In [None]:
employment_b2b_rate_df = (((jobs_both_salary_df['salary b2b min'] + jobs_both_salary_df['salary b2b max']) / 2) / ((jobs_both_salary_df['salary employment min'] + jobs_both_salary_df['salary employment max']) / 2))
employment_b2b_rate_mean = employment_b2b_rate_df.mean()
employment_b2b_rate_std = employment_b2b_rate_df.std()
print(employment_b2b_rate_mean, employment_b2b_rate_std)

In [None]:
# Unified salary per offer:
#   * the B2B midpoint when a B2B minimum is given,
#   * otherwise the employment midpoint scaled by the mean B2B/employment ratio.
# Computed column-wise instead of row by row, so the column is float from the
# start (no int column that is later filled with floats) and no Python-level
# loop over the rows is needed.
b2b_midpoint = (jobs_df['salary b2b min'] + jobs_df['salary b2b max']) / 2
employment_midpoint = (jobs_df['salary employment min'] + jobs_df['salary employment max']) / 2
jobs_df['salary'] = np.where(
    jobs_df['salary b2b min'].isna(),
    employment_midpoint * employment_b2b_rate_mean,
    b2b_midpoint,
)

In [None]:
jobs_df

In [None]:
jobs_df.corr()

In [None]:
jobs_df[['company size', 'year', 'month', 'salary']].corr()['salary']

In [None]:
target = 'salary'

In [None]:
# Axis labels and the genitive forms used inside the plot titles
# ("... w zależności od <title>"), one entry per categorical feature.
x_labels = ['rozmiar firmy', 'lokalizacja', 'technologia', 'doświadczenie']
titles = ['rozmiaru firmy', 'lokalizacji', 'technologii', 'doświadczenia']

# Mean salary per category with the standard deviation as the error bar.
# The figure is created first: calling plt.tight_layout() before any figure
# exists only produces a stray empty figure.
fig, ax = plt.subplots(2, 2, figsize=(14, 12))
ax = ax.flatten()
for i, feature_name in enumerate(categorical_features):
    # Mean and std are computed together and sorted by the mean, so every
    # error bar stays attached to the category it was computed for. Sorting
    # the std values on their own would pair them with the wrong bars.
    salary_stats = jobs_df.groupby(feature_name)['salary'].agg(['mean', 'std']).sort_values('mean')
    bars1 = ax[i].barh(
        [str(x) for x in salary_stats.index],
        salary_stats['mean'].values,
        xerr=salary_stats['std'].values,
    )
    ax[i].set_title(f'Średnie zarobki w zależności od {titles[i]}')
    ax[i].set_ylabel(x_labels[i])
    ax[i].set_xlabel('Średnie zarobki')
    ax[i].bar_label(bars1)

# Extra room between the panels so titles and tick labels do not collide.
fig.subplots_adjust(hspace=0.3, wspace=0.3)
plt.show()

In [None]:
# Salary distribution per category as horizontal box plots.
# Rows without a computed salary are excluded first; a NaN inside a group
# would otherwise propagate into that group's box statistics.
salaried_df = jobs_df.dropna(subset=['salary'])

# The figure is created first: calling plt.tight_layout() before any figure
# exists only produces a stray empty figure.
fig, ax = plt.subplots(2, 2, figsize=(14, 12))
ax = ax.flatten()

for i, feature_name in enumerate(categorical_features):
    # Group once; categories are ordered by ascending mean salary.
    salary_by_category = salaried_df.groupby(feature_name)['salary']
    labels = salary_by_category.mean().sort_values().index
    data_to_plot = [salary_by_category.get_group(category) for category in labels]

    # Boxes are drawn at positions 1..N; the tick labels are set explicitly
    # (same approach as the violin plots) instead of through the boxplot
    # `labels` keyword, which newer matplotlib releases deprecate.
    ax[i].boxplot(data_to_plot, vert=False, patch_artist=True)
    ax[i].set_yticks(range(1, len(labels) + 1))
    ax[i].set_yticklabels(labels)
    ax[i].set_title(f'Rozkład zarobków w zależności od {titles[i]}')
    ax[i].set_ylabel(x_labels[i])
    ax[i].set_xlabel('Zarobki')

# Extra room between the panels so titles and tick labels do not collide.
fig.subplots_adjust(hspace=0.3, wspace=0.3)
plt.show()

In [None]:
# Salary distribution per category as horizontal violin plots.
# Rows without a computed salary are excluded first so that no NaN reaches
# the density estimate of a violin.
salaried_df = jobs_df.dropna(subset=['salary'])

# The figure is created first: calling plt.tight_layout() before any figure
# exists only produces a stray empty figure.
fig, ax = plt.subplots(2, 2, figsize=(14, 12))
ax = ax.flatten()

for i, feature_name in enumerate(categorical_features):
    # Group once; categories are ordered by ascending mean salary.
    salary_by_category = salaried_df.groupby(feature_name)['salary']
    labels = salary_by_category.mean().sort_values().index
    data_to_plot = [salary_by_category.get_group(category) for category in labels]

    # Violin plot with the mean drawn in green and the median in red.
    r = ax[i].violinplot(data_to_plot, vert=False, showmedians=True, showmeans=True)
    r['cmeans'].set_color('g')
    r['cmedians'].set_color('r')
    ax[i].set_title(f'Rozkład zarobków w zależności od {titles[i]}')
    # Violins sit at positions 1..N, so the category names go on those ticks.
    ax[i].set_yticks(range(1, len(labels) + 1))
    ax[i].set_yticklabels(labels)
    ax[i].set_ylabel(x_labels[i])
    ax[i].set_xlabel('Zarobki')

# Extra room between the panels so titles and tick labels do not collide.
fig.subplots_adjust(hspace=0.3, wspace=0.3)
plt.show()

In [None]:
df_cpy = jobs_df.copy()
company_size_order = [1, 10, 100, 1000, 10000, 100000]
df_cpy['company size'] = pd.Categorical(df_cpy['company size'], categories=company_size_order, ordered=True)
salary_matrix = df_cpy.groupby(['technology', 'company size'])[target].mean().unstack()
salary_matrix.round(1)

In [None]:
df_cpy = jobs_df.copy()
company_size_order = [1, 10, 100, 1000, 10000, 100000]
df_cpy['company size'] = pd.Categorical(df_cpy['company size'], categories=company_size_order, ordered=True)
salary_matrix = df_cpy.groupby(['technology', 'company size'])[target].count().unstack()
salary_matrix.round(1)

In [None]:
df_cpy = jobs_df.copy()
company_size_order = [1, 10, 100, 1000, 10000, 100000]
df_cpy['company size'] = pd.Categorical(df_cpy['company size'], categories=company_size_order, ordered=True)
salary_matrix = df_cpy.groupby(['technology', 'company size'])[target].count().unstack()
salary_matrix = salary_matrix.apply(lambda x: x / x.sum(), axis=1)
salary_matrix.round(2)

In [None]:
df_cpy = jobs_df.copy()
company_size_order = [1, 10, 100, 1000, 10000, 100000]
df_cpy['company size'] = pd.Categorical(df_cpy['company size'], categories=company_size_order, ordered=True)
salary_matrix = df_cpy.groupby(['technology', 'company size'])[target].count().unstack()
salary_matrix = salary_matrix.apply(lambda x: x / x.sum(), axis=0)
salary_matrix.round(2)

In [None]:
df_cpy = jobs_df.copy()
seniority_order = ['junior', 'mid', 'senior', 'expert']
df_cpy['seniority'] = pd.Categorical(df_cpy['seniority'], categories=seniority_order, ordered=True)
salary_matrix = df_cpy.groupby(['technology', 'seniority'])[target].count().unstack()
salary_matrix.round(1)

In [None]:
df_cpy = jobs_df.copy()
seniority_order = ['junior', 'mid', 'senior', 'expert']
df_cpy['seniority'] = pd.Categorical(df_cpy['seniority'], categories=seniority_order, ordered=True)
salary_matrix = df_cpy.groupby(['technology', 'seniority'])[target].std().unstack()
salary_matrix.round(1)

In [None]:
df_cpy = jobs_df.copy()
seniority_order = ['junior', 'mid', 'senior', 'expert']
df_cpy['seniority'] = pd.Categorical(df_cpy['seniority'], categories=seniority_order, ordered=True)
salary_matrix = df_cpy.groupby(['technology', 'seniority'])[target].mean().unstack()
salary_matrix.round(1)

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, MinMaxScaler, StandardScaler, FunctionTransformer
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error

In [None]:
jobs_df.dropna(subset=['location', 'technology', 'seniority']).isna().sum()

In [None]:
jobs_df.info()

In [None]:
jobs_filtered_df = jobs_df.dropna(subset=['location', 'technology', 'seniority', target])

In [None]:
X = jobs_filtered_df.filter(regex='^(?!.*salary).*').drop('company', axis=1)
X['company size'] = X['company size'].astype(str)
y = jobs_filtered_df[target]

In [None]:
X.head()

In [None]:
X.isna().sum()

In [None]:
y

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
categorical_features

In [None]:
X.info()

In [None]:
noop_transformer = FunctionTransformer(lambda x: x)

In [None]:
cat_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), OneHotEncoder())
num_pipeline = make_pipeline(noop_transformer)

preprocessing = make_column_transformer(
    (cat_pipeline, make_column_selector(dtype_include=object)),
    (num_pipeline, make_column_selector(dtype_include=np.number))
)

In [None]:
labels = []
values = []
errs = []
for regressor in [LinearRegression(), KNeighborsRegressor(), RandomForestRegressor(), SVR(), Lasso(), Ridge(), MLPRegressor()]:
    name = str(regressor)[:-2]

    scores = cross_val_score(make_pipeline(preprocessing, regressor), X_train, y_train, n_jobs=-1, cv=10, scoring=make_scorer(mean_absolute_error))
    mean = scores.mean()
    std = scores.std()
    
    labels.append(name)
    values.append(mean)
    errs.append(std)

In [None]:
print(labels)

In [None]:
# Horizontal bar chart of the cross-validated MAE per model, with the
# fold-to-fold standard deviation as the error bar.
fig, ax = plt.subplots(figsize=(8, 4))
# The legend entry is attached to the bars themselves. Axes.set_label() only
# names the Axes object and has no effect on the legend.
bar1 = ax.barh(labels, values, xerr=errs, height=0.5, label='Mean absolute Error')
ax.set_xlabel('Wartości metryki')
ax.set_ylabel('Model')
ax.set_title('Porównanie modeli')
ax.legend(loc='lower left')
ax.bar_label(bar1)
plt.tight_layout()
plt.show()

In [None]:
pipeline = make_pipeline(preprocessing, RandomForestRegressor())

In [None]:
param_dist = {
    'randomforestregressor__n_estimators': [100, 200, 300],
    'randomforestregressor__max_depth': [None, 1, 3, 10],
    'randomforestregressor__min_samples_split': [2, 5, 10],
    'randomforestregressor__min_samples_leaf': [1, 2, 4],
    'randomforestregressor__max_features': [1, 'sqrt', 'log2']
}
random_search = RandomizedSearchCV(estimator=pipeline, param_distributions=param_dist, n_iter=10, cv=10, n_jobs=-1, scoring='neg_mean_absolute_error', verbose=2)
random_search.fit(X_train, y_train)
best_rf = random_search.best_estimator_
print(random_search.best_params_)

In [None]:
print(random_search.best_score_)

In [None]:
# Refit the forest with hard-coded hyper-parameters; this replaces the
# best_rf taken from the random search.
# NOTE(review): presumably these values were copied from an earlier search
# run. Confirm they still match random_search.best_params_.
# The seed makes the fitted forest, and every plot derived from it, reproducible.
best_rf = make_pipeline(preprocessing, RandomForestRegressor(n_estimators=300, max_depth=None, min_samples_split=2, min_samples_leaf=2, max_features='log2', random_state=42))
best_rf.fit(X_train, y_train)

In [None]:
# Predicted versus true salaries on the held-out set; the dashed red line
# marks a perfect prediction (y = x).
scatter_fig, scatter_ax = plt.subplots(figsize=(8, 6))
scatter_ax.scatter(y_test, best_rf.predict(X_test), color='blue')
scatter_ax.plot(y_test, y_test, color='red', linestyle='--')
scatter_ax.set_title('Poprawne wartości a przewidziane wartości')
scatter_ax.set_xlabel('Poprawne wartości')
scatter_ax.set_ylabel('Przewidziane wartości')
scatter_ax.grid(True)
plt.show()

In [None]:
best_rf_rf = list(best_rf.named_steps.items())[-1][1]

In [None]:
list(best_rf.named_steps.values())[0].transform(X[:1]).todense().shape

In [None]:
list(best_rf.named_steps.values())[0].transform(X[:1]).todense().shape

In [None]:
X

In [None]:
# Importance of every encoded feature in the fitted forest.
importances = best_rf_rf.feature_importances_
# Feature indices ordered from most to least important.
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(12, 6))
plt.title("Znaczenie cech")
# Each bar is drawn at its feature index with that feature's own importance.
# Passing the sorted indices together with the unsorted importances would
# attach the values to the wrong features.
plt.barh(indices, importances[indices], align="center")
plt.xlabel("Znaczenie")
plt.ylabel("Cecha")
plt.show()

In [None]:
best_lin = make_pipeline(preprocessing, LinearRegression())
best_lin.fit(X_train, y_train)

In [None]:
list(best_lin.named_steps.values())[0]

In [None]:
# Coefficient of every encoded feature in the fitted linear model.
importances = best_lin.steps[1][1].coef_
# Feature indices ordered from the largest to the smallest coefficient.
indices = np.argsort(importances)[::-1]
plt.figure(figsize=(12, 6))
plt.title("Znaczenie cech")
# Each bar is drawn at its feature index with that feature's own coefficient.
# Passing the sorted indices together with the unsorted coefficients would
# attach the values to the wrong features.
plt.barh(indices, importances[indices], align="center")
plt.xlabel("Znaczenie")
plt.ylabel("Cecha")
plt.show()

In [None]:
from sklearn.tree import export_graphviz, plot_tree
import graphviz

# Visualise one tree of the forest: the last fitted estimator.
estimator = best_rf_rf.estimators_[-1]

# Generic names "Feature 0", "Feature 1", ... - one per encoded input column.
feature_labels = [f'Feature {i}' for i in range(len(best_rf_rf.feature_importances_))]

# Build the DOT description in memory and render it to "tree.png".
dot_data = export_graphviz(
    estimator,
    out_file=None,
    feature_names=feature_labels,
    filled=True,
    rounded=True,
    special_characters=True,
)
graph = graphviz.Source(dot_data)
graph.render("tree", format='png')

# Alternative without graphviz (kept disabled):
# plt.figure(figsize=(20, 10))
# plot_tree(estimator, feature_names=feature_labels, filled=True)
# plt.show()

In [None]:
best_rf.predict(X_test)

In [None]:
best_rf

In [None]:
list(best_rf.named_steps.values())[0]


In [None]:
list(best_rf.named_steps.values())[0].transform(X).todense().shape

In [None]:
import shap
explainer = shap.TreeExplainer(list(best_rf.named_steps.values())[1], list(best_rf.named_steps.values())[0].transform(X[:100]).todense())

In [None]:
shap_values = explainer(list(best_rf.named_steps.values())[0].transform(X[:500]).todense())

In [None]:
shap.plots.bar(shap_values)

In [None]:
shap.plots.waterfall(shap_values[0])