# Python Job Market — Data Preview & Analysis

This notebook provides a **preview of the scraped job vacancies data**
and demonstrates **initial analytical insights**:

- structure of collected data
- technology demand
- experience level distribution

The data was collected using **async web scraping**
from public job platforms.


In [1]:
import sys
from pathlib import Path

# Make the project root (the parent of the current working directory)
# importable so that the `config` and `analysis` imports below resolve.
# NOTE(review): Path("..") is resolved against the kernel's working directory,
# so this assumes the notebook is started from its own folder — confirm.
PROJECT_ROOT = Path("..").resolve()

# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import config
from collections import Counter

from analysis.preprocessing import load_raw_data
from analysis.technologies import (
    extract_technologies,
    counter_to_df,
    technologies_by_experience,
    extract_technologies_from_text
)
from analysis.experience import classify_experience
from analysis.visualization import (
    plot_top_technologies,
    plot_experience_distribution,
    plot_counter
)

In [None]:
# Load the vacancies from the "raw" sub-folder of the configured data directory
# and preview the first rows.
raw_data_dir = config.DATA_DIR / "raw"
df = load_raw_data(raw_data_dir)
df.head()

In [None]:
# Report the dataset size, then show how many vacancies each source contributed.
print(f"Total vacancies: {len(df)}")

vacancies_per_source = df["source"].value_counts()
vacancies_per_source

In [None]:
# Count technology mentions over the whole frame and tabulate the result.
# NOTE(review): this runs before the later cells that normalise the
# `technologies` column and re-extract it for "dou.ua" rows; if
# extract_technologies reads that column, those rows may be under-counted
# here — confirm against analysis.technologies.
tech_counter = extract_technologies(df)
tech_df = counter_to_df(tech_counter)

# Preview the first five rows (the default for head()).
tech_df.head(5)

In [None]:
# Number of most-mentioned technologies to include in the chart.
top_n = 20
top_tech = tech_df.sort_values("count", ascending=False).head(top_n)

# savefig does not create missing directories, so make sure the output
# folder exists before writing the image.
Path(config.IMAGES_DIR).mkdir(parents=True, exist_ok=True)

# Use the explicit Figure/Axes interface instead of the pyplot state machine.
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_tech["technology"], top_tech["count"])
ax.invert_yaxis()  # largest bar on top
ax.set_title("Top Technologies in Python Vacancies")
ax.set_xlabel("Mentions")
fig.tight_layout()
fig.savefig(config.IMAGES_DIR / "top_technologies.png")
plt.show()

In [None]:
def _as_list(value):
    """Return ``value`` unchanged if it is a list, otherwise an empty list."""
    if isinstance(value, list):
        return value
    return []


# Replace every non-list entry in `technologies` with an empty list so that
# the column can be iterated uniformly.
df["technologies"] = df["technologies"].apply(_as_list)

# Derive an experience level for each vacancy from its description text,
# then show how many vacancies fall into each level.
df["experience_level"] = df["description"].apply(classify_experience)
df["experience_level"].value_counts()

In [8]:
# For vacancies whose source is "dou.ua", overwrite `technologies` with the
# result of running extract_technologies_from_text on the description.
mask = df["source"] == "dou.ua"

dou_technologies = df.loc[mask, "description"].apply(extract_technologies_from_text)
df.loc[mask, "technologies"] = dou_technologies

In [9]:
# Plot the technology counts for the "junior" experience level and save the chart.
junior_counter = technologies_by_experience(df, "junior")
junior_chart_path = config.IMAGES_DIR / "junior_tech.png"

plot_counter(
    junior_counter,
    title="Top Technologies (Junior)",
    output_path=junior_chart_path,
)

In [10]:
# Plot the technology counts for the "middle" experience level and save the chart.
middle_counter = technologies_by_experience(df, "middle")
middle_chart_path = config.IMAGES_DIR / "middle_tech.png"

plot_counter(
    middle_counter,
    title="Top Technologies (Middle)",
    output_path=middle_chart_path,
)

In [11]:
# Plot the technology counts for the "senior" experience level and save the chart.
senior_counter = technologies_by_experience(df, "senior")
senior_chart_path = config.IMAGES_DIR / "senior_tech.png"

plot_counter(
    senior_counter,
    title="Top Technologies (Senior)",
    output_path=senior_chart_path,
)

In [None]:
# Draw one chart per vacancy source, counting every technology listed in
# that source's rows.
for source in df["source"].unique():
    subset = df[df["source"] == source]
    # Flatten the per-vacancy technology lists into a single tally.
    counter = Counter(
        tech for techs in subset["technologies"] for tech in techs
    )
    # Pass the title by keyword, matching the other plot_counter calls in
    # this notebook, so the call does not depend on positional parameter order.
    # No output_path is given, unlike the per-experience charts.
    # NOTE(review): confirm plot_counter accepts a missing output_path.
    plot_counter(counter, title=f"Top technologies on {source}")
