In [None]:
import html
import json
import os

import folium
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from folium.plugins import MarkerCluster

In [None]:
def fetch_adzuna_jobs(keyword, session, country="us", pages=2):
    """Fetch job postings for one keyword from the Adzuna search endpoint.

    Pages ``1..pages`` of ``/v1/api/jobs/{country}/search/{page}`` are
    requested in order through ``session``. Paging stops at the first page
    that fails (transport error, HTTP error status, non-JSON body) or that
    carries no results; jobs collected before that point are still returned.

    Args:
        keyword: Search phrase, sent as the ``what`` query parameter.
        session: Object exposing a ``requests.Session``-compatible ``get``.
            Any credentials are expected to be configured on it by the caller.
        country: Country code inserted into the URL path.
        pages: Maximum number of result pages to request.

    Returns:
        A list of job dicts taken from the ``"results"`` key of each page.
        Every dict gets an extra ``"search_keyword"`` entry set to ``keyword``.
    """
    all_jobs = []

    for page in range(1, pages + 1):
        url = f"https://api.adzuna.com/v1/api/jobs/{country}/search/{page}"

        try:
            response = session.get(url, params={"what": keyword}, timeout=10)
            # Without this, a 4xx/5xx reply (bad credentials, unknown country,
            # rate limit) would be read as "no results" and pass unnoticed.
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Ошибка запроса на странице {page}: {e}")
            break

        try:
            data = response.json()
        except ValueError as e:
            # JSON decoding errors are ValueError subclasses; an HTML error
            # page or truncated body ends paging instead of crashing the run.
            print(f"Ошибка разбора ответа на странице {page}: {e}")
            break

        # Guard against a payload that is valid JSON but not an object.
        results = data.get("results", []) if isinstance(data, dict) else []
        if not results:
            break

        for job in results:
            job["search_keyword"] = keyword

        all_jobs.extend(results)

    return all_jobs

In [None]:
# Adzuna credentials are read from the environment so that they are not
# stored in the notebook.
# NOTE(review): the key pair that used to be hardcoded in this cell has been
# exposed and should be rotated.
app_id = os.environ.get("ADZUNA_APP_ID")
app_key = os.environ.get("ADZUNA_APP_KEY")
if not app_id or not app_key:
    raise RuntimeError(
        "Задайте переменные окружения ADZUNA_APP_ID и ADZUNA_APP_KEY "
        "перед запуском этой ячейки."
    )

# Default query parameters for requests sent through this session.
session = requests.Session()
session.params = {
    "app_id": app_id,
    "app_key": app_key,
    "results_per_page": 50,
}

# Search phrases; each one is queried separately for every country.
keywords = [
    "data analyst",
    "software engineer",
    "marketing specialist",
    "project manager",
    "product manager",
    "graphic designer",
    "ux/ui designer",
    "data scientist",
    "financial analyst",
    "sales manager"
]

# Country codes used in the API path. "uk" is intentionally not listed:
# Adzuna identifies the United Kingdom as "gb", so "uk" only produced failing
# requests.
countries = [
    "au",  # Australia
    "at",  # Austria
    "be",  # Belgium
    "br",  # Brazil
    "ca",  # Canada
    "ch",  # Switzerland
    "fr",  # France
    "de",  # Germany
    "es",  # Spain
    "in",  # India
    "it",  # Italy
    "mx",  # Mexico
    "nl",  # Netherlands
    "nz",  # New Zealand
    "pl",  # Poland
    "sg",  # Singapore
    "za",  # South Africa
    "gb",  # United Kingdom
    "us"  # United States
]

# Collect raw job dicts for every (country, keyword) pair.
all_jobs = []
for country in countries:
    for kw in keywords:
        print(f"Запрос — страна {country}, keyword '{kw}'")
        jobs = fetch_adzuna_jobs(kw, session, country)
        all_jobs.extend(jobs)
        print(f"  {len(jobs)} вакансий для {country}/{kw}")
        print()

# Flatten nested fields into dotted column names (e.g. "location.area").
df = pd.json_normalize(all_jobs)

In [None]:
# df.to_csv("jobs.csv", index=False)

In [None]:
# Load the saved snapshot. This rebinds `df`, replacing any frame built from
# the live API in an earlier cell, and needs jobs.csv in the working directory.
df = pd.read_csv("jobs.csv")

In [None]:
df

In [None]:
df.info()

In [None]:
# Columns that are not used anywhere in the analysis below.
drop_list = [
    'description',
    'id',
    'redirect_url',
    'adref',
    'salary_is_predicted',
    '__CLASS__',
    'category.tag',
    'category.__CLASS__',
    'company.__CLASS__',
    'location.display_name',
    'location.__CLASS__',
    'contract_time',
    'contract_type']

# Rebind instead of mutating in place, and ignore labels that are already
# gone: the cell can then be re-run, or run on data that lacks one of the
# optional fields, without raising KeyError.
df = df.drop(columns=drop_list, errors="ignore")

In [None]:
df

In [None]:
df['location.area']

In [None]:
df.info()

In [None]:
# "location.area" holds a list-like path such as
# ['Australia', 'ACT', 'Canberra', 'North Canberra'] (as text after a CSV
# round trip). It is flattened to "a, b, c, d" and split into levels.
raw_area = df["location.area"]

clean_location = (
    raw_area
    .astype(str)
    .str.replace(r"[\[\]']", "", regex=True)  # strip [, ] and '
    # astype(str) turns a missing value into the text "nan", which would then
    # show up as a country called "nan"; put the real NaN back.
    .where(raw_area.notna())
)

# Always expose exactly four levels. Without the reindex, data in which no
# path is four levels deep has no column 3 and the lookup raises KeyError.
# The object cast keeps the .str accessor usable on an all-missing column.
location_split = (
    clean_location.str.split(",", expand=True)
    .reindex(columns=range(4))
    .astype(object)
)

df["country"] = location_split[0].str.strip()
df["state"] = location_split[1].str.strip()
df["area"] = location_split[2].str.strip()
df["city"] = location_split[3].str.strip()

df

In [None]:
# Give the flattened API fields short names, then keep only the columns the
# analysis uses, in a fixed order.
df = (
    df.rename(
        columns={
            "company.display_name": "company",
            "category.label": "category",
            "search_keyword": "keyword",
            "created": "created_at",
        }
    )
    .loc[
        :,
        [
            "title", "company", "category", "keyword",
            "country", "state", "area", "city",
            "salary_min", "salary_max", "created_at",
            "latitude", "longitude",
        ],
    ]
)

In [None]:
df

In [None]:
df.info()

In [None]:
df.loc[df['city'] == "North Canberra", 'salary_min']

In [None]:
df

In [None]:
num_cols = ['salary_min', 'salary_max']

# Location hierarchy used for imputation, most specific level first.
location_levels = [
    ['country', 'state', 'area', 'city'],
    ['country', 'state', 'area'],
    ['country', 'state'],
    ['country'],
]


def _fill_with_group_median(values):
    """Fill missing salaries in one group with the group median.

    The median is applied only when the group has at least one known value,
    more than 5 rows, and a maximum below 10x the median; otherwise the group
    is returned unchanged.
    """
    # NOTE(review): len() counts rows including missing ones, so a group of
    # six rows with a single known salary still qualifies - confirm intent.
    if not values.notna().any() or len(values) <= 5:
        return values
    median = values.median()
    if values.max() < median * 10:
        return values.fillna(median)
    return values


for col in num_cols:
    for keys in location_levels:
        # dropna=False is essential: by default groupby discards rows whose
        # key contains NaN, and transform then returns NaN for them, wiping
        # out real salaries of every posting without a state/area/city.
        df[col] = df.groupby(keys, dropna=False)[col].transform(_fill_with_group_median)

In [None]:
df.loc[df['city'] == "North Canberra", 'salary_min']

In [None]:
def _most_common_category(categories):
    """Return the most frequent category in a group, or 'Not Provided' if none."""
    modes = categories.mode()
    return modes.iloc[0] if not modes.empty else 'Not Provided'


# Most frequent category per search keyword, ignoring "Unknown".
mode_per_keyword = (
    df[df['category'] != 'Unknown']
    .groupby('keyword')['category']
    .agg(_most_common_category)
)

# Replace 'Unknown' with the keyword's most frequent category. A keyword whose
# categories are all 'Unknown' is absent from mode_per_keyword; the previous
# per-row lookup raised KeyError for it, here it falls back to 'Not Provided'.
is_unknown = df['category'] == 'Unknown'
replacement = df['keyword'].map(mode_per_keyword).fillna('Not Provided')
df['category'] = df['category'].mask(is_unknown, replacement)

In [None]:
# Missing location levels are expected (not every posting has a full path),
# so they get a placeholder instead of being dropped.
location_cols = ['state', 'area', 'city']
df[location_cols] = df[location_cols].fillna('Not Provided')

# Drop rows that still have a gap in any other column, then exact duplicates.
# drop_duplicates() returns a new frame; its result used to be discarded, so
# duplicates were never actually removed.
df = df.dropna().drop_duplicates()

# Remaining missing values per column (expected to be all zeros).
df.isna().sum()

In [None]:
df

In [None]:
# Parse the timestamps as UTC (unparseable values become NaT), then express
# them in the Asia/Tashkent time zone.
created_utc = pd.to_datetime(df['created_at'], utc=True, errors='coerce')
df['created_at'] = created_utc.dt.tz_convert('Asia/Tashkent')
df['created_at']

In [None]:
df.info()

In [None]:
df

In [None]:
# Ten most frequent job titles.
top_titles = df['title'].value_counts().iloc[:10]
top_titles

In [None]:
# Ten companies with the most postings.
top_companies = df['company'].value_counts().iloc[:10]
top_companies

In [None]:
# Most frequent skills (search keywords).
top_skills = df['keyword'].value_counts().iloc[:10]
top_skills

In [None]:
# Ten cities with the most postings, leaving out the "Not Provided" placeholder.
has_city = df['city'].ne('Not Provided')
top_cities = df.loc[has_city, 'city'].value_counts().iloc[:10]
top_cities

In [None]:
# Number of postings per country, largest first.
top_countries = df.loc[:, 'country'].value_counts()
top_countries

In [None]:
# Mean of the lower and upper salary bounds over all postings.
# NOTE(review): rows from all countries are averaged together as-is; confirm
# that the salary columns share one currency before quoting these numbers.
avg_salary_min, avg_salary_max = (
    round(df[bound].mean(), 2) for bound in ('salary_min', 'salary_max')
)
summary_lines = [
    "Средняя зарплата:",
    f"    min:   {avg_salary_min}",
    f"    max:   {avg_salary_max}",
]
print("\n".join(summary_lines))

In [None]:
# Job titles ranked by mean salary.
# salary_avg is the row-wise mean of the two salary bounds.
salary_bounds = df[['salary_min', 'salary_max']]
df['salary_avg'] = salary_bounds.mean(axis=1)

top_paid_jobs = (
    df.groupby('title')['salary_avg']
    .mean()
    .sort_values(ascending=False)
)
top_paid_jobs

In [None]:
# Question: on which weekdays are most vacancies published?
df['created_weekday'] = df['created_at'].dt.day_name()

weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# reindex puts the days in calendar order. fill_value=0 reports a weekday with
# no postings as 0; without it the entry is NaN and all counts become floats.
weekday_counts = df['created_weekday'].value_counts().reindex(weekday_order, fill_value=0)

weekday_counts

In [None]:
# Question: in which months are most vacancies published?
df['created_month'] = df['created_at'].dt.month_name()

month_order = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']

# reindex puts the months in calendar order. fill_value=0 reports a month with
# no postings as 0; without it the entry is NaN and all counts become floats.
month_counts = df['created_month'].value_counts().reindex(month_order, fill_value=0)
month_counts

In [None]:
# Bucket the minimum salary into ranges.
# The last edge is open-ended so the '500k+' label really covers everything
# above 500k; with a 1,000,000 cap, larger values fell outside every bin.
bins = [0, 50000, 100000, 150000, 200000, 500000, np.inf]
labels = ['0-50k', '50-100k', '100-150k', '150-200k', '200-500k', '500k+']

# include_lowest=True puts a salary of exactly 0 into '0-50k'; intervals are
# right-closed, so 0 would otherwise match no bin.
df['salary_min_range'] = pd.cut(
    df['salary_min'], bins=bins, labels=labels, include_lowest=True
)
salary_range_counts = df['salary_min_range'].value_counts().sort_index()
salary_range_counts

In [None]:
# Companies ranked by the mean of salary_avg over their postings.
avg_salary_by_company = (
    df.groupby('company')
    .salary_avg
    .mean()
    .sort_values(ascending=False)
)
avg_salary_by_company

In [None]:
# Bar chart of the ten most frequent job titles.
top_titles = df['title'].value_counts().head(10)

fig, ax = plt.subplots(figsize=(10, 6))
top_titles.plot.bar(ax=ax, color='skyblue')
ax.set_title("Топ-10 популярных должностей")
ax.set_ylabel("Количество вакансий")
# Slant the long title labels so they do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
plt.show()

In [None]:
!pip install wordcloud

In [None]:
# Imported here rather than at the top because the package is installed by
# the cell just above.
from wordcloud import WordCloud

# Every keyword is a multi-word phrase. Joining them into one text lets
# WordCloud split the phrases into single words and re-pair neighbours across
# phrase boundaries, so sizes no longer reflect how often a keyword occurs.
# Passing the counts directly keeps each phrase intact and sized by frequency.
keyword_counts = df['keyword'].dropna().value_counts().to_dict()
wordcloud = WordCloud(
    width=800, height=400, background_color='white'
).generate_from_frequencies(keyword_counts)

fig, ax = plt.subplots(figsize=(15, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')
ax.set_title("Наиболее часто встречающиеся навыки")
plt.show()

In [None]:
# Recomputed here so the cell does not depend on the earlier salary_avg cell.
df['salary_avg'] = df[['salary_min', 'salary_max']].mean(axis=1)

# Restrict the plot to the ten most frequent titles and to salaries up to
# 200000 so a few extreme values do not flatten the boxes.
top_titles = df['title'].value_counts().head(10).index
df_box = df[
    df['title'].isin(top_titles) &
    df['salary_avg'].notna() &
    (df['salary_avg'] <= 200000)
]

# DataFrame.boxplot(by=...) opens its own figure unless it is given an axes,
# so a preceding plt.figure(figsize=...) was left empty and its size ignored.
# Drawing on an explicit axes makes the requested size take effect.
# `vert=True` is omitted because vertical boxes are already the default.
fig, ax = plt.subplots(figsize=(25, 10))
df_box.boxplot(column='salary_avg', by='title', ax=ax)

plt.setp(ax.get_xticklabels(), rotation=45, ha='right', fontsize=10)
fig.subplots_adjust(left=0.15, right=0.95, bottom=0.35, top=0.9)

ax.set_xlabel('Job Title')
ax.set_ylabel('Average Salary')
ax.set_title('Salary Variation by Job Title')
# Remove the automatic "Boxplot grouped by title" super-title.
fig.suptitle('')
plt.show()

In [None]:
df.country.unique()

In [None]:
df

In [None]:
# Interactive map of postings: one clustered circle per job, coloured by
# search keyword, with a popup of the job details and a colour legend.

# One colour per keyword; colours repeat if there are more keywords than colours.
keywords = df['keyword'].unique()
colors = [
    "#FF0000", "#00FF00", "#0000FF", "#FFA500", "#FF00FF",
    "#00FFFF", "#FFFF00", "#8A2BE2", "#00FF7F", "#DC143C",
    "#1E90FF", "#FFD700", "#ADFF2F", "#FF4500", "#00BFFF",
    "#8B0000", "#006400", "#4B0082", "#2F4F4F", "#FF1493"
]
keyword_colors = {k: colors[i % len(colors)] for i, k in enumerate(keywords)}

# Base map
m = folium.Map(location=[20, 0], zoom_start=2, min_zoom=2, scrollWheelZoom=True)
marker_cluster = MarkerCluster().add_to(m)


def _popup_row(label, value):
    """Build one popup table row; the value is HTML-escaped.

    Titles and company names come from third-party job postings, so they must
    not be inserted into the popup markup verbatim.
    """
    return f"<tr><th>{label}</th><td>{html.escape(str(value))}</td></tr>"


# Add every posting that has coordinates
for _, row in df.iterrows():
    if pd.notna(row['latitude']) and pd.notna(row['longitude']):
        color = keyword_colors.get(row['keyword'], "#000000")

        # Parsing with errors='coerce' can leave NaT, and NaT has no strftime.
        created = row['created_at']
        if pd.notna(created):
            created_text = created.strftime('%H:%M %d/%m/%Y')
        else:
            created_text = 'Not Provided'

        popup_rows = [
            _popup_row('Company', row['company']),
            _popup_row('Title', row['title']),
            _popup_row('Category', row['category']),
            _popup_row('Keyword', row['keyword']),
            _popup_row('Country', row['country']),
            _popup_row('City', row['city']),
            _popup_row('Salary Min', row['salary_min']),
            _popup_row('Salary Max', row['salary_max']),
            _popup_row('Salary Avg', row['salary_avg']),
            _popup_row('Created At', created_text),
        ]
        popup_html = (
            '<table style="width:420px; font-size:14px;">'
            + "".join(popup_rows)
            + '</table>'
        )

        folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=7,
            color=color,
            fill=True,
            fill_opacity=0.85,
            popup=folium.Popup(popup_html, max_width=550)
        ).add_to(marker_cluster)

# Legend
legend_html = '''
<div style="
position: fixed;
bottom: 40px;
left: 40px;
width: 230px;
background-color: white;
border:2px solid grey;
z-index:9999;
font-size:12px;
max-height: 300px;
overflow-y: auto;
padding: 5px;">
<b>Keyword Colors</b><br>
'''
for k, c in keyword_colors.items():
    legend_html += f'<i style="background:{c};width:12px;height:12px;display:inline-block;margin-right:4px;"></i>{html.escape(str(k))}<br>'
legend_html += '</div>'

m.get_root().html.add_child(folium.Element(legend_html))

m

### Выводы (Insights)

1. **Наиболее востребованные вакансии:** доминируют управленческие и аналитические роли — Project Manager, Product Manager, Data Scientist, Software Engineer.
2. **Популярные навыки:** технические и управленческие компетенции — Data Analyst, Software Engineer, UX/UI Designer, Product Manager.
3. **Локации с наибольшим количеством вакансий:** крупные города и развитые страны — Sydney, Zürich, Barcelona, Melbourne; США, Франция, Германия, Канада.
4. **Уровень опыта:** старшие и специализированные позиции получают более высокие зарплаты; начальные роли встречаются реже.
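5. **Зарплаты:** средние значения — 90 000–110 000 в единицах исходных данных (зарплаты разных стран в ноутбуке не приводятся к единой валюте, поэтому трактовать это число как USD нельзя); для senior и специализированных ролей — значительно выше.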
6. **Сезонные тенденции:** пик публикаций приходится на пятницу и на октябрь–ноябрь.

**Общий вывод:** рынок вакансий сосредоточен на технических и управленческих позициях в крупных городах. Студентам рекомендуется развивать востребованные навыки и учитывать географические и сезонные тенденции при поиске работы.
