### Data Exploration - Adzuna Jobs

### Objective
Understanding the job posting data structure, distributions, and quality.


In [None]:
!pip3 install matplotlib

In [None]:
import json
import pandas as pd
from collections import Counter
from pathlib import Path
import matplotlib.pyplot as plt

## Load Data

In [None]:
# Read the processed postings file as UTF-8 text, parse the JSON payload,
# and wrap the result in a DataFrame for the cells that follow.
data_path = Path("../data/processed/adzuna_data_jobs.json")
jobs_data = json.loads(data_path.read_text(encoding='utf-8'))

df = pd.DataFrame(jobs_data)
# Show which fields each posting carries.
print(f"\nColumns: {list(df.columns)}")

## Basic Statistics

In [None]:


# Peek at the first row to see what a single posting looks like.
sample = df.iloc[0]
print(
    f"Title: {sample['job_title']}",
    f"Company: {sample['company']}",
    f"Location: {sample['location']}",
    f"Skills: {sample['skills']}",
    f"Tags: {sample['tags']}",
    f"Role Type: {sample['role_type']}",
    sep="\n",
)

In [None]:
# Report, per column, how many entries are null.
print("Missing values:", df.isna().sum(), sep="\n")


In [None]:

# Number of distinct companies and locations in the dataset.
print(
    f"\nTotal unique companies: {df['company'].nunique()}",
    f"Total unique locations: {df['location'].nunique()}",
    sep="\n",
)

## Distribution Analysis

In [None]:
# Tally postings per role type; the result is reused by later cells.
role_counts = df['role_type'].value_counts()

# Draw the bar chart on an explicit Figure/Axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
role_counts.plot.bar(ax=ax)
ax.set_title('Jobs by Role Type')
ax.set_xlabel('Role')
ax.set_ylabel('Count')
# Tilt the category labels so longer role names stay readable.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()



In [None]:
# Text version of the role chart: a header followed by one indented
# "role: count" line per role type.
print(
    "\nRole distribution:",
    *(f"  {role}: {count}" for role, count in role_counts.items()),
    sep="\n",
)

In [None]:
# Count postings per experience level; dropna=False keeps a bucket for
# postings whose level is missing.
exp_counts = df['experience_level'].value_counts(dropna=False)

print("Experience levels:")
# Walk the labels and their counts side by side.
for level, count in zip(exp_counts.index, exp_counts.tolist()):
    print(f"  {level}: {count}")

In [None]:
# Top locations
# Keep the first ten rows of the per-location tally; the summary cell reads
# location_counts.index[0] as the location with the most jobs.
location_counts = df['location'].value_counts().head(10)

plt.figure(figsize=(10, 6))
location_counts.plot(kind='barh')
plt.title('Top 10 Locations')
plt.xlabel('Number of Jobs')
# A horizontal bar chart draws the first row at the bottom; flip the y-axis
# so the busiest location sits at the top, as the top-skills chart does.
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

## Skills Analysis

In [None]:
# Flatten the per-posting skill lists into one list of skill mentions.
# Rows whose 'skills' value is not a list are skipped.
all_skills = [
    skill
    for job_skills in df['skills']
    if isinstance(job_skills, list)
    for skill in job_skills
]

print(f"Total skill mentions: {len(all_skills)}")
print(f"Unique skills: {len(set(all_skills))}")


In [None]:

# Tally how often each skill is mentioned and keep the 20 most frequent.
skill_freq = Counter(all_skills)
top_skills = skill_freq.most_common(20)

# The percentage column is mentions relative to the number of postings.
job_total = len(df)

print("\nTop 20 skills:")
for i, (skill, count) in enumerate(top_skills, start=1):
    pct = (count / job_total) * 100
    print(f"{i:2d}. {skill:25s}: {count:3d} ({pct:.1f}%)")

In [None]:
# Split the (skill, count) pairs into parallel label and value lists.
skills_names = [pair[0] for pair in top_skills]
skills_counts = [pair[1] for pair in top_skills]

# Horizontal bar chart drawn on an explicit Figure/Axes pair.
fig, ax = plt.subplots(figsize=(12, 6))
ax.barh(skills_names, skills_counts)
ax.set_xlabel('Number of Jobs')
ax.set_title('Top 20 Most Demanded Skills')
# Flip the y-axis so the first-listed (most frequent) skill is at the top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

## Description Analysis

In [None]:
# Check description lengths
# .str.len() yields NaN for postings without a description, which turns the
# column into floats. Keep the raw column on df, but summarise and plot only
# the postings that actually have a description.
df['desc_length'] = df['job_description'].str.len()
desc_lengths = df['desc_length'].dropna()

print("Description statistics:")
print(f"  Average: {desc_lengths.mean():.0f} characters")
print(f"  Median: {desc_lengths.median():.0f} characters")
# Format min/max like the figures above so they never print as e.g. "123.0"
# when the column has been promoted to float.
print(f"  Min: {desc_lengths.min():.0f}")
print(f"  Max: {desc_lengths.max():.0f}")

# Plot distribution (missing descriptions excluded)
plt.figure(figsize=(10, 6))
plt.hist(desc_lengths, bins=30, edgecolor='black')
plt.xlabel('Description Length (characters)')
plt.ylabel('Frequency')
plt.title('Distribution of Job Description Lengths')
plt.tight_layout()
plt.show()

## Key Findings

In [None]:

# Recap of the headline numbers computed in the cells above. Every finding
# is numbered so the list reads consistently; the top-skills entries are
# nested under item 3.
print(f"\n1. Dataset size: {len(df)} jobs")
print(f"2. Most common role: {role_counts.index[0]} ({role_counts.values[0]} jobs)")

# Plain string: there is nothing to interpolate on this line.
print("3. Top 3 skills:")
for i, (skill, count) in enumerate(top_skills[:3], 1):
    print(f"   {i}. {skill}: {count}")
print(f"4. Most jobs in: {location_counts.index[0]} ({location_counts.values[0]} jobs)")
print(f"5. Average skills per job: {len(all_skills) / len(df):.1f}")
