# 01 - Exploratory Data Analysis
Exploratory analysis of job postings sourced from the scraper or a baked-in sample dataset. The notebook covers the distribution of job titles, companies, and locations, the length of job descriptions, a word cloud of the description text, and the most frequently extracted skills.

In [None]:
!pip install wordcloud seaborn matplotlib

In [None]:
from pathlib import Path
import sys
import warnings

# Silence all warnings so cell output stays readable
warnings.filterwarnings('ignore')

# Locate the repository root (the directory that contains `src/`), whether the
# code runs as a script (`__file__` defined) or inside a notebook kernel.
if "__file__" in globals():
    repo_root = Path(__file__).resolve().parents[1]
else:
    repo_root = Path.cwd().resolve()

if not (repo_root / "src").exists():
    # Take the nearest ancestor that has a `src/` directory; when none does,
    # the starting directory is kept unchanged.
    repo_root = next(
        (ancestor for ancestor in repo_root.parents if (ancestor / "src").exists()),
        repo_root,
    )

# Expose the repo root on sys.path so `src.*` modules can be imported
root_entry = str(repo_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

print(f"Repo root: {repo_root}")

In [None]:
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

from src.storage.json_repository import JSONJobStore
from src.preprocessing.text_cleaner import TextCleaner
from src.preprocessing.skill_extractor import SkillExtractor

# Notebook-wide plotting defaults: seaborn "whitegrid" theme first, then the
# default figure size (order kept so the size is set after the theme).
sns.set_theme(style="whitegrid")
plt.rcParams.update({'figure.figsize': [10, 6]})

## 1. Load Data

In [None]:
# Load job records from the JSON store into a DataFrame.
# The path is anchored on repo_root so the cell works from any working directory.
data_path = repo_root / "data/processed/adzuna_data_jobs.json"
print(f"Loading data from: {data_path}")

# Record attributes copied into the frame, one column per attribute.
job_fields = [
    "job_title",
    "job_description",
    "company",
    "location",
    "posted_date",
    "source",
    "fetched_at",
]

store = JSONJobStore(json_path=data_path)
records = store.recent(limit=1000)  # upper bound on the number of records requested

if records:
    df = pd.DataFrame(
        {field: [getattr(record, field) for record in records] for field in job_fields}
    )
    print(f"Loaded {len(df)} job records.")
else:
    print("No records found in JSONJobStore.")
    # Keep the expected columns on the empty frame so that column lookups in
    # later cells do not raise KeyError; `df.empty` is still True.
    df = pd.DataFrame(columns=job_fields)

df.head()

## 2. Categorical Analysis
Analyzing the distribution of Job Titles, Companies, and Locations.

In [None]:
# Horizontal bar chart of the 20 most frequent job titles (skipped when no data was loaded)
if not df.empty:
    title_counts = df['job_title'].value_counts().head(20)

    fig, ax = plt.subplots(figsize=(12, 6))
    title_counts.plot(kind='barh', color='skyblue', ax=ax)
    ax.set_title('Top 20 Job Titles')
    ax.set_xlabel('Count')
    ax.set_ylabel('Job Title')
    ax.invert_yaxis()  # flip so the first (largest) bar is at the top
    plt.show()

In [None]:
# Horizontal bar chart of the 20 companies with the most postings (skipped when no data was loaded)
if not df.empty:
    company_counts = df['company'].value_counts().head(20)

    fig, ax = plt.subplots(figsize=(12, 6))
    company_counts.plot(kind='barh', color='salmon', ax=ax)
    ax.set_title('Top 20 Hiring Companies')
    ax.set_xlabel('Count')
    ax.set_ylabel('Company')
    ax.invert_yaxis()  # flip so the first (largest) bar is at the top
    plt.show()

In [None]:
# Horizontal bar chart of the 20 most common job locations (skipped when no data was loaded)
if not df.empty:
    location_counts = df['location'].value_counts().head(20)

    fig, ax = plt.subplots(figsize=(12, 6))
    location_counts.plot(kind='barh', color='lightgreen', ax=ax)
    ax.set_title('Top 20 Job Locations')
    ax.set_xlabel('Count')
    ax.set_ylabel('Location')
    ax.invert_yaxis()  # flip so the first (largest) bar is at the top
    plt.show()

## 3. Quantitative Analysis
Analyzing the length of job descriptions.

In [None]:
# Histogram (with KDE overlay) of job description lengths, measured in characters
if not df.empty:
    # Missing descriptions are treated as empty strings (length 0).
    # `.str.len()` is the vectorized form of applying len() row by row.
    df['desc_length'] = df['job_description'].fillna('').str.len()

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(df['desc_length'], bins=30, kde=True, color='purple', ax=ax)
    ax.set_title('Distribution of Job Description Lengths')
    ax.set_xlabel('Character Count')
    ax.set_ylabel('Frequency')
    plt.show()

## 4. Text Analysis & Word Cloud

In [None]:
# Clean the job descriptions and render their combined text as a word cloud
if not df.empty:
    cleaner = TextCleaner(stopwords={"and", "with", "the", "to", "in", "for", "of", "a", "an", "on", "is", "are"})
    # Missing descriptions are cleaned as empty strings
    df["clean_description"] = df["job_description"].fillna("").apply(cleaner.clean)

    # Concatenate every cleaned description into one corpus for the word cloud
    all_text = " ".join(df["clean_description"])

    # WordCloud.generate raises ValueError when it has no words to place, so
    # skip rendering when the corpus is blank (e.g. all descriptions missing).
    if all_text.strip():
        wordcloud = WordCloud(
            width=800,
            height=400,
            background_color="white",
            random_state=42,  # fixed seed so the layout is reproducible across runs
        ).generate(all_text)

        fig, ax = plt.subplots(figsize=(15, 8))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis("off")
        ax.set_title("Word Cloud of Job Descriptions")
        plt.show()
    else:
        print("No description text available for the word cloud.")

## 5. Skill Extraction Analysis

In [None]:
# Count skill mentions across all job descriptions and chart the 20 most frequent
if not df.empty:
    extractor = SkillExtractor()
    skill_counts = Counter()

    # Sum the per-description occurrence counts reported by the extractor;
    # rows with a missing description are skipped.
    for text in df["job_description"].dropna():
        for match in extractor.extract(text):
            skill_counts[match.skill] += match.occurrences

    # most_common() yields (skill, count) pairs; naming the columns explicitly
    # keeps the schema stable even when nothing was extracted.
    skill_df = pd.DataFrame(skill_counts.most_common(20), columns=["skill", "count"])

    if not skill_df.empty:
        fig, ax = plt.subplots(figsize=(12, 6))
        # seaborn 0.13 deprecates `palette` without `hue`; assign the
        # categorical variable to `hue` to keep per-bar colors. `dodge=False`
        # keeps full-width bars on seaborn versions that would otherwise
        # offset bars per hue level.
        sns.barplot(
            data=skill_df,
            x='count',
            y='skill',
            hue='skill',
            palette='viridis',
            dodge=False,
            ax=ax,
        )
        # The legend would only repeat the y-axis labels; drop it if one was drawn.
        legend = ax.get_legend()
        if legend is not None:
            legend.remove()
        ax.set_title('Top 20 Extracted Skills')
        ax.set_xlabel('Frequency')
        ax.set_ylabel('Skill')
        plt.show()
    else:
        print("No skills extracted.")