#### Initial Descriptive Stats

1. Loading Data

Since the processed job ad data are stored in CSV files, first import the necessary libraries and read the data.

In [None]:
import pandas as pd
import os

# Set the path to your processed CSV files
data_folder = "path/to/your/processed/csv/files"  # Update this

# List all CSV files in the folder. os.listdir() returns entries in arbitrary
# order, so sort them to keep the row order of the merged frame reproducible.
csv_files = sorted(f for f in os.listdir(data_folder) if f.endswith('.csv'))

# pd.concat() fails with an opaque "No objects to concatenate" on an empty
# list; fail early with a message that points at the actual problem.
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {data_folder!r}")

# Load all CSV files into a single DataFrame
df_list = [pd.read_csv(os.path.join(data_folder, file)) for file in csv_files]
df = pd.concat(df_list, ignore_index=True)  # Merge them into one DataFrame

# Display first few rows
df.head()

2. Check Basic Info

This will give an overview of columns, data types, and missing values.

In [None]:
# Overview of columns, data types, and missing values
df.info()

3. Summary Statistics

To get a summary of numeric columns:

In [2]:
# Numeric columns. A notebook cell only renders its last expression, so a
# bare df.describe() here would be computed and silently discarded; display()
# (available by default in Jupyter/IPython kernels) shows both tables.
display(df.describe())

# For categorical columns

display(df.describe(include="object"))


NameError: name 'df' is not defined

4. Check Missing Values

See if there are missing values in your dataset:

In [None]:
# Missing values per column
df.isna().sum()

5. Distribution of Job Titles

Check the most common job titles:

In [None]:
# Top 20 job titles
title_counts = df["job_title"].value_counts()
title_counts.iloc[:20]

6. Plotting Descriptive Statistics

Creating visualizations to explore the data.

In [None]:
## Distribution of job titles
import matplotlib.pyplot as plt

# value_counts() sorts descending, but a horizontal bar chart draws the first
# row at the bottom. Reverse the top 10 so the most common title is on top.
top_titles = df["job_title"].value_counts().head(10).iloc[::-1]

top_titles.plot(kind="barh", figsize=(10,5), title="Top 10 Job Titles")
plt.xlabel("Number of Job Ads")
plt.ylabel("Job Title")
plt.show()

In [None]:
## Job postings over time
# Note: this assumes a date column named date_posted; it is converted to
# datetime first, and values that cannot be parsed become NaT.

posted_dates = pd.to_datetime(df["date_posted"], errors="coerce")
df["date_posted"] = posted_dates

# Series.hist returns the Axes it drew on; label that Axes directly
ax = df["date_posted"].hist(bins=30, figsize=(10,5))
ax.set_title("Distribution of Job Postings Over Time")
ax.set_xlabel("Date")
ax.set_ylabel("Number of Job Ads")
plt.show()

In [None]:
## Word cloud of job titles

from wordcloud import WordCloud

# Join every non-missing job title into one space-separated string
titles = df["job_title"].dropna()
text = " ".join(titles)

# generate() returns the WordCloud instance itself, so `wordcloud` is the
# populated cloud object
cloud_builder = WordCloud(width=800, height=400, background_color="white")
wordcloud = cloud_builder.generate(text)

fig, ax = plt.subplots(figsize=(10,5))
ax.imshow(wordcloud, interpolation="bilinear")
ax.axis("off")
ax.set_title("Word Cloud of Job Titles")
plt.show()

7. Check for Duplicates

In [None]:
# Count the rows flagged as duplicates
df.duplicated().sum()

In [None]:
# If needed, remove duplicates.
# This rebinds `df`, so every later cell operates on the de-duplicated frame.
df = df.drop_duplicates()

8. Correlation Between Numeric Features

In [None]:
import seaborn as sns

# Restrict to numeric columns: the frame also holds text columns such as
# job_title, and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
numeric_df = df.select_dtypes(include="number")

plt.figure(figsize=(8,6))
sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Correlation Matrix of Numeric Features")
plt.show()

#### Some more specific descriptive stats

1. Load Data & Preprocess

Load the dataset. This section assumes a single CSV file with a column for job descriptions (description) and a column for the year (year).

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
from wordcloud import WordCloud
from textstat import flesch_reading_ease

# Download NLTK resources.
# word_tokenize loads 'punkt_tab' in NLTK >= 3.9 (it replaced the pickled
# 'punkt' models), so request both to keep older and newer installs working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load dataset (adjust file path as needed)
df = pd.read_csv("your_job_ads.csv")  

# Ensure 'year' is a numeric column (unparseable values become NaN)
df['year'] = pd.to_numeric(df['year'], errors='coerce')

# Drop rows with missing descriptions
df = df.dropna(subset=['description'])

Matplotlib is building the font cache; this may take a moment.


ModuleNotFoundError: No module named 'seaborn'

2. Word Count Distribution

In [None]:
# Create word count column (number of tokens produced by word_tokenize)
df['word_count'] = df['description'].apply(lambda x: len(word_tokenize(x)))

# Summary statistics
print(df.groupby('year')['word_count'].describe())

# Plot histogram
plt.figure(figsize=(10, 5))
ax = sns.histplot(data=df, x='word_count', hue='year', bins=30, kde=True)
plt.title("Word Count Distribution by Year")
plt.xlabel("Word Count")
plt.ylabel("Frequency")

# seaborn already draws the hue legend. Calling plt.legend() would replace it
# with an empty one because the bars carry no labels, so only retitle it.
hue_legend = ax.get_legend()
if hue_legend is not None:
    hue_legend.set_title("Year")
plt.show()

3. Lexical Diversity

Lexical diversity is the ratio of unique words to total words in a text.

In [None]:
def lexical_diversity(text):
    """Return the ratio of unique words to total words in ``text``.

    Tokens are lower-cased, and tokens that contain no letter or digit
    (commas, full stops, quotes, ...) are dropped, so punctuation is not
    counted as a word and "Data"/"data" count as the same word.

    Returns 0 when the text contains no word tokens.
    """
    words = [
        token.lower()
        for token in word_tokenize(text)
        if any(ch.isalnum() for ch in token)
    ]
    if not words:
        return 0
    return len(set(words)) / len(words)

# Score every description
df['lexical_diversity'] = df['description'].apply(lexical_diversity)

# Summary statistics per year
diversity_by_year = df.groupby('year')['lexical_diversity']
print(diversity_by_year.describe())

# Boxplot of lexical diversity, one box per year
fig, ax = plt.subplots(figsize=(8, 5))
sns.boxplot(data=df, x='year', y='lexical_diversity', ax=ax)
ax.set_title("Lexical Diversity by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Lexical Diversity (Unique Words / Total Words)")
plt.show()

4. Readability Scores (Flesch Reading Ease)

A lower score means the text is harder to read, while a higher score means it is easier to read.

In [None]:
# Flesch Reading Ease score for every description.
# NOTE(review): textstat applies the formula for its current language setting;
# confirm that setting suits the language of the job ads.
df['readability'] = df['description'].apply(flesch_reading_ease)

# Summary statistics
print(df.groupby('year')['readability'].describe())

# Boxplot of readability scores
plt.figure(figsize=(8, 5))
sns.boxplot(data=df, x='year', y='readability')
plt.title("Readability Scores by Year")
plt.xlabel("Year")
# The metric computed above is Flesch Reading Ease, not the Flesch-Kincaid
# grade level, so label the axis accordingly.
plt.ylabel("Flesch Reading Ease Score")
plt.show()

5. Most Frequent Words

In [None]:
# Stop words excluded from the frequency counts and the word cloud in this cell
stop_words = set(stopwords.words('danish'))  # Change to 'english' if needed

def get_most_common_words(texts, n=20):
    """Return the ``n`` most frequent words across ``texts`` as (word, count) pairs.

    Only alphanumeric tokens are counted. Each token is lower-cased and
    skipped when it appears in the module-level ``stop_words`` set.
    """
    counts = Counter()
    for text in texts:
        for token in word_tokenize(text):
            lowered = token.lower()
            if token.isalnum() and lowered not in stop_words:
                counts[lowered] += 1
    return counts.most_common(n)

# Get top words for each year, in chronological order.
# 'year' may contain NaN (e.g. values that could not be parsed as numbers);
# NaN never compares equal to itself, so it would only produce an empty,
# misleading entry. Drop it before iterating.
for year in sorted(df['year'].dropna().unique()):
    print(f"\nMost common words in {year}:")
    print(get_most_common_words(df[df['year'] == year]['description']))

# Word cloud for recent job ads (2024/2025)
recent_texts = ' '.join(df[df['year'] >= 2024]['description'])

# WordCloud.generate() raises ValueError when given no words, which happens
# if the dataset has no ads from 2024 onwards; skip the plot in that case.
if recent_texts.strip():
    wordcloud = WordCloud(stopwords=stop_words, background_color='white', width=800, height=400).generate(recent_texts)

    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.title("Word Cloud for Recent Job Ads (2024/2025)")
    plt.show()
else:
    print("No job ads from 2024 or later; skipping the word cloud.")