In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string

## 1. Load and Explore Data

In [None]:
# Read the raw job postings CSV into a DataFrame
df = pd.read_csv('../data/job_descriptions.csv')

# Quick overview: dimensions, column names, then a preview of the first rows
print("Dataset shape:", df.shape)
print("\nColumns:", df.columns.to_list())
print("\nFirst few rows:")
df.head(5)

## 2. Basic Data Analysis

In [None]:
# Count the null entries in every column
print("Missing values per column:")
df.isna().sum()

In [None]:
# Bar chart of the ten most frequent job titles
fig, ax = plt.subplots(figsize=(12, 6))
top_titles = df['Job Title'].value_counts().head(10)
top_titles.plot(kind='bar', ax=ax)
ax.set_title('Top 10 Job Titles')
# Tilt the category labels so long titles are readable
plt.xticks(rotation=45)
plt.show()

## 3. Text Preprocessing

In [None]:
# Download required NLTK data
import nltk

# Resources used later in this notebook:
#   punkt / punkt_tab -> word_tokenize (recent NLTK releases, 3.8.2+, look up
#                        'punkt_tab'; older releases look up 'punkt')
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
# Requesting both tokenizer resources keeps the notebook runnable on either
# NLTK generation. nltk.download() reports an unknown resource name by
# printing an error and returning False rather than raising, so the extra
# request is harmless on an older install.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

In [None]:
# Job-specific stopwords to drop alongside the standard English list
job_stopwords = {'job', 'position', 'role', 'looking', 'seeking', 'hiring', 'company', 'work', 'experience'}

# Combined stopword set: NLTK English stopwords plus the job-specific additions
stop_words = set(stopwords.words('english')) | job_stopwords

# Lemmatizer instance used by the text-cleaning step
lemmatizer = WordNetLemmatizer()

In [None]:
# Clean text function
def clean_text(text):
    """Normalise one job description into a space-separated string of lemmas.

    Steps: lowercase, strip URLs, replace punctuation (except hyphens) with
    spaces, collapse whitespace, tokenize, drop stopwords and tokens of two
    characters or fewer, then lemmatize what remains.

    Parameters
    ----------
    text : str or any
        Raw description. Any non-string value (for example the float NaN that
        pandas uses for a missing cell, or None) is treated as missing.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces, or '' when the input is
        missing or nothing survives cleaning.
    """
    # Missing cells arrive as NaN (a float), which has no .lower(); treat any
    # non-string as "no text" instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Remove URLs (the http alternative already matches https links)
    text = re.sub(r'http\S+|www\S+', '', text)

    # Remove special characters but keep hyphens
    text = re.sub(r'[^\w\s-]', ' ', text)

    # Remove extra whitespace
    text = ' '.join(text.split())
    if not text:
        # Nothing left to tokenize
        return ''

    # Tokenize and lemmatize
    kept = []
    for token in word_tokenize(text):
        if len(token) <= 2 or token in stop_words:
            continue
        lemma = lemmatizer.lemmatize(token)
        # Check again after lemmatizing so inflected forms of a stopword
        # (e.g. "jobs" -> "job", "roles" -> "role") are removed as well.
        if lemma in stop_words:
            continue
        kept.append(lemma)

    return ' '.join(kept)

In [None]:
# Apply text cleaning
# Missing descriptions are NaN floats; replace them with '' and coerce to str
# so neither clean_text nor the preview slicing below raises on them.
descriptions = df['Job Description'].fillna('').astype(str)
df['cleaned_text'] = descriptions.apply(clean_text)

# Compare original and cleaned text
print("Original text:")
print(descriptions.iloc[0][:200], "...")
print("\nCleaned text:")
print(df['cleaned_text'].iloc[0][:200], "...")

## 4. Save Processed Data

In [None]:
# Write the DataFrame to CSV without the index column
processed_path = '../data/processed_job_descriptions.csv'
df.to_csv(processed_path, index=False)
print("Processed data saved successfully!")
print("Processed data saved successfully!")