In [2]:
import pandas as pd
import re
import os
import zipfile
import json

# --- Configuration ---
# You will need to set up your Kaggle API key to download datasets.
# 1. Go to Kaggle.com -> Your Profile -> Account -> Create New API Token.
# 2. This will download a kaggle.json file.
# 3. Place this file in ~/.kaggle/ (Linux/macOS) or C:\Users\<Windows-username>\.kaggle\ (Windows).
#    Alternatively, set KAGGLE_USERNAME and KAGGLE_KEY environment variables.
# The script always attempts the download with `kaggle datasets download ...`.
# If that fails (e.g. the Kaggle API is not configured or the environment is
# restricted), it falls back to a small dummy CSV so the rest of the script can still run.

# Kaggle dataset identifier; passed to `kaggle datasets download -d` and used to
# build the manual-download URL printed when the download fails.
DATASET_NAME = "shamimhasan8/resume-vs-job-description-matching-dataset"
# Directory (relative to the working directory) that holds the downloaded archive,
# the raw CSV, and the cleaned CSV written by main().
DOWNLOAD_PATH = "./data"
# File name of the raw CSV that main() expects to find inside DOWNLOAD_PATH.
CSV_FILE_NAME = "resume_job_matching_dataset.csv"

# --- Helper Functions ---

def download_kaggle_dataset(dataset_name, path):
    """
    Download a Kaggle dataset with the Kaggle CLI and unzip it into ``path``.

    Assumes the 'kaggle' package is installed and the Kaggle API is configured
    (kaggle.json or the KAGGLE_USERNAME / KAGGLE_KEY environment variables).

    Args:
        dataset_name: Kaggle dataset identifier, e.g. "owner/dataset-slug".
        path: Directory to download into; created if it does not exist.

    Returns:
        True if the download command succeeded and an archive was extracted,
        False otherwise. Failures are reported on stdout, not raised.
    """
    import subprocess  # local import: only this function needs it

    os.makedirs(path, exist_ok=True)
    print(f"Attempting to download {dataset_name} to {path}...")

    # Pass the arguments as a list (no shell) so spaces or shell metacharacters
    # in dataset_name/path cannot break or alter the command.
    command = ["kaggle", "datasets", "download", "-d", dataset_name, "-p", path]
    try:
        completed = subprocess.run(command, check=False)
    except OSError as e:
        # Raised e.g. when the 'kaggle' executable is not on PATH.
        print(f"Error downloading Kaggle dataset: {e}")
        print("Please ensure you have the 'kaggle' package installed (`pip install kaggle`)")
        print("and your Kaggle API key is configured (see comments in script).")
        return False

    if completed.returncode != 0:
        # Do not fall through to unzipping: any archive present now would be a
        # leftover from an earlier run, not the result of this download.
        print(f"Error downloading Kaggle dataset: kaggle exited with status {completed.returncode}")
        print("Please ensure you have the 'kaggle' package installed (`pip install kaggle`)")
        print("and your Kaggle API key is configured (see comments in script).")
        return False

    print("Download command executed. Checking for zip file...")
    try:
        zip_files = [f for f in os.listdir(path) if f.endswith('.zip')]
        if not zip_files:
            print("No zip file found after download command. Please ensure Kaggle API is configured correctly.")
            return False

        zip_path = os.path.join(path, zip_files[0])
        print(f"Found zip file: {zip_path}. Unzipping...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(path)
        print("Dataset unzipped successfully.")
        os.remove(zip_path)  # Clean up the zip file
        return True
    except (OSError, zipfile.BadZipFile, RuntimeError) as e:
        # BadZipFile: truncated/corrupt archive; RuntimeError: e.g. encrypted
        # or unsupported archive members.
        print(f"Error downloading Kaggle dataset: {e}")
        return False

# Everything except lowercase letters, digits, whitespace and basic punctuation.
_DISALLOWED_CHARS_RE = re.compile(r'[^a-z0-9\s.,;\'"!?]')
# Any run of whitespace (spaces, tabs, newlines).
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def clean_text(text):
    """
    Performs basic text cleaning:
    - Converts to lowercase
    - Removes special characters (keeping a-z, 0-9 and the punctuation . , ; ' " ! ?)
    - Collapses runs of whitespace to a single space and strips both ends

    Args:
        text: The value to clean. Anything that is not a ``str`` (e.g. NaN
            from a missing CSV cell, or None) is treated as empty.

    Returns:
        The cleaned string, or "" for non-string input.
    """
    if not isinstance(text, str):
        return ""
    text = text.lower()  # Lowercase first: the character filter only allows a-z
    # You might want to refine this regex based on what characters are truly
    # irrelevant for your NLP task (note that e.g. "c++" and "c#" both become "c").
    text = _DISALLOWED_CHARS_RE.sub('', text)
    # Normalize whitespace last: removing a character that sat between or next
    # to spaces (e.g. the "&" in "python & sql") would otherwise leave double
    # or leading/trailing spaces behind.
    return _WHITESPACE_RUN_RE.sub(' ', text).strip()

# --- Main Script ---

def _write_dummy_dataset(csv_path):
    """Write a three-row placeholder dataset to ``csv_path`` for demonstration runs."""
    dummy_data = {
        'job_description': [
            "We are looking for a highly motivated Software Engineer with strong Python skills and experience in machine learning.",
            "Seeking a Data Scientist with expertise in statistical modeling, R, and data visualization. PhD preferred.",
            "Junior Developer position, requiring basic understanding of web development (HTML, CSS, JS) and problem-solving skills."
        ],
        'resume': [
            "Experienced Python developer with a background in deep learning projects and strong coding abilities.",
            "Statistician with a Master's degree, proficient in R and data analysis, created interactive dashboards.",
            "Recent graduate eager to learn, completed a bootcamp in front-end technologies including HTML and CSS."
        ],
        'match_score': [5, 4, 3]
    }
    # Do not rely on the download step having created the directory.
    os.makedirs(os.path.dirname(csv_path) or ".", exist_ok=True)
    pd.DataFrame(dummy_data).to_csv(csv_path, index=False)


def main():
    """
    Load the resume/job-description dataset, clean its text columns and save the result.

    Steps:
    1. If the raw CSV is not in DOWNLOAD_PATH, try to download it from Kaggle;
       if that fails, write a small dummy CSV in its place so the script can
       still run. NOTE: the dummy file uses the real dataset's file name, so
       later runs will load it as "existing data" until it is replaced.
    2. Load the CSV and add 'job_description_cleaned' and 'resume_cleaned'
       columns produced by clean_text().
    3. Save the DataFrame to 'cleaned_resume_job_data.csv' in DOWNLOAD_PATH.

    Errors are reported on stdout; nothing is returned or raised.
    """
    csv_path = os.path.join(DOWNLOAD_PATH, CSV_FILE_NAME)
    cleaned_path = os.path.join(DOWNLOAD_PATH, "cleaned_resume_job_data.csv")

    if os.path.exists(csv_path):
        print(f"'{CSV_FILE_NAME}' found. Loading existing data.")
    else:
        print(f"'{CSV_FILE_NAME}' not found. Attempting to download from Kaggle...")
        success = download_kaggle_dataset(DATASET_NAME, DOWNLOAD_PATH)
        if not success:
            print("Failed to download dataset. Please download it manually and place it in the 'data' folder.")
            print(f"You can download from: https://www.kaggle.com/datasets/{DATASET_NAME}")
            print(f"Ensure the CSV file is named '{CSV_FILE_NAME}' inside the '{DOWNLOAD_PATH}' directory.")
            # For demonstration purposes, create a dummy file if download fails
            print("Creating a dummy CSV for demonstration if download fails...")
            _write_dummy_dataset(csv_path)
            print("Dummy CSV created. Proceeding with dummy data.")

    # Top-level boundary of the script: report any failure instead of a traceback.
    try:
        # Load the dataset
        df = pd.read_csv(csv_path)
        print("\nOriginal Data Head:")
        print(df.head())
        print(f"\nOriginal Data Shape: {df.shape}")

        # Fail with a clear message rather than a bare KeyError further down.
        required_columns = ['job_description', 'resume', 'match_score']
        missing = [col for col in required_columns if col not in df.columns]
        if missing:
            print(f"Error: '{csv_path}' is missing required column(s): {', '.join(missing)}")
            return

        # Apply text cleaning
        print("\nCleaning text data...")
        df['job_description_cleaned'] = df['job_description'].apply(clean_text)
        df['resume_cleaned'] = df['resume'].apply(clean_text)

        print("\nCleaned Data Head:")
        print(df[['job_description_cleaned', 'resume_cleaned', 'match_score']].head())

        print("\nData cleaning complete. You now have 'job_description_cleaned' and 'resume_cleaned' columns.")
        print("This DataFrame (df) can now be used for BERT embedding generation.")

        # Save the cleaned data for future use (optional)
        df.to_csv(cleaned_path, index=False)
        print(f"\nCleaned data saved to {cleaned_path}")

    except FileNotFoundError:
        print(f"Error: The file '{csv_path}' was not found. Please ensure it's in the correct directory.")
    except Exception as e:
        print(f"An error occurred: {e}")

# Run the pipeline only when executed as a script (or notebook cell), not on import.
if __name__ == "__main__":
    main()


'resume_job_matching_dataset.csv' found. Loading existing data.

Original Data Head:
                                     job_description  \
0  Data Analyst needed with experience in SQL, Ex...   
1  Data Scientist needed with experience in Stati...   
2  Software Engineer needed with experience in Sy...   
3  ML Engineer needed with experience in Python, ...   
4  Software Engineer needed with experience in RE...   

                                              resume  match_score  
0  Experienced professional skilled in SQL, Power...            4  
1  Experienced professional skilled in Python, De...            4  
2  Experienced professional skilled in wait, Git,...            5  
3  Experienced professional skilled in return, De...            4  
4  Experienced professional skilled in REST APIs,...            5  

Original Data Shape: (10000, 3)

Cleaning text data...

Cleaned Data Head:
                             job_description_cleaned  \
0  data analyst needed with experience