# In [None]:
import os
import pickle
import re
import string

import pandas as pd
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from xgboost import XGBClassifier

# Define your clean_resume function here
def clean_resume(resume_text: str) -> str:
    """Normalize raw resume text before it is vectorized.

    URLs (any run beginning with ``http``, matched case-insensitively),
    hashtags, mentions, ASCII punctuation and non-ASCII characters are each
    replaced by a space; every run of whitespace is then collapsed to a
    single space.

    Args:
        resume_text: Raw resume contents.

    Returns:
        The cleaned text. Leading and trailing whitespace is collapsed to a
        single space rather than stripped.
    """
    # Remove URLs, hashtags, mentions, and punctuation. re.escape keeps
    # characters such as "]", "\\" and "^" literal inside the character class.
    resume_text = re.sub(
        r"http\S+|#\S+|@\S+|[%s]" % re.escape(string.punctuation),
        " ",
        resume_text,
        flags=re.IGNORECASE,
    )
    # Remove non-ASCII characters
    resume_text = re.sub(r"[^\x00-\x7f]", " ", resume_text)
    # Remove extra whitespace (raw string: "\s" is an invalid escape in a
    # normal string literal and triggers a warning on modern Python).
    resume_text = re.sub(r"\s+", " ", resume_text)
    return resume_text

# Load the trained model and vectorizer.
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file,
# so MODEL_PATH must only ever point at a model file you created and trust.
MODEL_PATH = "C:\\Users\\user\\Resume Classification\\trained_model.pkl"  # Path to the trained model
with open(MODEL_PATH, "rb") as f:
    # The pickle is unpacked as an (ensemble, vectorizer) pair.
    ensemble, vectorizer = pickle.load(f)

# Define input directory and categories
INPUT_DIRECTORY = "C:\\Users\\user\\Resume Classification\\input_folder"  # Path to the directory containing resumes
# NOTE(review): CATEGORIES is not referenced anywhere in the visible part of
# this script; folder names come straight from the model's predictions.
CATEGORIES = ["accountant", "banking", "sales", "teacher"]  # List of categories

# Accumulate one {"filename", "category"} record per processed resume
categorized_resumes = []

# Loop through resumes, clean, categorize, and move
for filename in os.listdir(INPUT_DIRECTORY):
    resume_path = os.path.join(INPUT_DIRECTORY, filename)
    # Only regular .txt files are treated as resumes; this also skips any
    # directory that happens to be named "*.txt", which open() cannot read.
    if not (filename.endswith(".txt") and os.path.isfile(resume_path)):
        continue

    # Decode explicitly instead of relying on the platform default encoding
    # (cp1252 on Windows), which can raise UnicodeDecodeError. Undecodable
    # bytes become U+FFFD, which clean_resume discards with all other
    # non-ASCII characters.
    with open(resume_path, "r", encoding="utf-8", errors="replace") as file:
        resume_content = file.read()
    cleaned_resume = clean_resume(resume_content)

    # The vectorizer expects an iterable of documents, hence the 1-item list
    vectorized_resume = vectorizer.transform([cleaned_resume])
    predicted_category = ensemble.predict(vectorized_resume)[0]

    # Create the category folder if it doesn't exist
    category_folder = os.path.join(INPUT_DIRECTORY, predicted_category)
    os.makedirs(category_folder, exist_ok=True)

    # Move the resume to the respective category folder
    os.rename(resume_path, os.path.join(category_folder, filename))

    # Track categorized resumes
    categorized_resumes.append({"filename": filename, "category": predicted_category})

# Write categorized resumes to CSV. Pinning the columns keeps the header row
# in the output even when no resume was processed.
categorized_resumes_df = pd.DataFrame(
    categorized_resumes, columns=["filename", "category"]
)
categorized_resumes_df.to_csv("categorized_resumes.csv", index=False)