In [None]:
# Combined TF-IDF + Semantic Similarity + Random Forest Classifier

In [None]:

# Required Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


In [None]:

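# Load your dataset here (replace with actual data loading logic), e.g.
#   train_data = pd.read_csv('train.csv')
#   test_data = pd.read_csv('test.csv')
# or load it from HuggingFace / reuse an existing DataFrame.

# NOTE: nothing in this notebook defines `train_data` or `test_data`. Both must be created in this
# cell as pandas DataFrames containing 'resume_text', 'job_description_text', and 'label' columns,
# otherwise every cell below fails with a NameError on a fresh kernel.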


In [None]:

# Preprocessing
def preprocess_text(text):
    """Lower-case a text value and strip leading/trailing whitespace.

    Parameters
    ----------
    text : str or any scalar
        The raw cell value. Missing values (None, NaN, pd.NA) are accepted
        because pandas represents empty text cells that way.

    Returns
    -------
    str
        The normalised text. Missing values yield "", and other non-string
        values are converted with ``str()`` before normalising.
    """
    if isinstance(text, str):
        return text.lower().strip()
    # A missing cell has no .lower(); map it to an empty document instead of
    # raising AttributeError in the middle of Series.apply.
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        return ""
    return str(text).lower().strip()

# Normalise both text columns of both splits; each column is overwritten with
# its preprocessed version (train first, then test).
for frame in (train_data, test_data):
    for column in ('resume_text', 'job_description_text'):
        frame[column] = frame[column].apply(preprocess_text)


In [None]:

# TF-IDF Features
def _join_pair_text(frame):
    """Return, per row of `frame`, the resume text and job description joined by one space (as a list)."""
    joined = frame['resume_text'] + " " + frame['job_description_text']
    return joined.tolist()


train_combined_text = _join_pair_text(train_data)
test_combined_text = _join_pair_text(test_data)

# The vocabulary is learned from the training split only; the test split is
# merely projected onto it.
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf_train = vectorizer.fit_transform(train_combined_text)
X_tfidf_test = vectorizer.transform(test_combined_text)


In [None]:

# Semantic Similarity Features
embedder = SentenceTransformer('all-MiniLM-L6-v2')
resume_embeds = embedder.encode(test_data['resume_text'].tolist(), convert_to_tensor=False)
jd_embeds = embedder.encode(test_data['job_description_text'].tolist(), convert_to_tensor=False)
similarity_scores = np.diag(cosine_similarity(resume_embeds, jd_embeds)).reshape(-1, 1)


In [None]:

# Label encoding
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_data['label'])
y_test = label_encoder.transform(test_data['label'])


In [None]:

# Combine TF-IDF + semantic similarity
X_combined_train = np.hstack([X_tfidf_train.toarray(), np.zeros((X_tfidf_train.shape[0], 1))])  # Pad training
X_combined_test = np.hstack([X_tfidf_test.toarray(), similarity_scores])  # Add similarity score to test set


In [None]:

# Train Random Forest Classifier
# Fit the forest on the combined training features (seeded via random_state).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_combined_train, y_train)

# Evaluate
y_pred = rf_model.predict(X_combined_test)

# Report over every class known to the encoder, not just those that happen to
# occur in the test split; otherwise the number of target names can disagree
# with the number of labels found. LabelEncoder encodes classes as
# 0..n_classes-1, in the order of `classes_`.
class_ids = np.arange(len(label_encoder.classes_))
# classification_report takes len() of each name, so non-string labels
# (e.g. integer 0/1) must be converted to str first.
class_names = [str(name) for name in label_encoder.classes_]

print(classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=class_ids))
