In [1]:
import pandas as pd
import random

In [2]:
# 1. Load your dataset
df = pd.read_csv("UnprocessedDataSet.csv")


# Normalise the headers (trim surrounding whitespace, lower-case) so the
# required-column checks below do not depend on the CSV's exact spelling.
df.columns = df.columns.map(lambda header: header.strip().lower())

# Fail fast if a required column is absent after normalisation.
for required_column, error_message in (
    ('category', "Dataset must have a 'Category' column."),
    ('resume', "Dataset must have a 'Resume' column."),
):
    if required_column not in df.columns:
        raise ValueError(error_message)

In [3]:
# 2. Create simple job descriptions per category (you can edit these)
# Maps a job title to a one-sentence description of the role. Entries are
# listed as (title, description) pairs; insertion order is preserved.
job_descriptions = dict([
    (
        'Data Science',
        "Responsible for building and deploying machine learning models using Python, data analysis, and AI techniques.",
    ),
    (
        'HR',
        "Handles recruitment, employee relations, payroll, and training processes within an organization.",
    ),
    (
        'Advocate',
        "Represents clients in legal proceedings, drafts legal documents, and provides advice on legal rights.",
    ),
    (
        'Arts',
        "Creates, designs, and presents artistic works such as illustrations, paintings, or digital media.",
    ),
    (
        'Web Designing',
        "Designs and develops user-friendly websites with HTML, CSS, and JavaScript.",
    ),
    (
        'Mechanical Engineer',
        "Designs and maintains mechanical systems, equipment, and manufacturing processes.",
    ),
    (
        'Sales',
        "Manages client relationships, generates leads, and meets sales targets.",
    ),
    (
        'Health and fitness',
        "Provides physical training, nutrition advice, and fitness programs for individuals.",
    ),
    (
        'Finance',
        "Analyzes financial data, prepares reports, and manages investment strategies.",
    ),
    (
        'Software Engineer',
        "Develops, tests, and maintains software applications and systems.",
    ),
])

In [4]:
# 3. Create job–resume pairs
#
# For every job title, each resume whose category equals the title
# (case-insensitively) becomes a positive pair (label 1), and a random sample
# of resumes from all other categories becomes negative pairs (label 0).
# `pairs` ends up as a list of dicts with the keys
# job_title / job_description / resume_text / label.
pairs = []

# Loop-invariant work: lower-case the category column once, not per job title.
category_lower = df['category'].str.lower()
# Cap on negatives per job: an even share of the dataset across job titles.
# max(..., 1) avoids a ZeroDivisionError when no job descriptions are defined.
max_negatives_per_job = len(df) // max(len(job_descriptions), 1)
# Job titles that matched no resume category, reported after the loop.
unmatched_titles = []

for job_title, job_desc in job_descriptions.items():
    is_match = category_lower == job_title.lower()

    # Positive samples (qualified)
    qualified = df[is_match]
    if qualified.empty:
        # With no positives the negative sample size is 0 as well, so this
        # title contributes no pairs; remember it instead of skipping silently.
        unmatched_titles.append(job_title)
        continue

    # Negative samples (unqualified) — sample from other categories.
    # The sample size is additionally capped at the pool size, because
    # DataFrame.sample raises ValueError when asked for more rows than exist.
    unqualified_pool = df[~is_match]
    n_negatives = min(len(qualified), max_negatives_per_job, len(unqualified_pool))
    unqualified = unqualified_pool.sample(n=n_negatives, random_state=42)

    # Positives first, then negatives, each in frame order.
    for label, resumes in ((1, qualified['resume']), (0, unqualified['resume'])):
        pairs.extend(
            {
                'job_title': job_title,
                'job_description': job_desc,
                'resume_text': resume_text,
                'label': label,
            }
            for resume_text in resumes
        )

if unmatched_titles:
    print(f"Warning: no resumes matched these job titles, so they produced no pairs: {unmatched_titles}")


In [5]:
# 4. Create final dataframe
# Pin the column schema explicitly: an empty `pairs` list would otherwise give
# a frame with no columns, and the later `final_df['job_title']` lookup would
# raise KeyError. For non-empty input the resulting frame is unchanged.
final_df = pd.DataFrame(
    pairs,
    columns=['job_title', 'job_description', 'resume_text', 'label'],
)
# frac=1 draws every row in random order (a shuffle); the fixed seed keeps the
# order reproducible, and reset_index discards the pre-shuffle row numbers.
final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True)  # shuffle


In [6]:
# 5. Save output
# Write the shuffled pairs to disk without the positional index column.
final_df.to_csv("ProcessedDataSet.csv", index=False)

# Summarise what was written, then preview the first rows.
pair_count = len(final_df)
role_count = final_df['job_title'].nunique()
print(f"✅ Created dataset with {pair_count} pairs and {role_count} job roles.")
print(final_df.head())

✅ Created dataset with 590 pairs and 8 job roles.
       job_title                                    job_description  \
0          Sales  Manages client relationships, generates leads,...   
1  Web Designing  Designs and develops user-friendly websites wi...   
2          Sales  Manages client relationships, generates leads,...   
3  Web Designing  Designs and develops user-friendly websites wi...   
4           Arts  Creates, designs, and presents artistic works ...   

                                         resume_text  label  
0  I.T. Skills â¢ Windows XP, Ms Office (Word, E...      0  
1  IT SKILLS Languages: C (Basic), JAVA (Basic) W...      1  
2  Education Details \r\n\r\nHadoop Developer \r\...      0  
3  Education Details \r\nFebruary 2006 to Februar...      0  
4  Additional qualifications April 2000, Web Desi...      1  
