<a href="https://colab.research.google.com/github/pandeyp84/Deep-Learning-Tutorials/blob/main/TA_Raw_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import pandas as pd
import numpy as np

# Hiring pipeline in order; every label carries its step number as a prefix.
stages = [
    "1. Resume Screening",
    "2. Initial Screening Call",
    "3. Preliminary Interviews",
    "4. Technical Interviews",
    "5. Behavioural/Panel Interviews",
    "6. HR/Final Round",
    "7. Offer",
]
# Every step before the offer is a step at which a candidate can be rejected.
rejection_stages = stages[:-1]

# Open positions keyed by position ID; the two parallel lists below are
# derived from this mapping so IDs and names cannot drift out of step.
_positions = {
    101: "Director HR",
    102: "VP Finance",
    103: "Sales Executive",
    104: "Marketing Manager",
    105: "Product Manager",
    106: "Software Engineer",
    107: "Data Scientist",
    108: "Operations Manager",
    109: "Customer Support Lead",
    110: "HR Analyst",
}
position_ids = list(_positions)
position_names = list(_positions.values())

# CV sources mapped to the probability of a candidate coming from each one
# (the probabilities sum to 1).
_source_shares = {
    "Employee Referral": 0.05,
    "Internal Job Portal": 0.10,
    "Naukri.com": 0.25,
    "LinkedIn": 0.30,
    "Indeed": 0.20,
    "Campus": 0.10,
}
sources = list(_source_shares)
source_distribution = list(_source_shares.values())

# Generate date ranges
def generate_dates(base_date, senior_position=False):
    """Build a random, strictly increasing hiring timeline starting at base_date.

    Starting from the approval date, each later milestone is placed a random
    whole number of days after the previous one, drawn from the global
    ``np.random`` state.

    Args:
        base_date: Position approval date; returned unchanged as the first item.
        senior_position: If True the approval-to-offer span is capped at
            120 days, otherwise at 90 days.

    Returns:
        A 9-tuple of dates in this order: approval, first interaction,
        resume screening, initial screening, preliminary interviews,
        technical interviews, behavioural interviews, HR/final round, offer.
    """
    # Exclusive upper bound of the random gap (in days, minimum 1) before each
    # milestone after the approval date, in timeline order.
    gap_upper_bounds = (5, 5, 5, 10, 10, 10, 5, 5)

    timeline = [base_date]
    for upper_bound in gap_upper_bounds:
        gap_days = np.random.randint(1, upper_bound)
        timeline.append(timeline[-1] + pd.DateOffset(days=gap_days))

    # Cap the overall span. With the bounds above the gaps total at most
    # 47 days, so the cap only matters if those bounds are widened.
    day_limit = 90
    if senior_position:
        day_limit = 120
    if (timeline[-1] - timeline[0]).days > day_limit:
        timeline[-1] = timeline[0] + pd.DateOffset(days=day_limit)

    return tuple(timeline)

# Generate sample data
# The seed is fixed, so the generated rows depend on the exact order of the
# np.random calls below; keep that order when editing.
np.random.seed(42)  # For reproducibility

data = []
candidate_id = 1
base_date = pd.to_datetime('2023-01-01')

# Loop invariants, built once instead of on every iteration.
senior_position_ids = {101, 102}  # Assuming Director HR and VP Finance are senior positions
pre_offer_stages = stages[:-1]
pre_offer_probabilities = [0.55, 0.15, 0.1, 0.05, 0.05, 0.1]

for pos_id, pos_name in zip(position_ids, position_names):
    num_candidates = np.random.randint(5, 15)  # 5 to 14 CVs per position
    senior_position = pos_id in senior_position_ids
    offer_made = False  # At most one offer per position

    for _ in range(num_candidates):
        gender = np.random.choice([0, 1])
        source = np.random.choice(sources, p=source_distribution)
        current_stage = np.random.choice(pre_offer_stages, p=pre_offer_probabilities)

        # 0-180 days after base_date, i.e. 1 Jan 2023 to 30 Jun 2023.
        # NOTE(review): the approval date is drawn per candidate, so candidates
        # for the same position get different "Position Approval Date" values;
        # confirm this is intended.
        approval_date = base_date + pd.DateOffset(days=np.random.randint(0, 181))
        dates = generate_dates(approval_date, senior_position)

        # The 5% draw is only made while the position has no offer yet; once
        # it succeeds, the randomly drawn stage is replaced by the offer.
        if not offer_made and np.random.rand() < 0.05:
            current_stage = "7. Offer"
            offer_made = True

        # NOTE(review): the full timeline (including "Offer Date") is stored
        # for every candidate regardless of current_stage; confirm whether
        # dates for stages not reached should be left blank.
        data.append([pos_id, pos_name, candidate_id, gender, source, current_stage] + list(dates))
        candidate_id += 1

# Convert to DataFrame
df = pd.DataFrame(data, columns=["Position ID", "Position", "Candidate ID", "Gender", "Source of CV", "Current Stage",
                                 "Position Approval Date", "Candidate First Interaction Date", "Resume Screening Date",
                                 "Initial Screening Date", "Preliminary Interviews Date", "Technical Interviews Date",
                                 "Behavioural/Panel Interviews Date", "HR/Final Round Date", "Offer Date"])

# Save to CSV (written to the current working directory)
file_path = "TA_Raw_Data_Updated_with_Stages_and_Offer_Dates.csv"
df.to_csv(file_path, index=False)
print(f"CSV file generated successfully: {file_path}")


CSV file generated successfully: TA_Raw_Data_Updated_with_Stages_and_Offer_Dates.csv
