In [None]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta
import random

In [None]:
# Seed NumPy's global RNG once so the np.random draws in later cells repeat
SEED = 42
np.random.seed(SEED)

In [None]:
# Read the unedited hiring export from the working directory and show its first rows
RAW_CSV = 'hiring_data_unedited.csv'
df = pd.read_csv(RAW_CSV)
df.head(n=5)

Unnamed: 0,ID,FP,BU Region,Approved,On hold,Sourcing start,Interview start,Interview end,Offered,Filled,Status
0,2304,F,96,2014-06-25,,2014-07-02 00:00:00,2014-07-18 00:00:00,2014-08-06 00:00:00,2014-08-14 00:00:00,2014-08-23 00:00:00,Filled
1,14263,F,10,2014-06-24,,2014-06-30 00:00:00,2014-07-12 00:00:00,2014-08-07 00:00:00,2014-08-12 00:00:00,2014-08-21 00:00:00,Filled
2,5394,P,7,2014-07-30,,2014-08-07 00:00:00,2014-08-19 00:00:00,,,,Interview start
3,16251,F,8,2014-07-27,,2014-08-02 00:00:00,2014-08-21 00:00:00,,,,Interview start
4,1537,P,22,2014-08-21,,2014-08-28 00:00:00,,,,,Sourcing start


In [None]:
# Preview 10 random rows of the raw data.
# Without random_state, DataFrame.sample draws from the global NumPy RNG that
# was seeded above, which would advance that stream and make every randomly
# generated column in later cells depend on whether (and how often) this
# preview cell was executed. A dedicated random_state keeps the preview
# independent of the global stream.
df.sample(10, random_state=42)

Unnamed: 0,ID,FP,BU Region,Approved,On hold,Sourcing start,Interview start,Interview end,Offered,Filled,Status
151,19040,F,24,2014-07-25,,2014-08-01 00:00:00,2014-08-17 00:00:00,,,,Interview start
807,16302,P,98,2014-08-21,,2014-08-28 00:00:00,,,,,Sourcing start
621,18011,P,20,2014-08-01,,2014-08-07 00:00:00,2014-08-17 00:00:00,,,,Interview start
3978,11384,F,7,2014-08-21,,2014-08-27 00:00:00,,,,,Sourcing start
3998,1417,P,7,2014-08-09,,2014-08-14 00:00:00,2014-08-29 00:00:00,,,,Interview start
1747,12495,P,3,2014-06-03,,2014-06-08 00:00:00,2014-06-22 00:00:00,2014-07-21 00:00:00,2014-07-30 00:00:00,2014-08-05 00:00:00,Filled
555,9350,P,3,2014-06-28,,2014-07-04 00:00:00,2014-07-16 00:00:00,2014-08-05 00:00:00,2014-08-12 00:00:00,2014-08-20 00:00:00,Filled
4545,18578,P,99,2014-08-29,,,,,,,Approved
1536,11179,P,17,2014-08-17,,2014-08-24 00:00:00,,,,,Sourcing start
4111,4080,P,24,2014-03-17,2014-04-04 00:00:00,,,,,,On hold


In [None]:
# Total number of elements in the frame (rows * columns), not the row count
df.size

54417

In [None]:
# Label every row as Technical or Non-Technical with a 46% / 54% weighted draw
role_labels = ['Technical', 'Non-Technical']
role_weights = [0.46, 0.54]
df['Role Type'] = np.random.choice(role_labels, size=len(df), p=role_weights)

In [None]:
# Create the 'Recruiting Channel' column as all-missing.
# It is created with object dtype on purpose: a plain `= np.nan` would make a
# float64 column, and writing string labels into it later makes pandas emit an
# incompatible-dtype FutureWarning (behaviour that is deprecated).
df['Recruiting Channel'] = pd.Series(np.nan, index=df.index, dtype='object')

# Boolean mask: True for rows that have an 'Interview start' value
mask = df['Interview start'].notna()

In [None]:
# Assign weighted random recruiting channels only where Interview start is not NaN
df.loc[mask, 'Recruiting Channel'] = np.random.choice(
    ['Referral', 'Internal', 'LinkedIn', 'Direct'],
    size=mask.sum(),
    p=[0.25, 0.20, 0.45, 0.10]
)

  df.loc[mask, 'Recruiting Channel'] = np.random.choice(


In [None]:
# Start with no bonus on any row
df['Offer Bonus'] = np.nan

# Only rows with an 'Offered' value receive a bonus
mask = df['Offered'].notna()

# Candidate bonus amounts in steps of 500; Technical roles use the higher range
technical_bonuses = np.arange(12000, 30001, 500)
non_technical_bonuses = np.arange(5000, 25001, 500)

# One independent draw per offered row, in row order
bonuses = [
    np.random.choice(
        technical_bonuses if role == 'Technical' else non_technical_bonuses
    )
    for role in df.loc[mask, 'Role Type']
]

# Write the drawn amounts back onto the offered rows
df.loc[mask, 'Offer Bonus'] = bonuses

In [None]:
# (city, state) pairs that rows are assigned to
locations = [
    ("New York", "NY"), ("San Francisco", "CA"),
    ("Austin", "TX"), ("Chicago", "IL"),
    ("Seattle", "WA"), ("Boston", "MA"),
    ("Atlanta", "GA"), ("Denver", "CO"),
    ("Miami", "FL"), ("Dallas", "TX"),
]

# Pick one location index per row, uniformly at random
chosen_locations = np.random.choice(len(locations), size=len(df))

# Look each pick up once, then split the pairs into the two columns so that
# City and State on a row always come from the same pair
picked_pairs = [locations[i] for i in chosen_locations]
df['City'] = [city for city, _ in picked_pairs]
df['State'] = [state for _, state in picked_pairs]

In [None]:
# Spot-check five random rows now that the generated columns have been added
df.sample(n=5)

Unnamed: 0,ID,FP,BU Region,Approved,On hold,Sourcing start,Interview start,Interview end,Offered,Filled,Status,Role Type,Recruiting Channel,Offer Bonus,City,State
1937,19122,F,2,2014-05-28,,2014-06-06 00:00:00,2014-06-23 00:00:00,2014-07-15 00:00:00,2014-07-24 00:00:00,2014-08-02 00:00:00,Filled,Non-Technical,Referral,9000.0,San Francisco,CA
3149,9109,P,22,2014-08-26,,,,,,,Approved,Technical,,,New York,NY
2857,14057,P,1,2014-08-07,,2014-08-13 00:00:00,,,,,Sourcing start,Technical,,,San Francisco,CA
333,16124,P,3,2014-07-07,,2014-07-15 00:00:00,2014-07-26 00:00:00,2014-08-20 00:00:00,2014-08-27 00:00:00,,Offered,Technical,Internal,16000.0,New York,NY
3931,7665,F,8,2014-07-24,,2014-07-31 00:00:00,2014-08-19 00:00:00,,,,Interview start,Technical,Direct,,Denver,CO


In [None]:
# Drop the unused 'BU Region' column.
# errors="ignore" makes the cell safe to re-run: `df` is rebound without the
# column, so a second execution would otherwise raise KeyError.
df = df.drop(columns=["BU Region"], errors="ignore")

In [None]:
# Spot-check five random rows after removing 'BU Region'
df.sample(n=5)

Unnamed: 0,ID,FP,Approved,On hold,Sourcing start,Interview start,Interview end,Offered,Filled,Status,Role Type,Recruiting Channel,Offer Bonus,City,State
392,1010,P,2014-06-16,,2014-06-21 00:00:00,2014-07-07 00:00:00,2014-07-30 00:00:00,2014-08-04 00:00:00,2014-08-12 00:00:00,Filled,Non-Technical,Direct,18000.0,Atlanta,GA
495,1759,P,2014-07-03,,2014-07-10 00:00:00,2014-07-24 00:00:00,2014-08-15 00:00:00,2014-08-23 00:00:00,2014-08-28 00:00:00,Filled,Non-Technical,Internal,8500.0,New York,NY
2422,8977,F,2014-07-16,,2014-07-21 00:00:00,2014-08-04 00:00:00,,,,Interview start,Technical,LinkedIn,,New York,NY
3800,8560,F,2014-08-25,,2014-08-31 00:00:00,,,,,Sourcing start,Non-Technical,,,Dallas,TX
3352,3985,F,2014-08-27,,,,,,,Approved,Non-Technical,,,Atlanta,GA


In [None]:
# Assumption: part-time roles do not receive a bonus, so blank 'Offer Bonus'
# on every row whose FP flag is 'p' (compared case-insensitively)
is_part_time = df['FP'].str.lower().eq('p')
df.loc[is_part_time, 'Offer Bonus'] = np.nan

In [None]:
# Spot-check five random rows after clearing part-time bonuses
df.sample(n=5)

Unnamed: 0,ID,FP,Approved,On hold,Sourcing start,Interview start,Interview end,Offered,Filled,Status,Role Type,Recruiting Channel,Offer Bonus,City,State
1388,17956,F,2014-08-24,,,,,,,Approved,Non-Technical,,,Miami,FL
1378,2542,F,2014-07-13,,2014-07-19 00:00:00,2014-08-02 00:00:00,2014-08-24 00:00:00,2014-08-29 00:00:00,,Offered,Technical,LinkedIn,15500.0,Dallas,TX
4416,16230,F,2014-06-23,,2014-06-28 00:00:00,2014-07-09 00:00:00,2014-08-05 00:00:00,2014-08-11 00:00:00,2014-08-18 00:00:00,Filled,Technical,Referral,30000.0,Atlanta,GA
3557,2680,P,2014-06-20,,2014-06-27 00:00:00,2014-07-14 00:00:00,2014-08-02 00:00:00,2014-08-08 00:00:00,2014-08-14 00:00:00,Filled,Non-Technical,Internal,,Denver,CO
2105,6091,P,2014-07-31,,2014-08-09 00:00:00,2014-08-26 00:00:00,,,,Interview start,Non-Technical,LinkedIn,,Seattle,WA


In [None]:
# Write the finished frame to disk; index=False leaves the row index out of the file
OUTPUT_CSV = "hiring_clean.csv"
df.to_csv(OUTPUT_CSV, index=False)