In [1]:
import pandas as pd

In [2]:
# Read the raw training data once, then take an independent (deep) copy to
# transform, so the raw frame stays available unchanged for comparison.
ds_jobs = pd.read_csv("customer_train.csv")
ds_jobs_transformed = ds_jobs.copy(deep=True)

In [3]:
# Show the working copy as loaded, before any dtype conversion is applied.
ds_jobs_transformed

Unnamed: 0,student_id,city,city_development_index,gender,relevant_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,job_change
0,8949,city_103,0.920,Male,Has relevant experience,no_enrollment,Graduate,STEM,>20,,,1,36,1
1,29725,city_40,0.776,Male,No relevant experience,no_enrollment,Graduate,STEM,15,50-99,Pvt Ltd,>4,47,0
2,11561,city_21,0.624,,No relevant experience,Full time course,Graduate,STEM,5,,,never,83,0
3,33241,city_115,0.789,,No relevant experience,,Graduate,Business Degree,<1,,Pvt Ltd,never,52,1
4,666,city_162,0.767,Male,Has relevant experience,no_enrollment,Masters,STEM,>20,50-99,Funded Startup,4,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6661,4382,city_21,0.624,Male,Has relevant experience,no_enrollment,Graduate,STEM,5,,,1,180,1
6662,26546,city_45,0.890,Male,Has relevant experience,,Graduate,STEM,>20,1000-4999,Pvt Ltd,3,34,0
6663,31332,city_103,0.920,Female,Has relevant experience,no_enrollment,Graduate,STEM,5,50-99,Funded Startup,1,31,0
6664,5716,city_71,0.884,Male,Has relevant experience,no_enrollment,Masters,STEM,11,100-499,Funded Startup,1,167,0


In [4]:
# EDA to help identify ordinal, nominal, and two-factor categories.
# Both the column selection and the counts come from the raw frame (ds_jobs),
# so this cell reports the same thing regardless of whether
# ds_jobs_transformed has already been converted or filtered by later cells.
# Note: value_counts() leaves missing values out of the counts by default.
for col in ds_jobs.select_dtypes("object").columns:
    print(ds_jobs[col].value_counts(), '\n')

city
city_103    1514
city_21      906
city_16      517
city_114     464
city_160     294
            ... 
city_120       1
city_111       1
city_81        1
city_140       1
city_18        1
Name: count, Length: 115, dtype: int64 

gender
Male      4660
Female     433
Other       72
Name: count, dtype: int64 

relevant_experience
Has relevant experience    4789
No relevant experience     1877
Name: count, dtype: int64 

enrolled_university
no_enrollment       4776
Full time course    1349
Part time course     408
Name: count, dtype: int64 

education_level
Graduate          4043
Masters           1494
High School        719
Phd                152
Primary School     108
Name: count, dtype: int64 

major_discipline
STEM               5045
Humanities          228
Other               133
Business Degree     110
Arts                 93
No Major             73
Name: count, dtype: int64 

experience
>20    1123
5       514
4       485
3       480
6       416
2       413
7       385
9       3

In [5]:
# Ordered categorical columns, each mapped to its category levels listed from
# lowest to highest. The list order defines how values compare once a column
# is cast to an ordered CategoricalDtype.
ordered_cats = dict(
    enrolled_university=['no_enrollment', 'Part time course', 'Full time course'],
    education_level=['Primary School', 'High School', 'Graduate', 'Masters', 'Phd'],
    # '<1', then '1' through '20' as strings, then '>20'
    experience=['<1', *[str(years) for years in range(1, 21)], '>20'],
    company_size=[
        '<10', '10-49', '50-99', '100-499',
        '500-999', '1000-4999', '5000-9999', '10000+',
    ],
    last_new_job=['never', '1', '2', '3', '4', '>4'],
)

In [6]:
# Inspect the ordered-category levels defined for each column.
ordered_cats

{'enrolled_university': ['no_enrollment',
  'Part time course',
  'Full time course'],
 'education_level': ['Primary School',
  'High School',
  'Graduate',
  'Masters',
  'Phd'],
 'experience': ['<1',
  '1',
  '2',
  '3',
  '4',
  '5',
  '6',
  '7',
  '8',
  '9',
  '10',
  '11',
  '12',
  '13',
  '14',
  '15',
  '16',
  '17',
  '18',
  '19',
  '20',
  '>20'],
 'company_size': ['<10',
  '10-49',
  '50-99',
  '100-499',
  '500-999',
  '1000-4999',
  '5000-9999',
  '10000+'],
 'last_new_job': ['never', '1', '2', '3', '4', '>4']}

In [7]:
# Two-factor columns, each mapped to a lookup table that turns its two raw
# values into Booleans.
two_factor_cats = dict(
    relevant_experience={
        'No relevant experience': False,
        'Has relevant experience': True,
    },
    job_change={
        0.0: False,
        1.0: True,
    },
)


In [8]:
# Inspect the value-to-Boolean mappings for the two-factor columns.
two_factor_cats

{'relevant_experience': {'No relevant experience': False,
  'Has relevant experience': True},
 'job_change': {0.0: False, 1.0: True}}

In [9]:
# Loop through DataFrame columns to efficiently change data types.
#
# Values are always read from the untouched raw frame (ds_jobs) and written
# into ds_jobs_transformed. Converting from the raw values keeps this cell safe
# to re-run: mapping the already-converted Boolean 'relevant_experience' column
# through its string-keyed lookup again would match no keys and yield all NaN.
for col in ds_jobs_transformed:

    # Convert two-factor categories to bool using that column's lookup table.
    # Values absent from the lookup (including missing values) become NaN.
    if col in two_factor_cats:
        ds_jobs_transformed[col] = ds_jobs[col].map(two_factor_cats[col])

    # Convert integer columns to int32
    elif col in ['student_id', 'training_hours']:
        ds_jobs_transformed[col] = ds_jobs[col].astype('int32')

    # Convert float columns to float16 (half precision, so displayed values
    # are rounded, e.g. 0.920 is stored as 0.919922)
    elif col == 'city_development_index':
        ds_jobs_transformed[col] = ds_jobs[col].astype('float16')

    # Convert columns containing ordered categorical data to ordered
    # categories, using the level order given in ordered_cats
    elif col in ordered_cats:
        category = pd.CategoricalDtype(ordered_cats[col], ordered=True)
        ds_jobs_transformed[col] = ds_jobs[col].astype(category)

    # Convert remaining columns to standard (unordered) categories
    else:
        ds_jobs_transformed[col] = ds_jobs[col].astype('category')

In [10]:
# Keep only students with 10 or more years of experience who work at companies
# with at least 1000 employees. Both columns were cast to ordered categories by
# the dtype-conversion loop, so >= compares by category order, not by string.
experienced = ds_jobs_transformed['experience'] >= '10'
large_company = ds_jobs_transformed['company_size'] >= '1000-4999'
ds_jobs_transformed = ds_jobs_transformed[experienced & large_company]

In [12]:
# Show the converted and filtered result.
ds_jobs_transformed

Unnamed: 0,student_id,city,city_development_index,gender,relevant_experience,enrolled_university,education_level,major_discipline,experience,company_size,company_type,last_new_job,training_hours,job_change
9,699,city_103,0.919922,,True,no_enrollment,Graduate,STEM,17,10000+,Pvt Ltd,>4,123,False
12,25619,city_61,0.913086,Male,True,no_enrollment,Graduate,STEM,>20,1000-4999,Pvt Ltd,3,23,False
31,22293,city_103,0.919922,Male,True,Part time course,Graduate,STEM,19,5000-9999,Pvt Ltd,>4,141,False
34,26494,city_16,0.910156,Male,True,no_enrollment,Graduate,Business Degree,12,5000-9999,Pvt Ltd,3,145,False
40,2547,city_114,0.925781,Female,True,Full time course,Masters,STEM,16,1000-4999,Public Sector,2,14,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6639,84,city_103,0.919922,Other,True,no_enrollment,Graduate,STEM,19,10000+,Pvt Ltd,>4,92,False
6651,30106,city_103,0.919922,Male,True,no_enrollment,Graduate,STEM,13,5000-9999,Pvt Ltd,>4,12,False
6652,12652,city_104,0.923828,Male,True,no_enrollment,Graduate,STEM,11,1000-4999,Pvt Ltd,never,33,False
6659,8716,city_103,0.919922,Male,True,no_enrollment,Masters,STEM,12,10000+,Pvt Ltd,1,77,False
