In [1]:
# code to find inconsistencies in the timelines of resume dates
# csv data example format:
# Resume ID,Company,Dates
# 1,XYZ,['2010 to 2014', '2007 to 2010', '2002 to 2007']

In [137]:
import csv
import re
import pandas as pd 

In [149]:
experience_data = pd.read_csv('companies.csv')
print(experience_data.shape)
experience_data.columns
experience_data.dtypes

(378, 3)


Resume ID     int64
Company      object
Dates        object
dtype: object

In [150]:
experience_data['Dates'].fillna('[]', inplace=True)
experience_data['Dates'] = experience_data['Dates'].str.replace('[','').str.replace(']','').str.replace(" ", '').str.replace("'",'').str.split(',')

# if date is single year, change to 'year to 2024'
experience_data['Dates'] = experience_data['Dates'].apply(lambda x: [y if 'to' in y else y + 'to2024' for y in x])

# descending sort
experience_data['Dates'] = experience_data['Dates'].apply(lambda x: sorted(x, reverse=True))

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  experience_data['Dates'].fillna('[]', inplace=True)


In [151]:
experience_data.columns
experience_data.dtypes

Resume ID     int64
Company      object
Dates        object
dtype: object

In [152]:
experience_data['total_gap_years'] = 0
experience_data['gap_greater_equal_2_years'] = False

In [153]:
def check_dates(pos, candidate_experience):
    start_year = None
    total_gap_years = 0
    
    start_year = int(candidate_experience[0][:4])

    for i in range(1, len(candidate_experience)):
        end_year = int(candidate_experience[i][-4:])
        gap = start_year - end_year
        
        if gap > 2:
            total_gap_years += gap
        
        start_year = int(candidate_experience[i+1][:4]) if i + 1 < len(candidate_experience) else start_year
    
    experience_data.loc[pos, 'total_gap_years'] = total_gap_years
    experience_data.loc[pos, 'gap_greater_equal_2_years'] = total_gap_years >= 2

for i in range(len(experience_data)):
    if experience_data['Dates'][i] == ['to2024']:
        continue
    check_dates(i, experience_data['Dates'][i])

In [156]:
experience_data['total_gap_years'].value_counts()
experience_data['gap_greater_equal_2_years'].value_counts()

gap_greater_equal_2_years
False    369
True       9
Name: count, dtype: int64

In [155]:
experience_data.to_csv('gap_years.csv', index=False)