# Labor Condition Applications (LCAs)
## Clean up fields

In [25]:
# Import packages
import pandas as pd
import re

In [2]:
# Locations and filenames for this cleaning step.
# Both the input and the output live in the intermediate data folder.
data_dir = '../../data/'
input_dir = f'{data_dir}intermediate/'
output_dir = f'{data_dir}intermediate/'
input_filename = 'lca_deduped.csv'
output_filename = 'lca_fields.csv'

In [3]:
# Load data
# Every column is read as a string (dtype=str) so nothing is type-inferred on load;
# numeric fields are cast explicitly further down in the notebook.
lca = pd.read_csv(input_dir + input_filename, dtype=str)

### Standardize yes/no columns

In [4]:
def print_value_counts(cols):
  """Print the value counts of each listed `lca` column.

  Counts include nulls and are ordered by value rather than by frequency.
  A blank line is printed after each column's counts.
  """
  for name in cols:
    counts = lca[name].value_counts(dropna=False).sort_index()
    print(counts)
    print('')

In [5]:
def standardize_yes_no(value):
  """Collapse yes/no spellings to 'Y' or 'N'.

  Matching ignores case and surrounding whitespace. Non-string values
  (e.g. NaN) and strings that are not a recognized yes/no spelling are
  returned unchanged.
  """
  if isinstance(value, str):
    token = value.lower().strip()
    if token in ('y', 'yes'):
      return 'Y'
    if token in ('n', 'no'):
      return 'N'
  return value

In [6]:
yes_no_cols = ['FULL_TIME_POSITION', 'H_1B_DEPENDENT', 'WILLFUL_VIOLATOR']

In [7]:
print_value_counts(yes_no_cols)

FULL_TIME_POSITION
N       123786
Y      5539865
NaN     799965
Name: count, dtype: int64

H_1B_DEPENDENT
N      1931288
No     1062941
Y      1261146
Yes     436157
NaN    1772084
Name: count, dtype: int64

WILLFUL_VIOLATOR
N      3190032
No     1498642
Y         2383
Yes        451
NaN    1772108
Name: count, dtype: int64



In [8]:
# Rewrite each yes/no column with its standardized 'Y'/'N' values
# (anything standardize_yes_no does not recognize passes through unchanged)
for col in yes_no_cols:
  lca[col] = lca[col].apply(standardize_yes_no)

In [9]:
print_value_counts(yes_no_cols)

FULL_TIME_POSITION
N       123786
Y      5539865
NaN     799965
Name: count, dtype: int64

H_1B_DEPENDENT
N      2994229
Y      1697303
NaN    1772084
Name: count, dtype: int64

WILLFUL_VIOLATOR
N      4688674
Y         2834
NaN    1772108
Name: count, dtype: int64



### Standardize statutory basis

In [10]:
lca['STATUTORY_BASIS'].value_counts(dropna=False)

STATUTORY_BASIS
NaN                                                                                        5845889
$60,000 or higher annual wage                                                               321608
WAGE                                                                                        134021
Both $60,000 or higher in annual wage and Masters Degree or higher in related specialty     111103
BOTH                                                                                         48856
Masters Degree or higher in related specialty                                                 1212
DEGREE                                                                                         927
Name: count, dtype: int64

In [11]:
def standardize_statutory_basis(value):
  """Map long-form statutory basis descriptions to their short codes.

  Nulls become pd.NA. Values that are not one of the long-form
  descriptions (including the short codes themselves) are returned
  unchanged.
  """
  long_to_short = {
    '$60,000 or higher annual wage': 'WAGE',
    'Masters Degree or higher in related specialty': 'DEGREE',
    'Both $60,000 or higher in annual wage and Masters Degree or higher in related specialty': 'BOTH',
  }
  if pd.isnull(value):
    return pd.NA
  return long_to_short.get(value, value)

In [12]:
lca['STATUTORY_BASIS'] = lca['STATUTORY_BASIS'].apply(standardize_statutory_basis)

In [13]:
lca['STATUTORY_BASIS'].value_counts(dropna=False)

STATUTORY_BASIS
<NA>      5845889
WAGE       455629
BOTH       159959
DEGREE       2139
Name: count, dtype: int64

In [14]:
def standardize_pw_level(value):
  """Normalize a prevailing wage level to 'Level I' through 'Level IV'.

  Accepts either the bare roman numeral or the 'Level <numeral>' form,
  ignoring case and surrounding whitespace. Nulls and any value outside
  the four recognized levels are returned as pd.NA.
  """
  if pd.isnull(value):
    return pd.NA
  known_levels = {
    'i': 'Level I',
    'level i': 'Level I',
    'ii': 'Level II',
    'level ii': 'Level II',
    'iii': 'Level III',
    'level iii': 'Level III',
    'iv': 'Level IV',
    'level iv': 'Level IV',
  }
  return known_levels.get(value.lower().strip(), pd.NA)

In [15]:
lca['PW_WAGE_LEVEL'] = lca['PW_WAGE_LEVEL'].apply(standardize_pw_level)

In [16]:
lca['PW_WAGE_LEVEL'].value_counts(dropna=False)

PW_WAGE_LEVEL
<NA>         2502904
Level II     1840872
Level I       863027
Level III     760697
Level IV      496116
Name: count, dtype: int64

### Wage fields

#### Standardize prevailing wage level

In [17]:
lca['PW_WAGE_LEVEL'].value_counts(dropna=False)

PW_WAGE_LEVEL
<NA>         2502904
Level II     1840872
Level I       863027
Level III     760697
Level IV      496116
Name: count, dtype: int64

There are a handful of records that specify a prevailing wage level of "V", which doesn't exist. There are only 4 wage levels.  
Set these instances to null.

#### Create an annualized prevailing wage field
Prevailing wages and wage ranges can be reported at different rates (yearly, hourly, etc).  Create a new field for the annualized pay rate so cases can be compared.


In [18]:
# Set the prevailing wage field to type float
lca['PREVAILING_WAGE'] = lca['PREVAILING_WAGE'].astype(float)

In [19]:
lca['PW_UNIT_OF_PAY'].value_counts(dropna=False)

PW_UNIT_OF_PAY
Year         6013133
Hour          437992
Month           5087
NaN             4803
Week            1880
Bi-Weekly        721
Name: count, dtype: int64

In [20]:
def annualize_wages(row, wage_col, unit_col='PW_UNIT_OF_PAY'):
  """Convert a wage to a whole-number annual amount using the row's pay unit.

  Args:
    row: Mapping-like record (e.g. a DataFrame row) holding the wage and its unit.
    wage_col: Key of the numeric wage value to annualize.
    unit_col: Key of the pay-unit label. Defaults to 'PW_UNIT_OF_PAY'.

  Returns:
    The annualized wage rounded to the nearest integer, or pd.NA when the
    wage or the unit is missing or the unit is not recognized.
  """
  unit = row[unit_col]
  pw = row[wage_col]

  if pd.isna(unit) or pd.isna(pw):
    return pd.NA

  # Units are matched ignoring case and surrounding whitespace
  unit = unit.lower().strip()

  if unit == 'year':
    pw_stz = pw
  elif unit == 'month':
    pw_stz = pw * 12
  elif unit == 'bi-weekly':
    pw_stz = pw * 26
  elif unit == 'week':
    pw_stz = pw * 52
  elif unit == 'hour':
    # Hourly wages assume a 40-hour week, 52 weeks a year
    pw_stz = pw * 40 * 52
  else:
    # Unrecognized unit: return NA here because round() is not defined for pd.NA
    return pd.NA

  return round(pw_stz)

In [21]:
lca['PW_ANNUAL'] = lca.apply(lambda row: annualize_wages(row, 'PREVAILING_WAGE'), axis=1)

Check a handful of each wage rate to see if the conversion was successful

In [22]:
lca[['PW_UNIT_OF_PAY', 'PREVAILING_WAGE', 'PW_ANNUAL']].sort_values('PW_UNIT_OF_PAY').groupby('PW_UNIT_OF_PAY', dropna=False).head(3)

Unnamed: 0,PW_UNIT_OF_PAY,PREVAILING_WAGE,PW_ANNUAL
6394985,Bi-Weekly,3807.2,98987.0
2718570,Bi-Weekly,1635.2,42515.0
2419020,Bi-Weekly,3597.0,93522.0
5693262,Hour,31.49,65499.0
3089483,Hour,21.8,45344.0
321123,Hour,24.46,50877.0
5555643,Month,4544.79,54537.0
1465860,Month,3657.0,43884.0
1105076,Month,10439.84,125278.0
17796,Week,768.47,39960.0


#### Clean up wage range fields

Manually inspecting the "FROM" field shows that there are a few main types of entries:
 - A single number, either as a whole number or decimal: "60000" or "45.60"
 - A range: "102500 - 122900" or "95594.00 - 99639.87"
 - A range missing an upper bound: "95000 -" or "135000.00 -"

The "TO" field seems to have only single numbers, no ranges.

We'll create two new fields for the lower and upper wage values.

For the lower bound:
 - Split the WAGE_RATE_OF_PAY_FROM field on "-"
 - Set the wage lower bound field using the first value of the split
 - Save the second value of the split to fill in the upper bound field

For the upper bound:
 - If WAGE_RATE_OF_PAY_TO is not missing, set the upper bound field to this value
 - If WAGE_RATE_OF_PAY_TO is missing, set the upper bound to the second value of WAGE_RATE_OF_PAY_FROM, if it exists.

First, clean up the "FROM" field by removing any characters that are not a number, period, or dash.

In [23]:
def remove_non_numeric_chars(s):
  """Strip everything except digits, periods, and dashes from a string.

  Non-string inputs (e.g. NaN) are returned unchanged.
  """
  if isinstance(s, str):
    return re.sub(r'[^0-9\.-]', '', s)
  return s

In [26]:
lca['WAGE_RATE_OF_PAY_FROM'] = lca['WAGE_RATE_OF_PAY_FROM'].apply(remove_non_numeric_chars)

Split the "FROM" wage field

In [27]:
# Split on the first dash only (n=1) and force exactly two columns so the
# column assignment below cannot fail: without this, the number of columns
# produced by expand=True depends on the data (one column if no value has a
# dash, three or more if any value has several dashes).
# Anything after the first dash stays together in WAGE_SPLIT_2, so malformed
# entries remain visible when the column is later cast to float.
wage_ranges = (
  lca['WAGE_RATE_OF_PAY_FROM']
  .str.split('-', n=1, expand=True)
  .reindex(columns=[0, 1])
)
wage_ranges.columns = ['WAGE_SPLIT_1', 'WAGE_SPLIT_2']

In [28]:
lca = lca.join(wage_ranges)

Set the new wage columns

In [29]:
# Lower bound: first piece of the "FROM" split; pd.NA and empty strings are replaced with None before the float cast
lca['WAGE_FROM'] = lca['WAGE_SPLIT_1'].replace({pd.NA: None, '': None}).astype(float)

In [30]:
# Upper bound: prefer the reported "TO" value and fall back to the second piece of the "FROM" split when "TO" is null.
# Empty strings (e.g. left by a trailing dash such as "95000-") are replaced with None before the float cast.
lca['WAGE_TO'] = lca['WAGE_RATE_OF_PAY_TO'].combine_first(lca['WAGE_SPLIT_2']).replace({pd.NA: None, '': None}).astype(float)

Inspect the new columns

In [31]:
lca.loc[:, ['WAGE_RATE_OF_PAY_FROM', 'WAGE_RATE_OF_PAY_TO', 'WAGE_SPLIT_1', 'WAGE_SPLIT_2', 'WAGE_FROM', 'WAGE_TO']]

Unnamed: 0,WAGE_RATE_OF_PAY_FROM,WAGE_RATE_OF_PAY_TO,WAGE_SPLIT_1,WAGE_SPLIT_2,WAGE_FROM,WAGE_TO
0,82300,125000,82300,,82300.0,125000.0
1,57000,,57000,,57000.0,
2,34695,,34695,,34695.0,
3,44408,,44408,,44408.0,
4,63180,,63180,,63180.0,
...,...,...,...,...,...,...
6463611,350000.0,450000.0,350000.0,,350000.0,450000.0
6463612,92050.0,,92050.0,,92050.0,
6463613,125000.0,,125000.0,,125000.0,
6463614,92650.0,,92650.0,,92650.0,


In [32]:
# Check instances when the original "TO" field is not null (and the "FROM" split has no second piece)
# The new WAGE_TO field should equal the old WAGE_RATE_OF_PAY_TO field
lca.loc[(~lca['WAGE_RATE_OF_PAY_TO'].isna()) & (lca['WAGE_SPLIT_2'].isna()), ['WAGE_RATE_OF_PAY_FROM', 'WAGE_RATE_OF_PAY_TO', 'WAGE_SPLIT_1', 'WAGE_SPLIT_2', 'WAGE_FROM', 'WAGE_TO']]

Unnamed: 0,WAGE_RATE_OF_PAY_FROM,WAGE_RATE_OF_PAY_TO,WAGE_SPLIT_1,WAGE_SPLIT_2,WAGE_FROM,WAGE_TO
0,82300,125000,82300,,82300.0,125000.0
5,110000,110000,110000,,110000.0,110000.0
15,60950,80000,60950,,60950.0,80000.0
21,62000,90000,62000,,62000.0,90000.0
34,55000,65000,55000,,55000.0,65000.0
...,...,...,...,...,...,...
6463607,126173.0,132088.0,126173.0,,126173.0,132088.0
6463609,40.0,41.0,40.0,,40.0,41.0
6463610,98904.0,100000.0,98904.0,,98904.0,100000.0
6463611,350000.0,450000.0,350000.0,,350000.0,450000.0


In [33]:
# Check instances when the original WAGE_RATE_OF_PAY_TO field is null but the second element of the wage split is not
# The new WAGE_TO field should equal the second element of the wage split field
lca.loc[(lca['WAGE_RATE_OF_PAY_TO'].isna()) & (~lca['WAGE_SPLIT_2'].isna()), ['WAGE_RATE_OF_PAY_FROM', 'WAGE_RATE_OF_PAY_TO', 'WAGE_SPLIT_1', 'WAGE_SPLIT_2', 'WAGE_FROM', 'WAGE_TO']]

Unnamed: 0,WAGE_RATE_OF_PAY_FROM,WAGE_RATE_OF_PAY_TO,WAGE_SPLIT_1,WAGE_SPLIT_2,WAGE_FROM,WAGE_TO
1668528,85000.00-,,85000.00,,85000.0,
1668529,66000-70000,,66000,70000,66000.0,70000.0
1668530,97000-143300,,97000,143300,97000.0,143300.0
1668531,155000.00-,,155000.00,,155000.0,
1668532,64100-127000,,64100,127000,64100.0,127000.0
...,...,...,...,...,...,...
6187093,60000-,,60000,,60000.0,
6187094,70000-,,70000,,70000.0,
6187095,69000.00-,,69000.00,,69000.0,
6187096,60000.00-,,60000.00,,60000.0,


In [34]:
# Check instances where both are null
# The new wage field should also be null
lca.loc[(lca['WAGE_RATE_OF_PAY_TO'].isna()) & (lca['WAGE_SPLIT_2'].isna()), ['WAGE_RATE_OF_PAY_FROM', 'WAGE_RATE_OF_PAY_TO', 'WAGE_SPLIT_1', 'WAGE_SPLIT_2', 'WAGE_FROM', 'WAGE_TO']]

Unnamed: 0,WAGE_RATE_OF_PAY_FROM,WAGE_RATE_OF_PAY_TO,WAGE_SPLIT_1,WAGE_SPLIT_2,WAGE_FROM,WAGE_TO
1,57000,,57000,,57000.0,
2,34695,,34695,,34695.0,
3,44408,,44408,,44408.0,
4,63180,,63180,,63180.0,
6,42204,,42204,,42204.0,
...,...,...,...,...,...,...
6463605,117500.0,,117500.0,,117500.0,
6463608,345000.0,,345000.0,,345000.0,
6463612,92050.0,,92050.0,,92050.0,
6463613,125000.0,,125000.0,,125000.0,


Annualize the new wage fields

In [35]:
# Annualize the lower and upper offered-wage bounds row by row.
# NOTE(review): annualize_wages reads the pay unit from PW_UNIT_OF_PAY, so the offered wage is
# assumed to be reported in the same unit as the prevailing wage - confirm this holds for the data.
lca['WAGE_ANNUAL_FROM'] = lca.apply(lambda row: annualize_wages(row, 'WAGE_FROM'), axis=1)
lca['WAGE_ANNUAL_TO'] = lca.apply(lambda row: annualize_wages(row, 'WAGE_TO'), axis=1)

Check a handful of each wage rate to see if the conversion was successful

In [36]:
lca[['PW_UNIT_OF_PAY', 'WAGE_RATE_OF_PAY_FROM', 'WAGE_RATE_OF_PAY_TO', 'WAGE_ANNUAL_FROM', 'WAGE_ANNUAL_TO']].sort_values('PW_UNIT_OF_PAY').groupby('PW_UNIT_OF_PAY', dropna=False).head(3)

Unnamed: 0,PW_UNIT_OF_PAY,WAGE_RATE_OF_PAY_FROM,WAGE_RATE_OF_PAY_TO,WAGE_ANNUAL_FROM,WAGE_ANNUAL_TO
6394985,Bi-Weekly,4230.78,,110000.0,
2718570,Bi-Weekly,1635.2,0.0,42515.0,0.0
2419020,Bi-Weekly,3597.0,4526.0,93522.0,117676.0
5693262,Hour,34.61,,71989.0,
3089483,Hour,21.8,0.0,45344.0,0.0
321123,Hour,24.46,28.0,50877.0,58240.0
5555643,Month,5499.0,5500.0,65988.0,66000.0
1465860,Month,3750.0,,45000.0,
1105076,Month,10439.84,12041.67,125278.0,144500.0
17796,Week,800.0,,41600.0,


#### Fix cases with unusually high wages
There are some cases where the annualized wages are unusually high. To investigate this issue we created a wage ratio metric, calculated as the ratio between the intended wage and the prevailing wage. Diving into cases with the highest wage ratios, it looks like there are a few different things going on:
 - In many cases the prevailing wage is provided at an hourly rate but the intended wage appears to be an annual rate. A spike in the wage ratio distribution around 2080 - which is the multiplier used to convert hourly wages to annual (52*40) - appears to support this idea.
 - In some cases there appears to be a missing decimal point in the intended wage. For instance, the prevailing wage is listed as 60,000 but the intended wage is 6,000,000 or 600,000. A missing decimal point seems more likely than an employer paying 100x the prevailing wage.
 - In one case (`I-200-12080-308552`) there seems to be a missing dash. The intended wage is "4500055000" but should likely be "45000-55000", which would be comparable to the prevailing wage for the occupation.
 - In other cases, it's not entirely clear why the intended wage is orders of magnitude higher than the prevailing wage.

Overall, there are relatively few of these cases. Of a total of 6.5M cases, ~5k cases have a wage ratio greater than 10. Most of these (~3k) seem to stem from listing an hourly prevailing wage and an annual intended wage. We'll attempt to correct these cases.

It's more difficult to determine the root cause of the remaining cases so we'll leave them be for now. Any wage analysis will have to keep this potential data issue in mind (e.g. report on percentiles or medians instead of averages).

Here's what we'll do to fix the hourly/annual issue:
 1. Flag cases that list an hourly prevailing wage and have a wage ratio greater than 2000.
 2. Set the annualized wages to the intended wage provided on the LCA. In other words, we'll revert the wage annualization for these cases.

In [37]:
# Subset of columns to use when investigating this issue
wage_cols = ['EMPLOYER_NAME', 'JOB_TITLE', 'SOC_CODE', 'SOC_TITLE', 'PW_UNIT_OF_PAY', 'PREVAILING_WAGE', 'PW_ANNUAL', 'WAGE_RATE_OF_PAY_FROM', 'WAGE_RATE_OF_PAY_TO', 'WAGE_SPLIT_1', 'WAGE_SPLIT_2', 'WAGE_ANNUAL_FROM', 'WAGE_ANNUAL_TO', 'WAGE_RATIO']

In [38]:
# Calculate the ratio of the given wage to the prevailing wage
# In theory these should be comparable. High ratios signal that something strange is going on.
lca['WAGE_RATIO'] = lca['WAGE_SPLIT_1'].astype(float) / lca['PREVAILING_WAGE'].astype(float)

In [39]:
# Flag hourly prevailing-wage cases whose offered wage is more than `wage_ratio_threshold`
# times the prevailing wage; these are treated as annual wages listed against an hourly rate.
wage_ratio_threshold = 2000
# Match the unit the same way annualize_wages does (case- and whitespace-insensitive),
# so every row that was annualized as hourly is eligible for the fix.
is_hourly = lca['PW_UNIT_OF_PAY'].str.strip().str.lower() == 'hour'
wage_fix_filter = is_hourly & (lca['WAGE_RATIO'] > wage_ratio_threshold)

In [40]:
print('A wage ratio threshold of ' + str(wage_ratio_threshold) + ' will update ' + str(lca.loc[wage_fix_filter].shape[0]) + ' records.')


A wage ratio threshold of 2000 will update 3156 records.


In [41]:
# Apply the wage fix: for flagged rows, revert the annualization by using the offered wage as reported.
# Use the cleaned numeric WAGE_FROM / WAGE_TO columns rather than the raw split pieces:
#  - WAGE_SPLIT_2 is null whenever the upper bound came from WAGE_RATE_OF_PAY_TO, so copying it
#    would erase those upper bounds;
#  - the split pieces are strings, which would mix text into the numeric annual wage columns.
# Values are rounded to whole numbers to match the output of annualize_wages.
lca.loc[wage_fix_filter, ['WAGE_ANNUAL_FROM', 'WAGE_ANNUAL_TO']] = (
  lca.loc[wage_fix_filter, ['WAGE_FROM', 'WAGE_TO']].round().values
)

## Save file

In [52]:
# Save analysis file
lca.to_csv(output_dir + output_filename, index=False)

### SOC codes and titles
_IN PROGRESS_

In [8]:
# Load columns for cleaning up SOC codes and titles
soc_cols = ['CASE_NUMBER', 'DATAFILE_YEAR', 'DATAFILE_QUARTER', 'EMPLOYER_NAME', 'SOC_CODE', 'SOC_TITLE']

In [9]:
# Load file
# NOTE: this rebinds `lca` to only the SOC-related columns of the saved output file,
# indexed by CASE_NUMBER; the full cleaned frame from the sections above is no longer in `lca`.
lca = pd.read_csv(output_dir + output_filename, usecols=soc_cols, index_col='CASE_NUMBER')

In [10]:
lca.dtypes

EMPLOYER_NAME        object
SOC_CODE             object
SOC_TITLE            object
DATAFILE_YEAR         int64
DATAFILE_QUARTER    float64
dtype: object

In [11]:
lca.head(50)

Unnamed: 0_level_0,EMPLOYER_NAME,SOC_CODE,SOC_TITLE,DATAFILE_YEAR,DATAFILE_QUARTER
CASE_NUMBER,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
I-200-20259-823768,\t1. UBS BUSINESS SOLUTIONS US LLC,15-1133.00,"Software Developers, Systems Software",2020,4.0
I-200-22238-441074,\tAimbridge Employee Service Corp.,11-3131.00,Training and Development Managers,2022,4.0
I-200-21295-662005,"\tAlliance for Sustainable Energy, LLC",19-2041.00,"Environmental Scientists and Specialists, Incl...",2022,1.0
I-200-21305-680234,"\tArtemis Consulting, Inc.",13-1161.00,Market Research Analysts and Marketing Special...,2022,1.0
I-200-22010-819571,\tAsclepius Solutions Inc.,15-1132.00,"Software Developers, Applications",2022,2.0
I-200-23087-884020,"\tAttainX, Inc.",15-2099.01,Bioinformatics Technicians,2023,3.0
I-200-19343-187664,"\tAuto TechLabs, Inc.",15-1141.00,Database Administrators,2020,1.0
I-200-20058-356819,"\tAuto TechLabs, Inc.",15-1132.00,"Software Developers, Applications",2020,2.0
I-200-20084-432378,"\tAuto TechLabs, Inc.",15-1132.00,"Software Developers, Applications",2020,2.0
I-200-20238-786589,"\tAuto TechLabs, Inc.",15-1132.00,"Software Developers, Applications",2020,4.0


In [12]:
lca['SOC_CODE'].value_counts(dropna=False)

SOC_CODE
15-1132                      658914
15-1132.00                   474036
15-1121                      440080
Computer Systems Analysts    287370
15-1199                      267292
                              ...  
Carpet Installers                 1
Camera Operators                  1
39-1011                           1
35-3021                           1
47-2181.00                        1
Name: count, Length: 2719, dtype: int64

In [13]:
lca['SOC_TITLE'].value_counts(dropna=False)

SOC_TITLE
SOFTWARE DEVELOPERS, APPLICATIONS    653452
Software Developers, Applications    480214
COMPUTER SYSTEMS ANALYSTS            352868
COMPUTER OCCUPATIONS, ALL OTHER      247898
COMPUTER PROGRAMMERS                 241332
                                      ...  
SENIOR DEVELOPER-BIOINFORMATICS           1
POST-DOCTORATE CHEMIST                    1
CRM SIEBEL ARCHITECT                      1
ANDROID PROGRAMMER                        1
EMBEDDED LINUX SOFTWARE ENGINEER          1
Name: count, Length: 206735, dtype: int64

In [14]:
lca[['SOC_CODE', 'SOC_TITLE']].value_counts(dropna=False)

SOC_CODE                          SOC_TITLE                           
15-1132                           SOFTWARE DEVELOPERS, APPLICATIONS       652563
15-1132.00                        Software Developers, Applications       473608
15-1121                           COMPUTER SYSTEMS ANALYSTS               348753
15-1199                           COMPUTER OCCUPATIONS, ALL OTHER         247819
15-1131                           COMPUTER PROGRAMMERS                    237225
                                                                           ...  
Electrical Engineers              DIRECTOR OF ENGINEERING SERVICES             1
                                  DIRECTOR OF HARDWARE ENGINEERING             1
                                  DIRECTOR OF PRODUCT DEVELOPMENT              1
                                  DIRECTOR OF PRODUCT ENGINEERING              1
Financial Specialists, All Other  ASSOCIATE, RISK & PORTFOLIO ANALYSIS         1
Name: count, Length: 291202, dtype: in

In [15]:
lca[['DATAFILE_YEAR', 'SOC_CODE', 'SOC_TITLE']].value_counts(dropna=False)

DATAFILE_YEAR  SOC_CODE              SOC_TITLE                                
2019           15-1132               SOFTWARE DEVELOPERS, APPLICATIONS            193987
2020           15-1132.00            Software Developers, Applications            170826
2023           15-1252.00            Software Developers                          170457
2018           15-1132               SOFTWARE DEVELOPERS, APPLICATIONS            169491
2021           15-1132.00            Software Developers, Applications            151504
                                                                                   ...  
2012           Animal Scientists     LATIN AMERICA EMBRYO TRANSFER COORDINATOR         1
                                     LABORATORY RESEARCH MANAGER                       1
                                     HORSE CONSULTANT (ANIMAL SCIENTIST)               1
                                     HERD HEALTH MANAGER/ANIMAL SCIENTIST              1
               Mechanical Engin