### H1B Data Exploration and Cleanup
______

#### Download Dataset

Run

```shell
chmod a+x download.sh
./download.sh
```
---------

#### Combine quarterly data into yearly data
Convert Excel to CSV since pandas read csv faster and this takes approximately one hour

In [10]:
import glob
import pandas as pd
years = ['2017', '2018', '2019', '2020', '2021', '2022']

print('Combining and Transforming excel file to csv')
for year in years:
    print(f'Starting Year {year}')
    file_list = glob.glob('data/' + year + '/*.xlsx')
    excel_list=[]
    num = len(file_list)
    for i in range(num):
        print(f'Reading File {i+1} / {num}')
        excel_list.append(pd.read_excel(file_list[i]))
    year_df = pd.DataFrame()
    for i in range(num):
        print(f'Combining File {i+1} / {num}')
        year_df = year_df.append(excel_list[i], ignore_index = True)
    print('Transforming File')
    year_df.to_csv ('data/' + year + '/raw.csv', index = None, header=True)
    print(f'File for Year {year} saved')

Combining and Transforming excel file to csv
Starting Year 2017
Reading File 1 / 1
Combining File 1 / 1


  year_df = year_df.append(excel_list[i], ignore_index = True)


Transforming File
File for Year 2017 saved
Starting Year 2018
Reading File 1 / 1
Combining File 1 / 1


  year_df = year_df.append(excel_list[i], ignore_index = True)


Transforming File
File for Year 2018 saved
Starting Year 2019
Reading File 1 / 1
Combining File 1 / 1


  year_df = year_df.append(excel_list[i], ignore_index = True)


Transforming File
File for Year 2019 saved
Starting Year 2020
Reading File 1 / 4
Reading File 2 / 4
Reading File 3 / 4
Reading File 4 / 4
Combining File 1 / 4
Combining File 2 / 4


  year_df = year_df.append(excel_list[i], ignore_index = True)


Combining File 3 / 4
Combining File 4 / 4
Transforming File
File for Year 2020 saved
Starting Year 2021
Reading File 1 / 4
Reading File 2 / 4
Reading File 3 / 4
Reading File 4 / 4
Combining File 1 / 4
Combining File 2 / 4


  year_df = year_df.append(excel_list[i], ignore_index = True)


Combining File 3 / 4
Combining File 4 / 4
Transforming File
File for Year 2021 saved
Starting Year 2022
Reading File 1 / 4
Reading File 2 / 4
Reading File 3 / 4
Reading File 4 / 4
Combining File 1 / 4
Combining File 2 / 4


  year_df = year_df.append(excel_list[i], ignore_index = True)


Combining File 3 / 4
Combining File 4 / 4
Transforming File
File for Year 2022 saved


Can safely delete Excel files (Optional)

Run
```
chmod a+x delete.sh
./delete.sh
```

______

#### Select Useful Columns
Some of the columns are obviously useless

In [11]:
df2022 = pd.read_csv('data/2022/raw.csv')

  df2022 = pd.read_csv('data/2022/raw.csv')


In [12]:
df2022.head()

Unnamed: 0,CASE_NUMBER,CASE_STATUS,RECEIVED_DATE,DECISION_DATE,ORIGINAL_CERT_DATE,VISA_CLASS,JOB_TITLE,SOC_CODE,SOC_TITLE,FULL_TIME_POSITION,...,WILLFUL_VIOLATOR,SUPPORT_H1B,STATUTORY_BASIS,APPENDIX_A_ATTACHED,PUBLIC_DISCLOSURE,PREPARER_LAST_NAME,PREPARER_FIRST_NAME,PREPARER_MIDDLE_INITIAL,PREPARER_BUSINESS_NAME,PREPARER_EMAIL
0,I-200-21270-606997,Certified,2021-09-26,2021-10-01,,H-1B,APPLICATIONS SUPPORT ANALYST/ADMINISTRATOR,15-1132.00,"Software Developers, Applications",Y,...,No,,,,Disclose Business,,,,,
1,I-200-21270-606867,Certified,2021-09-26,2021-10-01,,H-1B,Designer,17-3013.00,Mechanical Drafters,Y,...,No,,,,Disclose Business and Employment,Stacey,Francyne,,Stacey Law Practice,francyne@staceylawpractice.com
2,I-200-21270-606846,Certified,2021-09-26,2021-10-01,,H-1B,Data Analyst,15-2031.00,Operations Research Analysts,Y,...,No,,,,Disclose Business,,,,,
3,I-200-21270-606842,Certified,2021-09-26,2021-10-01,,H-1B,Pharmaceutical Chemist,19-2031.00,Chemists,Y,...,No,,,,Disclose Business,JONNALAGADDA,SRINIVASA,R,"S. R. JONNALAGADDA, P.C.",Lawassociates@att.net
4,I-200-21270-606941,Certified,2021-09-26,2021-10-01,,H-1B,Senior Systems Analyst JC60,15-1121.00,Computer Systems Analysts,Y,...,No,Yes,"$60,000 or higher annual wage",,Disclose Business,,,,,


In [None]:
columns = [
    'JOB_TITLE',
    'SOC_TITLE',
    'FULL_TIME_POSITION',
    'BEGIN_DATE',
    'END_DATE',
    'EMPLOYER_NAME',
    'EMPLOYER_CITY',
    'EMPLOYER_STATE',
    'EMPLOYER_COUNTRY',
    'WAGE_RATE_OF_PAY_FROM',
    'WAGE_RATE_OF_PAY_TO',
    'WAGE_UNIT_OF_PAY',
    'PREVAILING_WAGE',
    'PW_UNIT_OF_PAY',
    'H1B_DEPENDENT'
]

Unnamed: 0,JOB_TITLE,SOC_TITLE,FULL_TIME_POSITION,BEGIN_DATE,END_DATE,CHANGE_EMPLOYER,EMPLOYER_NAME,EMPLOYER_CITY,EMPLOYER_STATE,EMPLOYER_COUNTRY,WAGE_RATE_OF_PAY_FROM,WAGE_RATE_OF_PAY_TO,WAGE_UNIT_OF_PAY,PREVAILING_WAGE,PW_UNIT_OF_PAY,H1B_DEPENDENT
0,Sr. Software Developer,"Software Developers, Applications",Y,2019-11-23,2022-11-22,0,"Experis US, Inc.",WI,WI,Sr. Software Developer,53.37,70.0,Hour,53.37,Hour,No
1,"Senior Manager I, Business Planning",Marketing Managers,Y,2020-03-01,2023-02-28,0,"SAMSUNG ELECTRONICS AMERICA, INC.",RIDGEFIELD PARK,NJ,"Senior Manager I, Business Planning",146077.63,,Year,139464.00,Year,No
2,Assistant Professor,"Computer Science Teachers, Postsecondary",Y,2019-10-21,2022-10-20,0,Stevens Institute of Technology,Hoboken,NJ,Assistant Professor,110000.00,130000.0,Year,56290.00,Year,No
3,Senior Integration Engineer,"Software Developers, Applications",Y,2020-03-01,2023-03-01,1,"DaVita, Inc.",Charlotte,NC,Senior Integration Engineer,88858.00,106038.0,Year,88858.00,Year,No
4,Scientist 2,Mathematicians,Y,2019-10-28,2022-10-27,0,"Triad National Security, LLC",Los Alamos,NM,Scientist 2,121000.00,148300.0,Year,90314.00,Year,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510938,VISITING ASSISTANT PROFESSOR,"Health Specialties Teachers, Postsecondary",Y,2022-09-19,2023-09-18,0,"University of California, San Francisco",San Francisco,CA,VISITING ASSISTANT PROFESSOR,301000.00,,Year,97707.00,Year,
510939,VISITING ASSISTANT PROFESSOR,"Health Specialties Teachers, Postsecondary",Y,2022-09-01,2023-08-31,0,"University of California, San Francisco",San Francisco,CA,VISITING ASSISTANT PROFESSOR,301000.00,,Year,97707.00,Year,
510940,Financial Analysts - KBGFJG94076-9,Financial Analysts,Y,2022-07-05,2024-07-04,0,Ernst & Young U.S. LLP,Secaucus,NJ,Financial Analysts - KBGFJG94076-9,91000.00,,Year,79628.00,Year,
510941,Financial and Investment Analyst,Financial Analysts,Y,2022-07-14,2024-07-13,0,5B USA LLC,Austin,TX,Financial and Investment Analyst,100000.00,,Year,94245.00,Year,


In [None]:
df2022['EMPLOYER_COUNTRY'] = df2022[df2022['EMPLOYER_COUNTRY'] == 'UNITED STATES OF AMERICA']

In [None]:
df_missing = df2022.isnull().sum()/df2022.shape[0]
df_missing

JOB_TITLE                     0.000000
SOC_TITLE                     0.000000
FULL_TIME_POSITION            0.000000
BEGIN_DATE                    0.000000
END_DATE                      0.000000
TOTAL_WORKER_POSITIONS        0.000000
NEW_EMPLOYMENT                0.000000
CONTINUED_EMPLOYMENT          0.000000
CHANGE_PREVIOUS_EMPLOYMENT    0.000000
NEW_CONCURRENT_EMPLOYMENT     0.000000
CHANGE_EMPLOYER               0.000000
AMENDED_PETITION              0.000000
EMPLOYER_NAME                 0.000000
EMPLOYER_CITY                 0.000000
EMPLOYER_STATE                0.000160
EMPLOYER_COUNTRY              0.000160
WAGE_RATE_OF_PAY_FROM         0.000000
WAGE_RATE_OF_PAY_TO           0.676132
WAGE_UNIT_OF_PAY              0.000000
PREVAILING_WAGE               0.000000
PW_UNIT_OF_PAY                0.000000
H1B_DEPENDENT                 0.022257
dtype: float64