# Visa Category Mapping and Dimension Table

- https://www.pluralsight.com/guides/extracting-data-html-beautifulsoup
- https://travel.state.gov/content/travel/en/us-visas/visa-information-resources/all-visa-categories.html
- https://travel.state.gov/content/dam/visas/Statistics/AnnualReports/FY2020AnnualReport/FY20AnnualReport-TableXVA.pdf

In [1]:
# importing the libraries
import pandas as pd
import re
from pprint import pprint

from bs4 import BeautifulSoup
import requests

# Show full cell contents and up to 80 rows when frames are displayed
display_options = {
    'display.max_colwidth': None,
    'display.max_rows': 80,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [2]:
# Source page that lists every U.S. visa category
url="https://travel.state.gov/content/travel/en/us-visas/visa-information-resources/all-visa-categories.html"

# Make a GET request to fetch the raw HTML content.
# The timeout keeps a stalled connection from hanging the kernel, and
# raise_for_status() stops here on an HTTP error instead of letting an
# error page be parsed as if it were the visa listing.
response = requests.get(url, timeout=30)
response.raise_for_status()
html_content = response.text

# Parse the html content.
# NOTE(review): "html" lets BeautifulSoup choose among the installed HTML
# parsers, so the parse tree may differ between environments — consider pinning one.
soup = BeautifulSoup(html_content, "html")

In [3]:
# Collect the accordion header elements; their text names the visa category types
categories = soup.find_all("h4", class_="tsg-rwd-accordion-header-name-frame")

# Keep only the trimmed header text
categories_list = [heading.get_text().strip() for heading in categories]
categories_list

['Nonimmigrant Visa Categories', 'Immigrant Visa Categories']

In [4]:
# Grab every table styled with the "grid" class; later cells pair them
# by position with the entries of categories_list.
tables = soup.find_all("table", class_="grid")

In [5]:
# generate a visa category dataframe from one scraped table

def get_dataframe(table, visa_category):
    """Flatten one scraped visa table into a DataFrame.

    Rows whose first cell contains a bold element are treated as group
    headers; every other row becomes one record attached to the most
    recently seen group header.

    Parameters
    ----------
    table : object exposing ``tbody`` and ``find_all`` (e.g. a BeautifulSoup table tag)
        The table to read. If it has no ``tbody``, rows are searched on the
        table itself.
    visa_category : str
        Value stored in the ``visa_category`` column of every record.

    Returns
    -------
    pandas.DataFrame
        Columns ``visa_category``, ``visa_group``, ``visa_desc`` and ``visa``.
        The columns are present even when the table yields no records.
    """
    columns = ['visa_category', 'visa_group', 'visa_desc', 'visa']

    body = table.tbody if table.tbody is not None else table

    current_group = None  # stays None for data rows that precede any group header
    records = []

    for row in body.find_all("tr"):
        cells = row.find_all("td")

        # rows without any <td> (e.g. header-only rows) carry no data
        if not cells:
            continue

        first_text = cells[0].text.strip()

        if cells[0].b is not None:
            # bold first cell marks the start of a new group
            current_group = first_text
            continue

        records.append({
            'visa_category': visa_category,
            'visa_group': current_group,
            'visa_desc': first_text,
            # a row with only a description still produces a record
            'visa': cells[1].text.strip() if len(cells) > 1 else '',
        })

    return pd.DataFrame(records, columns=columns)

In [6]:
# Tables and category headings are paired by position:
# first pair -> non-immigrants, second pair -> immigrants
non_category, im_category = categories_list[0], categories_list[1]
non_df = get_dataframe(table=tables[0], visa_category=non_category)
im_df = get_dataframe(table=tables[1], visa_category=im_category)

In [7]:
# Inspect the scraped immigrant table before cleaning it
im_df

Unnamed: 0,visa_category,visa_group,visa_desc,visa
0,Immigrant Visa Categories,Immediate Relative & Family Sponsored,Spouse of a U.S. Citizen,"IR1, CR1"
1,Immigrant Visa Categories,Immediate Relative & Family Sponsored,Spouse of a U.S. Citizen awaiting approval of an I-130 immigrant petition,K-3 *
2,Immigrant Visa Categories,Immediate Relative & Family Sponsored,Fiancé(e) to marry U.S. Citizen & live in U.S.,K-1 *
3,Immigrant Visa Categories,Immediate Relative & Family Sponsored,Intercountry Adoption of Orphan Children by U.S. Citizens,"IR3, IH3, IR4, IH4"
4,Immigrant Visa Categories,Immediate Relative & Family Sponsored,Certain Family Members of U.S. Citizens,"IR2, CR2, IR5, F1, F3, F4"
5,Immigrant Visa Categories,Immediate Relative & Family Sponsored,Certain Family Members of Lawful Permanent Residents,"F2A, F2B"
6,Immigrant Visa Categories,Employer Sponsored – Employment,"Employment-Based Immigrants, including (preference group):\n\nPriority workers [First]\nProfessionals Holding Advanced Degrees and Persons of Exceptional Ability [Second]\nProfessionals and Other Workers [Third]\nEmployment Creation/Investors [Fifth]\nCertain Special Immigrants: [Fourth]","E1\nE2\n\n\n\nE3, EW3\n\n\nC5, T5, R5, I5\n\nS (many**)"
7,Immigrant Visa Categories,Employer Sponsored – Employment,Religious Workers,"SD, SR"
8,Immigrant Visa Categories,Employer Sponsored – Employment,Iraqi and Afghan Translators/Interpreters,SI
9,Immigrant Visa Categories,Employer Sponsored – Employment,Iraqis Who Worked for/on Behalf of the U.S. Government,SQ


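- Clean row 6 (Employment-Based Immigrants): the scraped cell packs several preference groups and their visa codes into a single row, so it is split into one row per preference group below.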

In [8]:
# Row 6 holds all employment-based preference groups in one cell.
# Read its fields by column label: integer positions on a label-indexed
# Series are deprecated in recent pandas.
packed_row = im_df.loc[6]
visa_cat = packed_row['visa_category']
visa_group = packed_row['visa_group']

# get relevant visa descriptions (the first two lines are the heading and a blank line)
visa_desc = packed_row['visa_desc'].split('\n')
visa_desc_clean = visa_desc[2:]

# get relevant visa codes (empty strings come from consecutive line breaks)
visa_code = packed_row['visa'].split('\n')
visa_code_clean = [x for x in visa_code if x != '' ]

# descriptions and codes are matched by position, so a length mismatch
# would pair them wrongly or silently lose rows
assert len(visa_desc_clean) == len(visa_code_clean), (
    "row 6: {} descriptions but {} visa codes".format(len(visa_desc_clean), len(visa_code_clean))
)

# combine results in dataframe (the scalar category and group are repeated for every row)
im_df_6 = pd.DataFrame({
    'visa_category': visa_cat,
    'visa_group': visa_group,
    'visa_desc': visa_desc_clean,
    'visa': visa_code_clean,
})
im_df_6

Unnamed: 0,visa_category,visa_group,visa_desc,visa
0,Immigrant Visa Categories,Employer Sponsored – Employment,Priority workers [First],E1
1,Immigrant Visa Categories,Employer Sponsored – Employment,Professionals Holding Advanced Degrees and Persons of Exceptional Ability [Second],E2
2,Immigrant Visa Categories,Employer Sponsored – Employment,Professionals and Other Workers [Third],"E3, EW3"
3,Immigrant Visa Categories,Employer Sponsored – Employment,Employment Creation/Investors [Fifth],"C5, T5, R5, I5"
4,Immigrant Visa Categories,Employer Sponsored – Employment,Certain Special Immigrants: [Fourth],S (many**)


In [9]:
# Replace packed row 6 with its expanded rows.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used instead.
# Row 6 is dropped before concatenating so that only the original row is
# removed, whatever index labels the expanded rows carry.
im_df_clean = pd.concat([im_df.drop(6), im_df_6])

# combine immigrants and non-immigrants dataset (immigrants first)
df = pd.concat([im_df_clean, non_df])


# split columns to rows with multiple visa codes
splitter = ",|\n|/"

# One output row per code. Pieces are deliberately neither stripped nor
# filtered here: the positional index of the resulting frame is referenced
# by hard-coded labels in the duplicate-handling cell further down.
split_rows = [
    (row.visa_category, row.visa_group, row.visa_desc, code)
    for row in df.itertuples(index=False)
    for code in re.split(splitter, row.visa)
]

# generate new dataframe with split columns
df = pd.DataFrame(split_rows, columns=['visa_category', 'visa_group', 'visa_desc', 'visa'])


# clean visa codes and generate a mapping column to fit the format in the I94 data
# (raw string so that \s and \- reach the regex engine without invalid-escape warnings)
visa_code_pattern = r'([A-Z]{1,5}-?[A-Z0-9]{0,3}[A-Za-z\s\-]*)'
df['visa_map'] = (
    df['visa']
    .str.extract(visa_code_pattern, expand=False)
    .str.replace("-", "", regex=False)
    .str.replace(" ", "", regex=False)
)

# fixed seed so the preview is the same on every run
df.sample(5, random_state=42)

Unnamed: 0,visa_category,visa_group,visa_desc,visa,visa_map
44,Nonimmigrant Visa Categories,Purpose of Travel,Foreign military personnel stationed in the United States,A-2,A2
23,Immigrant Visa Categories,Employer Sponsored – Employment,Priority workers [First],E1,E1
21,Immigrant Visa Categories,Other Immigrants,Diversity Immigrant Visa,DV,DV
65,Nonimmigrant Visa Categories,Purpose of Travel,"Tourism, vacation, pleasure visitor",B-2,B2
14,Immigrant Visa Categories,Immediate Relative & Family Sponsored,Certain Family Members of Lawful Permanent Residents,F2A,F2A


In [10]:
# Count how many rows share each visa_map code, then list the rows whose
# code occurs more than once, ordered by code.
descriptions_per_code = df.groupby("visa_map")['visa_desc']
df['duplicate_count'] = descriptions_per_code.transform('count')

is_duplicated = df['duplicate_count'].gt(1)
df_duplicates = df.loc[is_duplicated].sort_values('visa_map')
df_duplicates

Unnamed: 0,visa_category,visa_group,visa_desc,visa,visa_map,duplicate_count
32,Nonimmigrant Visa Categories,Purpose of Travel,"Athlete, amateur or professional (competing for prize money only)",B-1,B1,3.0
36,Nonimmigrant Visa Categories,Purpose of Travel,Business visitor,B-1,B1,3.0
40,Nonimmigrant Visa Categories,Purpose of Travel,Domestic employee or nanny - must be accompanying a foreign national employer,B-1,B1,3.0
51,Nonimmigrant Visa Categories,Purpose of Travel,"Medical treatment, visitor for",B-2,B2,2.0
65,Nonimmigrant Visa Categories,Purpose of Travel,"Tourism, vacation, pleasure visitor",B-2,B2,2.0
25,Immigrant Visa Categories,Employer Sponsored – Employment,Professionals and Other Workers [Third],E3,E3,2.0
34,Nonimmigrant Visa Categories,Purpose of Travel,Australian professional specialty,E-3,E3,2.0
57,Nonimmigrant Visa Categories,Purpose of Travel,Physician,H-1B,H1B,2.0
60,Nonimmigrant Visa Categories,Purpose of Travel,Specialty occupations in fields requiring highly specialized knowledge,H-1B,H1B,2.0
33,Nonimmigrant Visa Categories,Purpose of Travel,Au pair (exchange visitor),J,J,4.0


In [11]:
# manually select the ones to keep (most generic descriptions)
# drop SQ, no differentiation

# keep: index labels of the duplicate rows to retain. They are positions in
# the split frame and therefore depend on the order of the scraped page.
ind = [36, 65, 25, 60, 43]

# fail loudly if a hand-picked label no longer refers to a duplicate row;
# otherwise every row of that duplicate group would be dropped silently
stale = [x for x in ind if x not in df_duplicates.index]
assert not stale, "keep labels not found among the duplicate rows: {}".format(stale)

# delete: every duplicate row that was not explicitly kept
ind_drop = [x for x in df_duplicates.index if x not in ind]
ind_drop

[32, 40, 51, 34, 57, 33, 56, 58, 19, 20]

In [12]:
# Drop the rejected duplicate rows by index label
df = df.drop(index=ind_drop)

# Sanity check: number of visa_map codes still shared by more than one row
rows_per_code = df.groupby("visa_map")['visa_desc'].count()
rows_per_code.gt(1).sum()

0

In [13]:
# final cleaning: drop rows containing any missing value and the helper column

df = df.dropna()
df = df.drop('duplicate_count', axis=1)

# add a placeholder row at the top, used when a mapping is not found.
# DataFrame.append was removed in pandas 2.0; pd.concat with
# ignore_index=True yields the same fresh 0..n-1 index.
unknown_row = pd.DataFrame({'visa_category':['Unknown Visa Categories'],
                            'visa_group':['Unknown Visa'],
                            'visa_desc':['Unknown'],
                            'visa':[''],
                            'visa_map':['']})
df = pd.concat([unknown_row, df], ignore_index=True)

# add id used as a primary key (1-based, so the placeholder row gets id 1)
df['visa_id'] = df.index + 1

In [14]:
# Flatten multi-line descriptions: every line break becomes a single space
flattened_desc = df['visa_desc'].str.replace("\n"," ")
df = df.assign(visa_desc=flattened_desc)

In [15]:
# Keep only the columns of the mapping / dimension table; the cleaned
# visa_map code takes over the name "visa" (the raw column is left out).
dimension_columns = ['visa_category', 'visa_group', 'visa_desc', 'visa_map', 'visa_id']
df = df.loc[:, dimension_columns].rename(columns={'visa_map': 'visa'})

In [16]:
# Preview the first rows of the finished dimension table
df.head()

Unnamed: 0,visa_category,visa_group,visa_desc,visa,visa_id
0,Unknown Visa Categories,Unknown Visa,Unknown,,1
1,Immigrant Visa Categories,Immediate Relative & Family Sponsored,Spouse of a U.S. Citizen,IR1,2
2,Immigrant Visa Categories,Immediate Relative & Family Sponsored,Spouse of a U.S. Citizen,CR1,3
3,Immigrant Visa Categories,Immediate Relative & Family Sponsored,Spouse of a U.S. Citizen awaiting approval of an I-130 immigrant petition,K3,4
4,Immigrant Visa Categories,Immediate Relative & Family Sponsored,Fiancé(e) to marry U.S. Citizen & live in U.S.,K1,5


In [17]:
# Persist the dimension table as a semicolon-separated file without the index column
output_path = '../staging/visa_categories.csv'
df.to_csv(output_path, index=False, sep=";")