In [1]:
import os
import urllib
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
def download_data_zip(data_dir, zip_file):
    """Download the EPA ECHO SDWA bulk-download archive to ``zip_file``.

    Args:
        data_dir: Not used by this function; kept so existing calls keep working.
        zip_file: Destination path of the downloaded zip archive.
    """
    # ``import urllib`` alone does not guarantee that the ``request`` submodule
    # is loaded, so import it explicitly instead of relying on another package
    # having imported it as a side effect.
    import urllib.request

    data_url = 'https://echo.epa.gov/files/echodownloads/SDWA_latest_downloads.zip'
    urllib.request.urlretrieve(data_url, zip_file)

In [3]:
from zipfile import ZipFile
def extract_file(zip_file, data_dir):
    """Unpack every member of the archive at ``zip_file`` into ``data_dir``."""
    with ZipFile(zip_file) as archive:
        archive.extractall(path=data_dir)

In [4]:
# Create the data directories if they do not exist yet, then make sure every
# expected SDWA CSV has been extracted into csv_dir.
data_dir = 'data'
csv_dir = data_dir+'/csv'
zip_file = data_dir + "/data.zip"
files = [
    'SDWA_GEOGRAPHIC_AREAS.csv',
    'SDWA_PN_VIOLATION_ASSOC.csv',
    'SDWA_EVENTS_MILESTONES.csv',
    'SDWA_PUB_WATER_SYSTEMS.csv',
    'SDWA_SITE_VISITS.csv',
    'SDWA_FACILITIES.csv',
    'SDWA_LCR_SAMPLES.csv',
    'SDWA_REF_CODE_VALUES.csv',
    'SDWA_SERVICE_AREAS.csv',
    'SDWA_REF_ANSI_AREAS.csv',
    'SDWA_VIOLATIONS_ENFORCEMENT.csv',
]

# makedirs(exist_ok=True) creates both data_dir and csv_dir, so the listdir
# below cannot fail with FileNotFoundError on a fresh checkout.
os.makedirs(csv_dir, exist_ok=True)

extracted = set(os.listdir(csv_dir))
if not all(file in extracted for file in files):
    # zip_file is a path ('data/data.zip'); os.listdir(data_dir) yields bare
    # names, so a membership test against it never matches and would trigger a
    # download on every run. Check the path itself instead.
    if not os.path.exists(zip_file):
        download_data_zip(data_dir, zip_file)
    extract_file(zip_file, csv_dir)

print(os.listdir(csv_dir))

['SDWA_GEOGRAPHIC_AREAS.csv', 'SDWA_PN_VIOLATION_ASSOC.csv', 'SDWA_EVENTS_MILESTONES.csv', 'SDWA_PUB_WATER_SYSTEMS.csv', '.ipynb_checkpoints', 'SDWA_SITE_VISITS.csv', 'SDWA_FACILITIES.csv', 'SDWA_LCR_SAMPLES.csv', 'SDWA_REF_CODE_VALUES.csv', 'SDWA_SERVICE_AREAS.csv', 'SDWA_REF_ANSI_AREAS.csv', 'SDWA_VIOLATIONS_ENFORCEMENT.csv']


In [41]:
# Read the violations/enforcement extract, restricted to the violation-level
# columns listed below; every column is loaded as text (dtype=str).
file_path = 'data/csv/SDWA_VIOLATIONS_ENFORCEMENT.csv'
violation_columns = [
    'SUBMISSIONYEARQUARTER',
    'PWSID',
    'VIOLATION_ID',
    'COMPL_PER_BEGIN_DATE',
    'COMPL_PER_END_DATE',
    'VIOLATION_CODE',
    'VIOLATION_CATEGORY_CODE',
    'IS_HEALTH_BASED_IND',
    'CONTAMINANT_CODE',
    'VIOL_MEASURE',
    'UNIT_OF_MEASURE',
    'FEDERAL_MCL',
    'STATE_MCL',
    'IS_MAJOR_VIOL_IND',
    'SEVERITY_IND_CNT',
    'CALCULATED_RTC_DATE',
    'VIOLATION_STATUS',
    'PUBLIC_NOTIFICATION_TIER',
    'CALCULATED_PUB_NOTIF_TIER',
    'VIOL_ORIGINATOR_CODE',
    'SAMPLE_RESULT_ID',
    'CORRECTIVE_ACTION_ID',
    'RULE_CODE',
    'RULE_GROUP_CODE',
    'RULE_FAMILY_CODE',
    'VIOL_FIRST_REPORTED_DATE',
    'VIOL_LAST_REPORTED_DATE',
]
violations = pd.read_csv(
    file_path,
    usecols=violation_columns,
    skipinitialspace=True,
    dtype=str,
)
violations.head()

Unnamed: 0,SUBMISSIONYEARQUARTER,PWSID,VIOLATION_ID,COMPL_PER_BEGIN_DATE,COMPL_PER_END_DATE,VIOLATION_CODE,VIOLATION_CATEGORY_CODE,IS_HEALTH_BASED_IND,CONTAMINANT_CODE,VIOL_MEASURE,...,PUBLIC_NOTIFICATION_TIER,CALCULATED_PUB_NOTIF_TIER,VIOL_ORIGINATOR_CODE,SAMPLE_RESULT_ID,CORRECTIVE_ACTION_ID,RULE_CODE,RULE_GROUP_CODE,RULE_FAMILY_CODE,VIOL_FIRST_REPORTED_DATE,VIOL_LAST_REPORTED_DATE
0,2021Q2,AK2218644,6111405,01/01/2000,12/31/2004,28,Other,N,,,...,3,3,S,,,110,100,110,03/02/2005,12/13/2016
1,2021Q2,AK2218644,6111505,04/01/2005,06/30/2005,23,MR,N,3100.0,,...,3,3,S,,,110,100,110,08/24/2005,12/13/2016
2,2021Q2,AK2218644,6111606,07/01/2005,09/30/2005,23,MR,N,3100.0,,...,3,3,S,,,110,100,110,12/27/2005,12/13/2016
3,2021Q2,AK2218644,6111806,04/01/2006,06/30/2006,23,MR,N,3100.0,,...,3,3,S,,,110,100,110,08/29/2006,12/13/2016
4,2021Q2,AK2218644,6112107,04/01/2007,06/30/2007,23,MR,N,3100.0,,...,3,3,S,,,110,100,110,08/15/2007,08/17/2017


In [42]:
# Frequency of each SUBMISSIONYEARQUARTER value in the loaded data.
violations['SUBMISSIONYEARQUARTER'].value_counts()

2021Q2    13247718
Name: SUBMISSIONYEARQUARTER, dtype: int64

In [43]:
# Report the row count and how many rows are exact repeats (all columns equal)
# of an earlier row. Summing the boolean mask counts the repeats directly.
total_records = len(violations)
duplicate_records = violations.duplicated(keep='first').sum()
print(f'Total Number of records in violations data is {total_records}')
print(f'Total Number of duplicate records in violations data is {duplicate_records}')

Total Number of records in violations data is 13247718
Total Number of duplicate records in violations data is 8433228


### Removing duplicate rows based on all columns

In [44]:
# Keep the first occurrence of every fully identical row and discard the repeats.
violations = violations.drop_duplicates(keep='first')
violations.head(5)

Unnamed: 0,SUBMISSIONYEARQUARTER,PWSID,VIOLATION_ID,COMPL_PER_BEGIN_DATE,COMPL_PER_END_DATE,VIOLATION_CODE,VIOLATION_CATEGORY_CODE,IS_HEALTH_BASED_IND,CONTAMINANT_CODE,VIOL_MEASURE,...,PUBLIC_NOTIFICATION_TIER,CALCULATED_PUB_NOTIF_TIER,VIOL_ORIGINATOR_CODE,SAMPLE_RESULT_ID,CORRECTIVE_ACTION_ID,RULE_CODE,RULE_GROUP_CODE,RULE_FAMILY_CODE,VIOL_FIRST_REPORTED_DATE,VIOL_LAST_REPORTED_DATE
0,2021Q2,AK2218644,6111405,01/01/2000,12/31/2004,28,Other,N,,,...,3,3,S,,,110,100,110,03/02/2005,12/13/2016
1,2021Q2,AK2218644,6111505,04/01/2005,06/30/2005,23,MR,N,3100.0,,...,3,3,S,,,110,100,110,08/24/2005,12/13/2016
2,2021Q2,AK2218644,6111606,07/01/2005,09/30/2005,23,MR,N,3100.0,,...,3,3,S,,,110,100,110,12/27/2005,12/13/2016
3,2021Q2,AK2218644,6111806,04/01/2006,06/30/2006,23,MR,N,3100.0,,...,3,3,S,,,110,100,110,08/29/2006,12/13/2016
4,2021Q2,AK2218644,6112107,04/01/2007,06/30/2007,23,MR,N,3100.0,,...,3,3,S,,,110,100,110,08/15/2007,08/17/2017


In [45]:
violations.shape

(4814490, 27)

## Missing Values

In [46]:
violations.isnull().sum()

SUBMISSIONYEARQUARTER              0
PWSID                              0
VIOLATION_ID                  126758
COMPL_PER_BEGIN_DATE          126758
COMPL_PER_END_DATE            569915
VIOLATION_CODE                126758
VIOLATION_CATEGORY_CODE       126758
IS_HEALTH_BASED_IND           126758
CONTAMINANT_CODE              224248
VIOL_MEASURE                 3854711
UNIT_OF_MEASURE              4682893
FEDERAL_MCL                   377741
STATE_MCL                    3899612
IS_MAJOR_VIOL_IND            1918020
SEVERITY_IND_CNT             4759716
CALCULATED_RTC_DATE           169706
VIOLATION_STATUS              126758
PUBLIC_NOTIFICATION_TIER      126758
CALCULATED_PUB_NOTIF_TIER     126758
VIOL_ORIGINATOR_CODE          126758
SAMPLE_RESULT_ID             4790827
CORRECTIVE_ACTION_ID         4804589
RULE_CODE                     126758
RULE_GROUP_CODE               126758
RULE_FAMILY_CODE              126758
VIOL_FIRST_REPORTED_DATE      126767
VIOL_LAST_REPORTED_DATE       168038
d

Looking at the data, a large number of records are missing values for every column except PWSID and SUBMISSIONYEARQUARTER.

In [47]:
# Select the rows in which every column from position 2 onward is null, i.e.
# rows that carry nothing beyond their first two columns.
violations[violations.iloc[:, 2:].isnull().all(axis=1)]

Unnamed: 0,SUBMISSIONYEARQUARTER,PWSID,VIOLATION_ID,COMPL_PER_BEGIN_DATE,COMPL_PER_END_DATE,VIOLATION_CODE,VIOLATION_CATEGORY_CODE,IS_HEALTH_BASED_IND,CONTAMINANT_CODE,VIOL_MEASURE,...,PUBLIC_NOTIFICATION_TIER,CALCULATED_PUB_NOTIF_TIER,VIOL_ORIGINATOR_CODE,SAMPLE_RESULT_ID,CORRECTIVE_ACTION_ID,RULE_CODE,RULE_GROUP_CODE,RULE_FAMILY_CODE,VIOL_FIRST_REPORTED_DATE,VIOL_LAST_REPORTED_DATE
12312464,2021Q2,IL0735586,,,,,,,,,...,,,,,,,,,,
12312465,2021Q2,IL0710350,,,,,,,,,...,,,,,,,,,,
12312490,2021Q2,IL0935150,,,,,,,,,...,,,,,,,,,,
12312491,2021Q2,IL0910070,,,,,,,,,...,,,,,,,,,,
12312493,2021Q2,IL0910200,,,,,,,,,...,,,,,,,,,,
12312496,2021Q2,IL0970130,,,,,,,,,...,,,,,,,,,,
12312522,2021Q2,IL1030200,,,,,,,,,...,,,,,,,,,,
12312523,2021Q2,IL1075145,,,,,,,,,...,,,,,,,,,,
12312532,2021Q2,IL1110400,,,,,,,,,...,,,,,,,,,,
12312538,2021Q2,IL1110600,,,,,,,,,...,,,,,,,,,,


126758 rows are missing data for all of the columns that carry violation information, so these rows are removed.

In [48]:
# Keep only rows that have at least one non-null value from column position 2 onward.
violations = violations[violations.iloc[:, 2:].notnull().any(axis=1)]
print('Total Number of records in violations data after missing values is {}'.format(violations.shape[0]))    # 4814490 - 126758 = 4687732

Total Number of records in violations data after missing values is 4687732


In [49]:
violations.isnull().sum()

SUBMISSIONYEARQUARTER              0
PWSID                              0
VIOLATION_ID                       0
COMPL_PER_BEGIN_DATE               0
COMPL_PER_END_DATE            443157
VIOLATION_CODE                     0
VIOLATION_CATEGORY_CODE            0
IS_HEALTH_BASED_IND                0
CONTAMINANT_CODE               97490
VIOL_MEASURE                 3727953
UNIT_OF_MEASURE              4556135
FEDERAL_MCL                   250983
STATE_MCL                    3772854
IS_MAJOR_VIOL_IND            1791262
SEVERITY_IND_CNT             4632958
CALCULATED_RTC_DATE            42948
VIOLATION_STATUS                   0
PUBLIC_NOTIFICATION_TIER           0
CALCULATED_PUB_NOTIF_TIER          0
VIOL_ORIGINATOR_CODE               0
SAMPLE_RESULT_ID             4664069
CORRECTIVE_ACTION_ID         4677831
RULE_CODE                          0
RULE_GROUP_CODE                    0
RULE_FAMILY_CODE                   0
VIOL_FIRST_REPORTED_DATE           9
VIOL_LAST_REPORTED_DATE        41280
d

In [50]:
violations.columns

Index(['SUBMISSIONYEARQUARTER', 'PWSID', 'VIOLATION_ID',
       'COMPL_PER_BEGIN_DATE', 'COMPL_PER_END_DATE', 'VIOLATION_CODE',
       'VIOLATION_CATEGORY_CODE', 'IS_HEALTH_BASED_IND', 'CONTAMINANT_CODE',
       'VIOL_MEASURE', 'UNIT_OF_MEASURE', 'FEDERAL_MCL', 'STATE_MCL',
       'IS_MAJOR_VIOL_IND', 'SEVERITY_IND_CNT', 'CALCULATED_RTC_DATE',
       'VIOLATION_STATUS', 'PUBLIC_NOTIFICATION_TIER',
       'CALCULATED_PUB_NOTIF_TIER', 'VIOL_ORIGINATOR_CODE', 'SAMPLE_RESULT_ID',
       'CORRECTIVE_ACTION_ID', 'RULE_CODE', 'RULE_GROUP_CODE',
       'RULE_FAMILY_CODE', 'VIOL_FIRST_REPORTED_DATE',
       'VIOL_LAST_REPORTED_DATE'],
      dtype='object')

In [51]:
# Number of distinct (PWSID, VIOLATION_ID) pairs. The pair identifies rows
# uniquely only if this equals the row count of `violations`. Summing the
# group sizes always reproduces the row count, so it cannot reveal duplicate keys.
violations.groupby(['PWSID', 'VIOLATION_ID']).ngroups

4687732

{PWSID, VIOLATION_ID} identifies each row uniquely, so it could serve as the primary key for this table.

In [52]:
# Save the de-duplicated, filtered violations table without the row index.
# NOTE(review): this writes to the current working directory, while the table is
# later read back from '../data/VIOLATION_TABLE.csv' — confirm the file is moved
# there or align the two paths.
violations.to_csv('VIOLATION_TABLE.csv', index=False)

In [2]:
# Reload the saved violations table with every column read as text.
# NOTE(review): the table was written as 'VIOLATION_TABLE.csv' (no directory)
# earlier in this notebook but is read from '../data/' here — confirm the paths line up.
violations = pd.read_csv('../data/VIOLATION_TABLE.csv', dtype=str)
violations.head()

Unnamed: 0,SUBMISSIONYEARQUARTER,PWSID,VIOLATION_ID,COMPL_PER_BEGIN_DATE,COMPL_PER_END_DATE,VIOLATION_CODE,VIOLATION_CATEGORY_CODE,IS_HEALTH_BASED_IND,CONTAMINANT_CODE,VIOL_MEASURE,...,PUBLIC_NOTIFICATION_TIER,CALCULATED_PUB_NOTIF_TIER,VIOL_ORIGINATOR_CODE,SAMPLE_RESULT_ID,CORRECTIVE_ACTION_ID,RULE_CODE,RULE_GROUP_CODE,RULE_FAMILY_CODE,VIOL_FIRST_REPORTED_DATE,VIOL_LAST_REPORTED_DATE
0,2021Q2,AK2218644,6111405,01/01/2000,12/31/2004,28,Other,N,,,...,3,3,S,,,110,100,110,03/02/2005,12/13/2016
1,2021Q2,AK2218644,6111505,04/01/2005,06/30/2005,23,MR,N,3100.0,,...,3,3,S,,,110,100,110,08/24/2005,12/13/2016
2,2021Q2,AK2218644,6111606,07/01/2005,09/30/2005,23,MR,N,3100.0,,...,3,3,S,,,110,100,110,12/27/2005,12/13/2016
3,2021Q2,AK2218644,6111806,04/01/2006,06/30/2006,23,MR,N,3100.0,,...,3,3,S,,,110,100,110,08/29/2006,12/13/2016
4,2021Q2,AK2218644,6112107,04/01/2007,06/30/2007,23,MR,N,3100.0,,...,3,3,S,,,110,100,110,08/15/2007,08/17/2017


In [3]:
## Seeing date formats
date_columns = ['COMPL_PER_BEGIN_DATE', 'COMPL_PER_END_DATE', 'CALCULATED_RTC_DATE','VIOL_FIRST_REPORTED_DATE', 'VIOL_LAST_REPORTED_DATE']
violations[date_columns].head()

Unnamed: 0,COMPL_PER_BEGIN_DATE,COMPL_PER_END_DATE,CALCULATED_RTC_DATE,VIOL_FIRST_REPORTED_DATE,VIOL_LAST_REPORTED_DATE
0,01/01/2000,12/31/2004,12/31/2004,03/02/2005,12/13/2016
1,04/01/2005,06/30/2005,06/30/2005,08/24/2005,12/13/2016
2,07/01/2005,09/30/2005,09/30/2005,12/27/2005,12/13/2016
3,04/01/2006,06/30/2006,06/30/2006,08/29/2006,12/13/2016
4,04/01/2007,06/30/2007,06/30/2007,08/15/2007,08/17/2017


In [4]:
# Parse every MM/DD/YYYY text column listed in date_columns into datetime values.
violations[date_columns] = violations[date_columns].apply(
    lambda column: pd.to_datetime(column, format='%m/%d/%Y')
)

In [5]:
violations[date_columns].head(10)

Unnamed: 0,COMPL_PER_BEGIN_DATE,COMPL_PER_END_DATE,CALCULATED_RTC_DATE,VIOL_FIRST_REPORTED_DATE,VIOL_LAST_REPORTED_DATE
0,2000-01-01,2004-12-31,2004-12-31,2005-03-02,2016-12-13
1,2005-04-01,2005-06-30,2005-06-30,2005-08-24,2016-12-13
2,2005-07-01,2005-09-30,2005-09-30,2005-12-27,2016-12-13
3,2006-04-01,2006-06-30,2006-06-30,2006-08-29,2016-12-13
4,2007-04-01,2007-06-30,2007-06-30,2007-08-15,2017-08-17
5,2007-07-01,2007-09-30,2007-09-30,2008-05-21,2017-08-17
6,2010-07-01,2010-09-30,2010-09-30,2010-11-18,2020-08-19
7,2003-01-01,2003-12-31,2003-12-31,2004-08-21,2016-12-13
8,2005-01-01,2005-12-31,2005-12-31,2006-02-28,2016-12-13
9,2006-01-01,2006-12-31,2006-12-31,2007-02-28,2016-12-13


In [6]:
# Print the earliest and latest value found in each parsed date column.
for column_name in date_columns:
    earliest = violations[column_name].min()
    latest = violations[column_name].max()
    print(f'Range of {column_name} is between {earliest} and {latest}')

Range of COMPL_PER_BEGIN_DATE is between 1900-01-01 00:00:00 and 2021-06-16 00:00:00
Range of COMPL_PER_END_DATE is between 1900-01-31 00:00:00 and 2030-12-31 00:00:00
Range of CALCULATED_RTC_DATE is between 1900-01-31 00:00:00 and 2021-06-25 00:00:00
Range of VIOL_FIRST_REPORTED_DATE is between 1980-09-30 00:00:00 and 2021-07-01 00:00:00
Range of VIOL_LAST_REPORTED_DATE is between 1980-09-30 00:00:00 and 2021-07-01 00:00:00


In [45]:
# violations.to_csv('processed_data/VIOLATIONS.csv', index=False)

In [11]:
# Write the table to disk ordered by PWSID, then VIOLATION_ID (both ascending).
# NOTE(review): the date columns were parsed with pd.to_datetime earlier, so
# pandas presumably writes them as YYYY-MM-DD rather than the raw MM/DD/YYYY —
# confirm that whatever loads this file expects that format.
(
    violations
    .sort_values(by=['PWSID', 'VIOLATION_ID'], ascending=True)
    .to_csv('../data/processed_data/VIOLATIONS.csv', index=False)
)

In [9]:
# The five water systems with the most distinct violation IDs.
(
    violations
    .groupby('PWSID')['VIOLATION_ID']
    .nunique()
    .sort_values(ascending=False)
    .head()
    .to_frame('count')
)

Unnamed: 0_level_0,count
PWSID,Unnamed: 1_level_1
WV3305535,4947
WV3305536,4338
WV3301040,3942
WV3302451,3390
PA2400148,3030


In [10]:
violations.columns

Index(['SUBMISSIONYEARQUARTER', 'PWSID', 'VIOLATION_ID',
       'COMPL_PER_BEGIN_DATE', 'COMPL_PER_END_DATE', 'VIOLATION_CODE',
       'VIOLATION_CATEGORY_CODE', 'IS_HEALTH_BASED_IND', 'CONTAMINANT_CODE',
       'VIOL_MEASURE', 'UNIT_OF_MEASURE', 'FEDERAL_MCL', 'STATE_MCL',
       'IS_MAJOR_VIOL_IND', 'SEVERITY_IND_CNT', 'CALCULATED_RTC_DATE',
       'VIOLATION_STATUS', 'PUBLIC_NOTIFICATION_TIER',
       'CALCULATED_PUB_NOTIF_TIER', 'VIOL_ORIGINATOR_CODE', 'SAMPLE_RESULT_ID',
       'CORRECTIVE_ACTION_ID', 'RULE_CODE', 'RULE_GROUP_CODE',
       'RULE_FAMILY_CODE', 'VIOL_FIRST_REPORTED_DATE',
       'VIOL_LAST_REPORTED_DATE'],
      dtype='object')

### Web Scraping Column description and data type

In [11]:
# {list(violations.columns)}
def get_html(timeout=30):
    """Fetch the EPA SDWA download-summary page and return it parsed.

    Args:
        timeout: Seconds to wait for the server before giving up. Defaults to
            30, so existing ``get_html()`` calls keep working.

    Returns:
        BeautifulSoup: The response body parsed with ``'html.parser'``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
    """
    URL = "https://echo.epa.gov/tools/data-downloads/sdwa-download-summary"
    r = requests.get(URL, timeout=timeout)
    # Fail here on an error response instead of handing an error page to the
    # table/paragraph lookups that consume the parsed document.
    r.raise_for_status()
    html_soup = BeautifulSoup(r.content, 'html.parser')
    return html_soup
html_soup = get_html()

In [12]:
# Locate the table that follows the last <h3> inside the first
# 'field-item even' <div>, then turn its header and body into a DataFrame
# restricted to the columns present in `violations`.
summary_table = (
    html_soup.find_all("div", {"class": 'field-item even'})[0]
    .find_all('h3')[-1]
    .find_next_sibling('table')
)

# Header cells, with non-breaking spaces removed.
cols = [
    th.get_text().replace(u'\xa0', u'')
    for th in summary_table.find("thead").find_all("th")
]

# One list of cell texts per body row; 'RTC_DATE' is rewritten to
# 'CALCULATED_RTC_DATE' in every cell before non-breaking spaces are removed.
rows = [
    [
        td.get_text().replace('RTC_DATE', 'CALCULATED_RTC_DATE').replace(u'\xa0', u'')
        for td in tr.find_all('td')
    ]
    for tr in summary_table.find("tbody").find_all("tr")
]

column_datatype = pd.DataFrame(rows, columns=cols)
column_datatype = column_datatype[column_datatype.Element.isin(violations.columns)]
column_datatype

Unnamed: 0,Element,Data Type,Length
0,PWSID,Char,9.0
1,SUBMISSIONYEARQUARTER,Char,6.0
2,VIOLATION_ID,Num,20.0
4,COMPL_PER_BEGIN_DATE,Date,
5,COMPL_PER_END_DATE,Date,
6,VIOLATION_CODE,Char,4.0
7,VIOLATION_CATEGORY_CODE,Char,5.0
8,IS_HEALTH_BASED_IND,Char,1.0
9,CONTAMINANT_CODE,Char,
10,VIOL_MEASURE,Num,15.0


In [42]:
# Build {column name: "NAME- description"} from the page's <p> elements: keep a
# paragraph only when it has exactly one <strong> tag whose text (non-breaking
# spaces removed) names a violations column, or 'RTC_DATE'.
wanted_names = list(violations.columns) + ['RTC_DATE']
data_dictionary = {}
for p in html_soup.find_all("div", {"class": 'field-item even'})[0].find_all('p'):
    strong_tags = p.find_all('strong')
    if len(strong_tags) != 1:
        continue
    element_name = strong_tags[0].getText().replace(u'\xa0', u'')
    if element_name in wanted_names:
        data_dictionary[element_name] = (
            p.getText().replace(u'\xa0', u' ').replace('RTC_DATE', 'CALCULATED_RTC_DATE')
        )

# Hand-written entries for columns not picked up from the page under their
# table name.
data_dictionary['COMPL_PER_BEGIN_DATE'] = 'COMPL_PER_BEGIN_DATE- START of the compliance period.'
data_dictionary['IS_HEALTH_BASED_IND'] = 'IS_HEALTH_BASED- Indicates if this is a health based violation. See also HEALTH_BASED.'
# Re-key the scraped RTC_DATE entry under the table's column name
# (raises KeyError if the page had no RTC_DATE paragraph).
data_dictionary['CALCULATED_RTC_DATE'] = data_dictionary.pop('RTC_DATE')
data_dictionary['CALCULATED_PUB_NOTIF_TIER'] = 'PUBLIC_NOTIFICATION_TIER- Numeric code for Public Notification Tier for the violation.'

pd.set_option('display.max_colwidth', None)

# COLUMN comes from the dictionary key, not from the text before the first '-':
# the hand-written entries above start with a different name than the column
# they describe, which would otherwise mislabel those rows.
col_desc = pd.DataFrame({
    'COLUMN': list(data_dictionary.keys()),
    'raw_desc': list(data_dictionary.values()),
})

# `n` must be passed by keyword: pandas >= 2.0 accepts only `pat` positionally.
# Splitting once keeps any later '-' inside the description text.
col_desc['DESCRIPTION'] = col_desc['raw_desc'].str.split('-', n=1, expand=True)[1]
col_desc = col_desc[['COLUMN', 'DESCRIPTION']]
col_desc

Unnamed: 0,COLUMN,DESCRIPTION
0,CONTAMINANT_CODE,A code value that represents a contaminant for which a public water system has incurred a violation of a primary drinking water regulation. A full description of the codes can be accessed in the SDWA_REF_CODE_VALUES.csv.
1,COMPL_PER_END_DATE,"End of the compliance period. Note that for open-ended compliance periods, the COMPL_PER_END_DATE is listed as blank (MM/DD/YYYY format)."
2,CORRECTIVE_ACTION_ID,Corrective action identifier
3,FEDERAL_MCL,A numeric value that represents the maximum contaminant level which was exceeded that led to the identification of an MCL violation for a public water system.
4,IS_MAJOR_VIOL_IND,A code value that indicates whether a Monitoring and Reporting (MR) violation is major or minor. The major versus minor designation does not apply to a sanitary survey MR violation.
5,PUBLIC_NOTIFICATION_TIER,Numeric code for Public Notification Tier for the violation.
6,PWSID,"A unique identifying code for a public water system in SDWIS. The PWSID consists of a two-letter state or region code, followed by seven digits."
7,RULE_CODE,Code for a National Drinking Water rule.
8,RULE_FAMILY_CODE,Code for rule family.
9,RULE_GROUP_CODE,Code that uniquely identifies a rule group.


### Generating the CREATE TABLE SQL statement for the 'violation' table that stores violation details

In [46]:
# Maps each scraped element name to the MySQL column type derived for it.
temp_dict = {}

def space(n):
    """Return the padding that aligns the type after a column name of length ``n``."""
    widest = max(len(name) for name in violations.columns)
    return ' ' * (widest + 5 - n)

def _sql_type(kind, length):
    """Translate a scraped data type and length string into a MySQL column type."""
    if kind == 'Char':
        # Char elements with no length listed fall back to VARCHAR(255).
        return 'VARCHAR(' + length + ')' if length != '' else 'VARCHAR(255)'
    if kind == 'Date':
        return 'DATE'
    # Every other scraped type is emitted as INT.
    return 'INT'

for element, kind, length in zip(
    column_datatype['Element'],
    column_datatype['Data Type'],
    column_datatype['Length'],
):
    temp_dict[element] = _sql_type(kind, length)

# Emit one aligned column definition per line, in the DataFrame's column
# order, with a comma after every line except the last.
print('CREATE TABLE violation (')
for col in violations.columns:
    line_end = ',\n' if col != violations.columns[-1] else '\n'
    print(f'\t{col}{space(len(col))}{temp_dict[col]}', end=line_end)
print(') ENGINE = InnoDB;')

CREATE TABLE violation (
	SUBMISSIONYEARQUARTER         VARCHAR(6),
	PWSID                         VARCHAR(9),
	VIOLATION_ID                  INT,
	COMPL_PER_BEGIN_DATE          DATE,
	COMPL_PER_END_DATE            DATE,
	VIOLATION_CODE                VARCHAR(4),
	VIOLATION_CATEGORY_CODE       VARCHAR(5),
	IS_HEALTH_BASED_IND           VARCHAR(1),
	CONTAMINANT_CODE              VARCHAR(255),
	VIOL_MEASURE                  INT,
	UNIT_OF_MEASURE               VARCHAR(19),
	FEDERAL_MCL                   VARCHAR(255),
	STATE_MCL                     VARCHAR(15),
	IS_MAJOR_VIOL_IND             VARCHAR(1),
	SEVERITY_IND_CNT              INT,
	CALCULATED_RTC_DATE           DATE,
	VIOLATION_STATUS              VARCHAR(11),
	PUBLIC_NOTIFICATION_TIER      INT,
	CALCULATED_PUB_NOTIF_TIER     INT,
	VIOL_ORIGINATOR_CODE          VARCHAR(4),
	SAMPLE_RESULT_ID              INT,
	CORRECTIVE_ACTION_ID          INT,
	RULE_CODE                     INT,
	RULE_GROUP_CODE               INT,
	RULE_FAMILY_CODE

In [52]:
# Print a MySQL LOAD DATA statement for VIOLATIONS.csv. Each CSV field is read
# into a user variable so that empty strings can be stored as NULL and date
# fields can be converted with STR_TO_DATE.
columns = violations.columns.to_list()
date_columns = ['COMPL_PER_BEGIN_DATE', 'COMPL_PER_END_DATE', 'CALCULATED_RTC_DATE','VIOL_FIRST_REPORTED_DATE', 'VIOL_LAST_REPORTED_DATE']

# The date columns are parsed with pd.to_datetime before VIOLATIONS.csv is
# written, and pandas writes datetimes as YYYY-MM-DD, so that is the format
# STR_TO_DATE has to parse (not the raw file's MM/DD/YYYY).
csv_date_format = '%Y-%m-%d'

print('''
LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/VIOLATIONS.csv'
INTO TABLE violation
FIELDS TERMINATED BY ','
ESCAPED BY ''
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\\n'
IGNORE 1 ROWS
''', end='')

# Variable list: one @variable per CSV field, in file order.
print('(' + ','.join(f'@{col}' for col in columns) + ')')
print('SET')

assignments = []
for col in columns:
    if col in date_columns:
        assignments.append(f"{col} = IF(@{col} = '', NULL, STR_TO_DATE(@{col}, '{csv_date_format}'))")
    else:
        assignments.append(f"{col} = IF(@{col} = '', NULL, @{col})")

# Join with commas so the final assignment is not followed by one, and close
# the statement with ';' — a trailing comma makes the SET clause invalid SQL.
print(',\n'.join(assignments) + ';')


LOAD DATA INFILE 'C:/ProgramData/MySQL/MySQL Server 8.0/Uploads/VIOLATIONS.csv'
INTO TABLE violation
FIELDS TERMINATED BY ','
ESCAPED BY ''
OPTIONALLY ENCLOSED BY '"'
LINES TERMINATED BY '\n'
IGNORE 1 ROWS
(@SUBMISSIONYEARQUARTER,@PWSID,@VIOLATION_ID,@COMPL_PER_BEGIN_DATE,@COMPL_PER_END_DATE,@VIOLATION_CODE,@VIOLATION_CATEGORY_CODE,@IS_HEALTH_BASED_IND,@CONTAMINANT_CODE,@VIOL_MEASURE,@UNIT_OF_MEASURE,@FEDERAL_MCL,@STATE_MCL,@IS_MAJOR_VIOL_IND,@SEVERITY_IND_CNT,@CALCULATED_RTC_DATE,@VIOLATION_STATUS,@PUBLIC_NOTIFICATION_TIER,@CALCULATED_PUB_NOTIF_TIER,@VIOL_ORIGINATOR_CODE,@SAMPLE_RESULT_ID,@CORRECTIVE_ACTION_ID,@RULE_CODE,@RULE_GROUP_CODE,@RULE_FAMILY_CODE,@VIOL_FIRST_REPORTED_DATE,@VIOL_LAST_REPORTED_DATE)
SET
SUBMISSIONYEARQUARTER = IF(@SUBMISSIONYEARQUARTER = '', NULL, @SUBMISSIONYEARQUARTER),
PWSID = IF(@PWSID = '', NULL, @PWSID),
VIOLATION_ID = IF(@VIOLATION_ID = '', NULL, @VIOLATION_ID),
COMPL_PER_BEGIN_DATE = IF(@COMPL_PER_BEGIN_DATE = '', NULL, STR_TO_DATE(@COMPL_PER_BEGIN_DAT