# Cleanup and Exploration of 2020 AQ Data

In [1]:
import pandas as pd
import json as json

In [2]:
# Load the raw air-quality export; the path is relative to the notebook's working directory.
df = pd.read_csv('2020 AQ.csv')

In [3]:
df.head()

Unnamed: 0,date,parameter,location,value,unit,city,attribution,averagingperiod,coordinates,country,sourcename,sourcetype,mobile
0,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",no2,Fresno - Drummond St,0.019,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,AirNow,government,False
1,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",o3,Fresno - Drummond St,0.014,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,AirNow,government,False
2,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",pm25,Huron,4.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.1994, longitude=-120.0991}",US,AirNow,government,False
3,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",o3,Tranquillity,0.017,ppm,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,AirNow,government,False
4,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",pm25,Tranquillity,16.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,AirNow,government,False


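The composite fields (`date`, `attribution`, `averagingperiod`, `coordinates`) are not valid JSON: they use unquoted `key=value` pairs. They will need processing code before they can be parsed.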

In [4]:
def validate_json_fields(field: str) -> bool:
    """Report whether ``field`` parses as JSON.

    Parameters
    ----------
    field : str
        Candidate JSON text (one cell of a composite column).

    Returns
    -------
    bool
        True if ``json.loads`` accepts the value, False otherwise.
    """
    try:
        json.loads(field)
    except (TypeError, ValueError):
        # ValueError (which includes json.JSONDecodeError): malformed JSON text.
        # TypeError: the cell is not text at all, e.g. the float NaN pandas
        # uses for missing values -- report it as invalid instead of letting
        # a single bad cell abort the whole `.apply`.
        return False
    return True

In [5]:
def validate_json_column(series: pd.Series):
    """Pair every entry of ``series`` with its JSON-validity flag.

    Returns a DataFrame with two columns: ``values`` (the entries as given)
    and ``validity`` (the result of ``validate_json_fields`` for each entry).
    """
    return pd.DataFrame({
        'values': series,
        'validity': series.apply(validate_json_fields),
    })

## Validation of composite fields as json

In [6]:
# Check whether the raw `date` strings already parse as JSON.
output = validate_json_column(df['date'])

In [7]:
output.head()

Unnamed: 0,values,validity
0,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",False
1,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",False
2,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",False
3,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",False
4,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",False


In [8]:
# Summarise only the rows that parsed as JSON (an empty summary means none did).
output.loc[output['validity'].eq(True)].describe()

Unnamed: 0,values,validity
count,0.0,0.0
unique,0.0,0.0
top,,
freq,,


In [9]:
# Same validity check for the raw `attribution` strings, followed by a preview.
output = validate_json_column(df['attribution'])
output.head()

Unnamed: 0,values,validity
0,"[{name=US EPA AirNow, url=http://www.airnow.go...",False
1,"[{name=US EPA AirNow, url=http://www.airnow.go...",False
2,"[{name=US EPA AirNow, url=http://www.airnow.go...",False
3,"[{name=US EPA AirNow, url=http://www.airnow.go...",False
4,"[{name=US EPA AirNow, url=http://www.airnow.go...",False


In [10]:
# Summarise only the rows that parsed as JSON (an empty summary means none did).
output.loc[output['validity'].eq(True)].describe()

Unnamed: 0,values,validity
count,0.0,0.0
unique,0.0,0.0
top,,
freq,,


In [11]:
# Same validity check for the raw `averagingperiod` strings, followed by a preview.
output = validate_json_column(df['averagingperiod'])
output.head()

Unnamed: 0,values,validity
0,"{unit=hours, value=1.0}",False
1,"{unit=hours, value=1.0}",False
2,"{unit=hours, value=1.0}",False
3,"{unit=hours, value=1.0}",False
4,"{unit=hours, value=1.0}",False


In [12]:
# Summarise only the rows that parsed as JSON (an empty summary means none did).
output.loc[output['validity'].eq(True)].describe()

Unnamed: 0,values,validity
count,0.0,0.0
unique,0.0,0.0
top,,
freq,,


In [13]:
# Same validity check for the raw `coordinates` strings, followed by a preview.
output = validate_json_column(df['coordinates'])
output.head()

Unnamed: 0,values,validity
0,"{latitude=36.7056, longitude=-119.7414}",False
1,"{latitude=36.7056, longitude=-119.7414}",False
2,"{latitude=36.1994, longitude=-120.0991}",False
3,"{latitude=36.6324, longitude=-120.3822}",False
4,"{latitude=36.6324, longitude=-120.3822}",False


In [14]:
# Summarise only the rows that parsed as JSON (an empty summary means none did).
output.loc[output['validity'].eq(True)].describe()

Unnamed: 0,values,validity
count,0.0,0.0
unique,0.0,0.0
top,,
freq,,


All of the composite fields are in a "JSON-like" format and need to be converted to valid JSON.

## Reprocessing the Multivalue Fields into Valid JSON

In [15]:
def correct_field_to_json(arg: str) -> str:
    """Rewrite one ``{key=value, ...}`` composite field as a flat JSON object string.

    The text is split on every comma and each ``key=value`` fragment becomes a
    ``"key":"value"`` member; every value is emitted as a JSON string. Keys
    and values are serialised with ``json.dumps`` so that quotes, backslashes
    and control characters inside the data are escaped and the result always
    parses.

    Known limitations, kept as-is because later cells refer to the resulting
    keys by their literal names and positions:

    * Only ``{`` is stripped on the left, so a list-valued field keeps its
      leading ``[`` in the first key (``[{name``).
    * Fragments are not trimmed, so every key after the first starts with the
      space that followed the comma (``" url"``, ``" longitude"``).
    * A value that itself contains a comma is split into several fragments;
      a fragment without ``=`` yields a key equal to the fragment minus its
      last character and a value equal to the whole fragment.
    * Nested/list structure is flattened into a single object, so repeated
      keys collapse when the result is parsed.

    Parameters
    ----------
    arg : str
        Raw composite field text.

    Returns
    -------
    str
        JSON object text.
    """
    body = arg.lstrip("{").rstrip("]}")
    members = []
    for fragment in body.split(","):
        separator = fragment.find('=')
        if separator == -1:
            # No '=' here (typically the tail of a value that contained a
            # comma). Spelled out explicitly; matches the historical result
            # of slicing with the -1 returned by `find`.
            key, value = fragment[:-1], fragment
        else:
            key, value = fragment[:separator], fragment[separator + 1:]
        # ensure_ascii=False keeps characters such as 'µ' and '³' verbatim.
        members.append(
            json.dumps(key, ensure_ascii=False)
            + ':'
            + json.dumps(value, ensure_ascii=False)
        )
    return '{' + ','.join(members) + '}'

In [16]:
def correct_json_column(series: pd.Series, column_name: str) -> pd.DataFrame:
    """Apply ``correct_field_to_json`` to each entry of ``series``.

    The converted strings are returned as a one-column DataFrame whose single
    column is labelled ``column_name``.
    """
    converted = series.apply(correct_field_to_json)
    return pd.DataFrame({column_name: converted})

In [17]:
# Convert the raw `date` strings, re-validate them, and summarise whatever
# still fails to parse (an empty summary means every row converted cleanly).
test = correct_json_column(df['date'], 'test date conversion')
results = validate_json_column(test['test date conversion'])
results.loc[results['validity'].eq(False)].describe()

Unnamed: 0,values,validity
count,0.0,0.0
unique,0.0,0.0
top,,
freq,,


In [18]:
# `test` is a one-column frame; store that single column of converted strings.
df['cleaneddate'] = test.iloc[:, 0]

In [19]:
# Convert the raw `attribution` strings, re-validate them, and summarise
# whatever still fails to parse (an empty summary means none failed).
test = correct_json_column(df['attribution'], 'test attribute conversion')
results = validate_json_column(test['test attribute conversion'])
results.loc[results['validity'].eq(False)].describe()

Unnamed: 0,values,validity
count,0.0,0.0
unique,0.0,0.0
top,,
freq,,


In [20]:
results.head()

Unnamed: 0,values,validity
0,"{""[{name"":""US EPA AirNow"","" url"":""http://www.a...",True
1,"{""[{name"":""US EPA AirNow"","" url"":""http://www.a...",True
2,"{""[{name"":""US EPA AirNow"","" url"":""http://www.a...",True
3,"{""[{name"":""US EPA AirNow"","" url"":""http://www.a...",True
4,"{""[{name"":""US EPA AirNow"","" url"":""http://www.a...",True


In [21]:
# `test` is a one-column frame; store that single column of converted strings.
df['cleanedattribution'] = test.iloc[:, 0]

In [22]:
# Convert the raw `averagingperiod` strings, re-validate them, and summarise
# whatever still fails to parse (an empty summary means none failed).
test = correct_json_column(df['averagingperiod'], 'test averaging period conversion')
results = validate_json_column(test['test averaging period conversion'])
results.loc[results['validity'].eq(False)].describe()

Unnamed: 0,values,validity
count,0.0,0.0
unique,0.0,0.0
top,,
freq,,


In [23]:
# `test` is a one-column frame; store that single column of converted strings.
df['cleanedaveragingperiod'] = test.iloc[:, 0]

In [24]:
# Convert the raw `coordinates` strings, re-validate them, and summarise
# whatever still fails to parse (an empty summary means none failed).
test = correct_json_column(df['coordinates'], 'test coordinate conversion')
results = validate_json_column(test['test coordinate conversion'])
results.loc[results['validity'].eq(False)].describe()

Unnamed: 0,values,validity
count,0.0,0.0
unique,0.0,0.0
top,,
freq,,


In [25]:
# `test` is a one-column frame; store that single column of converted strings.
df['cleanedcoordinates'] = test.iloc[:, 0]

In [26]:
df.head()

Unnamed: 0,date,parameter,location,value,unit,city,attribution,averagingperiod,coordinates,country,sourcename,sourcetype,mobile,cleaneddate,cleanedattribution,cleanedaveragingperiod,cleanedcoordinates
0,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",no2,Fresno - Drummond St,0.019,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,AirNow,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.7056"","" longitude"":""-119.7414""}"
1,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",o3,Fresno - Drummond St,0.014,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,AirNow,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.7056"","" longitude"":""-119.7414""}"
2,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",pm25,Huron,4.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.1994, longitude=-120.0991}",US,AirNow,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.1994"","" longitude"":""-120.0991""}"
3,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",o3,Tranquillity,0.017,ppm,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,AirNow,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}"
4,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",pm25,Tranquillity,16.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,AirNow,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}"


In [27]:
import ujson as ujson

In [28]:
# Parse every cleaned coordinates string into a dict, spread the dict keys
# into their own columns, and append those columns to `df`.
# (Replaces an earlier `.apply(json.loads).apply(pd.Series)` attempt.)
expanded_datascope = pd.json_normalize(df['cleanedcoordinates'].apply(ujson.loads))
df = pd.concat([df, expanded_datascope], axis=1)

In [29]:
df.head()

Unnamed: 0,date,parameter,location,value,unit,city,attribution,averagingperiod,coordinates,country,sourcename,sourcetype,mobile,cleaneddate,cleanedattribution,cleanedaveragingperiod,cleanedcoordinates,latitude,longitude
0,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",no2,Fresno - Drummond St,0.019,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,AirNow,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.7056"","" longitude"":""-119.7414""}",36.7056,-119.7414
1,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",o3,Fresno - Drummond St,0.014,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,AirNow,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.7056"","" longitude"":""-119.7414""}",36.7056,-119.7414
2,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",pm25,Huron,4.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.1994, longitude=-120.0991}",US,AirNow,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.1994"","" longitude"":""-120.0991""}",36.1994,-120.0991
3,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",o3,Tranquillity,0.017,ppm,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,AirNow,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822
4,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",pm25,Tranquillity,16.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,AirNow,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822


In [30]:
# Parse every cleaned date string into a dict, spread the dict keys into
# their own columns, and append those columns to `df`.
# (Replaces an earlier `.apply(ujson.loads).apply(pd.Series)` attempt.)
expanded_datascope = pd.json_normalize(df['cleaneddate'].apply(ujson.loads))
df = pd.concat([df, expanded_datascope], axis=1)

In [31]:
df.head()

Unnamed: 0,date,parameter,location,value,unit,city,attribution,averagingperiod,coordinates,country,...,sourcetype,mobile,cleaneddate,cleanedattribution,cleanedaveragingperiod,cleanedcoordinates,latitude,longitude,utc,local
0,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",no2,Fresno - Drummond St,0.019,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,...,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.7056"","" longitude"":""-119.7414""}",36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00
1,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",o3,Fresno - Drummond St,0.014,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,...,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.7056"","" longitude"":""-119.7414""}",36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00
2,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",pm25,Huron,4.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.1994, longitude=-120.0991}",US,...,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.1994"","" longitude"":""-120.0991""}",36.1994,-120.0991,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00
3,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",o3,Tranquillity,0.017,ppm,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,...,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00
4,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",pm25,Tranquillity,16.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,...,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00


In [32]:
# Parse every cleaned attribution string into a dict, spread the dict keys
# into their own columns, and append those columns to `df`.
# The earlier `.apply(json.loads).apply(pd.Series)` variant was extremely slow
# for this column, so the parsed dicts go through `pd.json_normalize` instead.
expanded_datascope = pd.json_normalize(df['cleanedattribution'].apply(ujson.loads))
df = pd.concat([df, expanded_datascope], axis=1)

In [33]:
df.head()

Unnamed: 0,date,parameter,location,value,unit,city,attribution,averagingperiod,coordinates,country,...,cleanedcoordinates,latitude,longitude,utc,local,[{name,url,{name,Great Lake,and Energ
0,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",no2,Fresno - Drummond St,0.019,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,...,"{""latitude"":""36.7056"","" longitude"":""-119.7414""}",36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,San Joaquin Valley Unified APCD,,
1,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",o3,Fresno - Drummond St,0.014,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,...,"{""latitude"":""36.7056"","" longitude"":""-119.7414""}",36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,San Joaquin Valley Unified APCD,,
2,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",pm25,Huron,4.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.1994, longitude=-120.0991}",US,...,"{""latitude"":""36.1994"","" longitude"":""-120.0991""}",36.1994,-120.0991,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,San Joaquin Valley Unified APCD,,
3,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",o3,Tranquillity,0.017,ppm,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,...,"{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,San Joaquin Valley Unified APCD,,
4,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",pm25,Tranquillity,16.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,...,"{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,San Joaquin Valley Unified APCD,,


In [34]:
# The label starts with a space and a brace because the attribution text was
# split on ',' without trimming, so the second entry's key came out as ' {name'.
df.drop(columns=[' {name'], inplace=True)

In [35]:
df.head()

Unnamed: 0,date,parameter,location,value,unit,city,attribution,averagingperiod,coordinates,country,...,cleanedaveragingperiod,cleanedcoordinates,latitude,longitude,utc,local,[{name,url,Great Lake,and Energ
0,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",no2,Fresno - Drummond St,0.019,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,...,"{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.7056"","" longitude"":""-119.7414""}",36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,
1,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",o3,Fresno - Drummond St,0.014,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,...,"{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.7056"","" longitude"":""-119.7414""}",36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,
2,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",pm25,Huron,4.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.1994, longitude=-120.0991}",US,...,"{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.1994"","" longitude"":""-120.0991""}",36.1994,-120.0991,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,
3,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",o3,Tranquillity,0.017,ppm,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,...,"{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,
4,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",pm25,Tranquillity,16.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,...,"{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,


In [36]:
# Parse every cleaned averaging-period string into a dict, spread the dict
# keys into their own columns, and append those columns to `df`.
# (Replaces an earlier `.apply(json.loads).apply(pd.Series)` attempt.)
expanded_datascope = pd.json_normalize(df['cleanedaveragingperiod'].apply(ujson.loads))
df = pd.concat([df, expanded_datascope], axis=1)

In [37]:
df.head()

Unnamed: 0,date,parameter,location,value,unit,city,attribution,averagingperiod,coordinates,country,...,latitude,longitude,utc,local,[{name,url,Great Lake,and Energ,unit.1,value.1
0,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",no2,Fresno - Drummond St,0.019,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,...,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
1,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",o3,Fresno - Drummond St,0.014,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,...,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
2,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",pm25,Huron,4.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.1994, longitude=-120.0991}",US,...,36.1994,-120.0991,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
3,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",o3,Tranquillity,0.017,ppm,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,...,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
4,"{utc=2020-02-06T17:00:00.000Z, local=2020-02-0...",pm25,Tranquillity,16.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,...,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0


In [38]:
# The raw `date` text is no longer needed once `utc` / `local` have been extracted.
df.drop(columns=['date'], inplace=True)

In [39]:
df.head()

Unnamed: 0,parameter,location,value,unit,city,attribution,averagingperiod,coordinates,country,sourcename,...,latitude,longitude,utc,local,[{name,url,Great Lake,and Energ,unit.1,value.1
0,no2,Fresno - Drummond St,0.019,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,AirNow,...,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
1,o3,Fresno - Drummond St,0.014,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,AirNow,...,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
2,pm25,Huron,4.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.1994, longitude=-120.0991}",US,AirNow,...,36.1994,-120.0991,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
3,o3,Tranquillity,0.017,ppm,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,AirNow,...,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
4,pm25,Tranquillity,16.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,AirNow,...,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0


In [40]:
# Remove the remaining raw composite columns and the cleaned intermediates
# whose contents have already been expanded into separate columns.
df.drop(
    columns=[
        'attribution',
        'averagingperiod',
        'coordinates',
        'cleanedaveragingperiod',
        'cleanedcoordinates',
    ],
    inplace=True,
)

In [41]:
df.head()

Unnamed: 0,parameter,location,value,unit,city,country,sourcename,sourcetype,mobile,cleaneddate,...,latitude,longitude,utc,local,[{name,url,Great Lake,and Energ,unit.1,value.1
0,no2,Fresno - Drummond St,0.019,ppm,Fresno,US,AirNow,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...",...,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
1,o3,Fresno - Drummond St,0.014,ppm,Fresno,US,AirNow,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...",...,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
2,pm25,Huron,4.0,µg/m³,FRESNO,US,AirNow,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...",...,36.1994,-120.0991,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
3,o3,Tranquillity,0.017,ppm,FRESNO,US,AirNow,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...",...,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
4,pm25,Tranquillity,16.0,µg/m³,FRESNO,US,AirNow,government,False,"{""utc"":""2020-02-06T17:00:00.000Z"","" local"":""20...",...,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0


In [42]:
# Remove the last two cleaned intermediate columns.
df.drop(columns=['cleaneddate', 'cleanedattribution'], inplace=True)

In [43]:
df.head()

Unnamed: 0,parameter,location,value,unit,city,country,sourcename,sourcetype,mobile,latitude,longitude,utc,local,[{name,url,Great Lake,and Energ,unit.1,value.1
0,no2,Fresno - Drummond St,0.019,ppm,Fresno,US,AirNow,government,False,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
1,o3,Fresno - Drummond St,0.014,ppm,Fresno,US,AirNow,government,False,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
2,pm25,Huron,4.0,µg/m³,FRESNO,US,AirNow,government,False,36.1994,-120.0991,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
3,o3,Tranquillity,0.017,ppm,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
4,pm25,Tranquillity,16.0,µg/m³,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0


In [44]:
# Rename by position: the frame now holds two columns labelled `unit` (the
# measurement unit and the parsed averaging-period unit), so a label-based
# rename cannot single one out.
# The labels are rebuilt as a list and assigned back instead of writing into
# `df.columns.values`: an Index is meant to be immutable, and editing its
# backing array in place can leave label lookups stale.
column_labels = df.columns.tolist()
column_labels[3] = 'concentration unit'
df.columns = column_labels

In [45]:
df.head()

Unnamed: 0,parameter,location,value,concentration unit,city,country,sourcename,sourcetype,mobile,latitude,longitude,utc,local,[{name,url,Great Lake,and Energ,unit,value.1
0,no2,Fresno - Drummond St,0.019,ppm,Fresno,US,AirNow,government,False,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
1,o3,Fresno - Drummond St,0.014,ppm,Fresno,US,AirNow,government,False,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
2,pm25,Huron,4.0,µg/m³,FRESNO,US,AirNow,government,False,36.1994,-120.0991,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
3,o3,Tranquillity,0.017,ppm,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
4,pm25,Tranquillity,16.0,µg/m³,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0


In [46]:
# Relabel position 15 by rebuilding the column list instead of writing into
# `df.columns.values` (an Index is meant to be immutable, and in-place edits
# to its backing array can leave label lookups stale).
# NOTE(review): in the preceding preview position 15 is the stray
# `Great Lake` attribution fragment, not the parsed averaging-period unit --
# confirm this is the intended column.
column_labels = df.columns.tolist()
column_labels[15] = 'averaging time unit'
df.columns = column_labels

In [47]:
# Position 13 holds the first attribution name (labelled `[{name` after the
# expansion); give it a readable label.
# The column list is rebuilt and assigned back instead of writing into
# `df.columns.values`: an Index is meant to be immutable, and in-place edits
# to its backing array can leave label lookups stale.
column_labels = df.columns.tolist()
column_labels[13] = 'source'
df.columns = column_labels

In [48]:
df.head()

Unnamed: 0,parameter,location,value,concentration unit,city,country,sourcename,sourcetype,mobile,latitude,longitude,utc,local,source,url,averaging time unit,and Energ,unit,value.1
0,no2,Fresno - Drummond St,0.019,ppm,Fresno,US,AirNow,government,False,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
1,o3,Fresno - Drummond St,0.014,ppm,Fresno,US,AirNow,government,False,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
2,pm25,Huron,4.0,µg/m³,FRESNO,US,AirNow,government,False,36.1994,-120.0991,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
3,o3,Tranquillity,0.017,ppm,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
4,pm25,Tranquillity,16.0,µg/m³,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0


In [49]:
# Relabel position 16 by rebuilding the column list instead of writing into
# `df.columns.values` (an Index is meant to be immutable, and in-place edits
# to its backing array can leave label lookups stale).
# NOTE(review): in the preceding preview position 16 is the stray
# `and Energ` attribution fragment, not the parsed averaging-period value --
# confirm this is the intended column.
column_labels = df.columns.tolist()
column_labels[16] = 'averaging time'
df.columns = column_labels

In [50]:
df.head()

Unnamed: 0,parameter,location,value,concentration unit,city,country,sourcename,sourcetype,mobile,latitude,longitude,utc,local,source,url,averaging time unit,averaging time,unit,value.1
0,no2,Fresno - Drummond St,0.019,ppm,Fresno,US,AirNow,government,False,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
1,o3,Fresno - Drummond St,0.014,ppm,Fresno,US,AirNow,government,False,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
2,pm25,Huron,4.0,µg/m³,FRESNO,US,AirNow,government,False,36.1994,-120.0991,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
3,o3,Tranquillity,0.017,ppm,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0
4,pm25,Tranquillity,16.0,µg/m³,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,,,hours,1.0


In [51]:
# Write the frame without its index. The same file name is written again by
# the final cell of the notebook, which overwrites this export.
df.to_csv('2020 AQ Clean.csv', index=False)

In [52]:
# The two columns currently carrying these labels are the empty attribution
# fragments seen in the preview above; remove them and show the result.
df.drop(columns=['averaging time unit', 'averaging time'], inplace=True)
df.head()

Unnamed: 0,parameter,location,value,concentration unit,city,country,sourcename,sourcetype,mobile,latitude,longitude,utc,local,source,url,unit,value.1
0,no2,Fresno - Drummond St,0.019,ppm,Fresno,US,AirNow,government,False,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,hours,1.0
1,o3,Fresno - Drummond St,0.014,ppm,Fresno,US,AirNow,government,False,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,hours,1.0
2,pm25,Huron,4.0,µg/m³,FRESNO,US,AirNow,government,False,36.1994,-120.0991,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,hours,1.0
3,o3,Tranquillity,0.017,ppm,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,hours,1.0
4,pm25,Tranquillity,16.0,µg/m³,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,hours,1.0


In [53]:
# Positions 15 and 16 are the last two columns of the preceding preview: the
# parsed averaging-period unit and value. They are renamed by position because
# `unit` is not a unique label.
# The column list is rebuilt and assigned back instead of writing into
# `df.columns.values`: an Index is meant to be immutable, and in-place edits
# to its backing array can leave label lookups stale.
column_labels = df.columns.tolist()
column_labels[15] = 'averaging time unit'
column_labels[16] = 'averaging time'
df.columns = column_labels
df.head()

Unnamed: 0,parameter,location,value,concentration unit,city,country,sourcename,sourcetype,mobile,latitude,longitude,utc,local,source,url,averaging time unit,averaging time
0,no2,Fresno - Drummond St,0.019,ppm,Fresno,US,AirNow,government,False,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,hours,1.0
1,o3,Fresno - Drummond St,0.014,ppm,Fresno,US,AirNow,government,False,36.7056,-119.7414,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,hours,1.0
2,pm25,Huron,4.0,µg/m³,FRESNO,US,AirNow,government,False,36.1994,-120.0991,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,hours,1.0
3,o3,Tranquillity,0.017,ppm,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,hours,1.0
4,pm25,Tranquillity,16.0,µg/m³,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2020-02-06T17:00:00.000Z,2020-02-06T09:00:00-08:00,US EPA AirNow,,hours,1.0


In [54]:
# Final export of the cleaned frame, written without the index column.
df.to_csv('2020 AQ Clean.csv', index=False)