# Cleanup and Exploration of 2017 AQ Data

In [5]:
import pandas as pd
import json as json

In [6]:
# Load the raw 2017 AQ export; the path is relative to the notebook's working directory.
df = pd.read_csv('2017 AQ.csv')

In [7]:
df.head()

Unnamed: 0,date,parameter,location,value,unit,city,attribution,averagingperiod,coordinates,country,sourcename,sourcetype,mobile
0,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Fresno - Drummond St,0.063,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,AirNow,government,False
1,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",pm25,Huron,21.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.1994, longitude=-120.0991}",US,AirNow,government,False
2,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Tranquillity,0.06,ppm,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,AirNow,government,False
3,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",pm25,Tranquillity,21.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,AirNow,government,False
4,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Parlier,0.072,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.5975, longitude=-119.5036}",US,AirNow,government,False


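The composite entries (`date`, `attribution`, `averagingperiod`, `coordinates`) are not valid JSON. These fields will need processing code before they can be parsed.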

In [8]:
def validate_json_fields(field: str) -> bool:
    """Report whether a single cell value parses as JSON.

    Args:
        field: The raw cell value to test, normally a string.

    Returns:
        True if ``json.loads`` accepts ``field``. False if the text is not
        valid JSON, or if ``field`` is not a str/bytes/bytearray at all
        (for example ``None`` or a float ``NaN`` standing in for a missing
        cell).
    """
    try:
        json.loads(field)
    except (TypeError, ValueError):
        # ValueError covers malformed JSON text (JSONDecodeError subclasses
        # it); TypeError is what json.loads raises for non-string input, which
        # would otherwise abort a whole-column apply().
        return False
    return True

In [9]:
def validate_json_column(series: pd.Series):
    """Pair every value of ``series`` with the outcome of its JSON check.

    Returns a DataFrame with two columns: 'values' holds the input series
    and 'validity' holds the result of ``validate_json_fields`` for each
    element.
    """
    return pd.DataFrame({
        'values': series,
        'validity': series.apply(validate_json_fields),
    })

## Validation of Composite Fields as JSON

In [10]:
# Check whether the raw `date` entries already parse as JSON.
output = validate_json_column(df['date'])

In [11]:
output.head()

Unnamed: 0,values,validity
0,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",False
1,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",False
2,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",False
3,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",False
4,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",False


In [13]:
# Summarise only the rows whose value passed the JSON check.
output.loc[output['validity'].eq(True)].describe()

Unnamed: 0,values,validity
count,0.0,0.0
unique,0.0,0.0
top,,
freq,,


In [14]:
# Same validity check for the raw `attribution` column.
output = validate_json_column(df['attribution'])
output.head()

Unnamed: 0,values,validity
0,"[{name=US EPA AirNow, url=http://www.airnow.go...",False
1,"[{name=US EPA AirNow, url=http://www.airnow.go...",False
2,"[{name=US EPA AirNow, url=http://www.airnow.go...",False
3,"[{name=US EPA AirNow, url=http://www.airnow.go...",False
4,"[{name=US EPA AirNow, url=http://www.airnow.go...",False


In [15]:
# Summarise only the rows whose value passed the JSON check.
output.loc[output['validity'].eq(True)].describe()

Unnamed: 0,values,validity
count,0.0,0.0
unique,0.0,0.0
top,,
freq,,


In [16]:
# Same validity check for the raw `averagingperiod` column.
output = validate_json_column(df['averagingperiod'])
output.head()

Unnamed: 0,values,validity
0,"{unit=hours, value=1.0}",False
1,"{unit=hours, value=1.0}",False
2,"{unit=hours, value=1.0}",False
3,"{unit=hours, value=1.0}",False
4,"{unit=hours, value=1.0}",False


In [17]:
# Summarise only the rows whose value passed the JSON check.
output.loc[output['validity'].eq(True)].describe()

Unnamed: 0,values,validity
count,0.0,0.0
unique,0.0,0.0
top,,
freq,,


In [18]:
# Same validity check for the raw `coordinates` column.
output = validate_json_column(df['coordinates'])
output.head()

Unnamed: 0,values,validity
0,"{latitude=36.7056, longitude=-119.7414}",False
1,"{latitude=36.1994, longitude=-120.0991}",False
2,"{latitude=36.6324, longitude=-120.3822}",False
3,"{latitude=36.6324, longitude=-120.3822}",False
4,"{latitude=36.5975, longitude=-119.5036}",False


In [19]:
# Summarise only the rows whose value passed the JSON check.
output.loc[output['validity'].eq(True)].describe()

Unnamed: 0,values,validity
count,0.0,0.0
unique,0.0,0.0
top,,
freq,,


All of the composite fields are in a "JSON-like" format and need to be converted to valid JSON.

## Reprocessing the Multivalue Fields into Valid JSON

In [20]:
def correct_field_to_json(arg: str) -> str:
    """Rewrite a ``{key=value, key=value}`` string as a JSON object string.

    The input is trimmed of leading ``{`` and trailing ``]`` / ``}``
    characters, split on commas, and each ``key=value`` fragment (split at
    its first ``=``) becomes one ``"key":"value"`` member. Every value is
    emitted as a JSON string, including numeric-looking ones.

    Keys are kept exactly as they appear between the commas, so they may
    carry a leading space or bracket characters (for example ``" url"`` or
    ``"[{name"``). That text is intentionally not normalised here because a
    later cell drops a column by such a key's exact text (``' {name'``).
    Repeated keys are emitted as-is; a JSON parser will typically keep only
    the last one.

    Args:
        arg: The raw "JSON-like" cell text.

    Returns:
        A JSON object string. Quotes, backslashes and control characters in
        keys or values are escaped so the result always parses. A fragment
        with no ``=`` is treated as the continuation of the previous value
        (its comma is restored). Input with no content yields ``"{}"``.
    """
    # Same trimming as the original implementation: only the outermost
    # opening braces and closing brackets/braces are removed.
    body = arg.lstrip("{").rstrip("]}")

    pairs = []  # [key, value] lists, in input order
    for fragment in body.split(","):
        key, separator, value = fragment.partition("=")
        if separator:
            pairs.append([key, value])
        elif pairs:
            # No "=" here, so the comma we split on was part of the previous
            # value; stitch the text back together instead of inventing a key.
            pairs[-1][1] += "," + fragment
        elif fragment.strip():
            # Leading text with no "=" and nothing to attach it to: keep it
            # as a key with an empty value rather than dropping data.
            pairs.append([fragment, ""])

    # json.dumps handles escaping; ensure_ascii=False keeps non-ASCII text
    # readable and identical to the input.
    members = (
        json.dumps(key, ensure_ascii=False) + ":" + json.dumps(value, ensure_ascii=False)
        for key, value in pairs
    )
    return "{" + ",".join(members) + "}"

In [21]:
def correct_json_column(series: pd.Series, column_name: str) -> pd.DataFrame:
    """Convert every entry of ``series`` with ``correct_field_to_json``.

    The converted strings are returned as a one-column DataFrame whose
    single column is labelled ``column_name``.
    """
    converted = series.apply(correct_field_to_json)
    return converted.to_frame(name=column_name)

In [22]:
# Convert `date`, re-validate the converted strings, and summarise any rows
# that still fail the JSON check.
test = correct_json_column(df['date'], 'test date conversion')
results = validate_json_column(test.loc[:, 'test date conversion'])
results.loc[results['validity'].eq(False)].describe()

Unnamed: 0,values,validity
count,0.0,0.0
unique,0.0,0.0
top,,
freq,,


In [23]:
# Keep the converted strings alongside the raw column.
df['cleaneddate'] = test['test date conversion']

In [24]:
# Convert `attribution`, re-validate the converted strings, and summarise any
# rows that still fail the JSON check.
test = correct_json_column(df['attribution'], 'test attribute conversion')
results = validate_json_column(test.loc[:, 'test attribute conversion'])
results.loc[results['validity'].eq(False)].describe()

Unnamed: 0,values,validity
count,0.0,0.0
unique,0.0,0.0
top,,
freq,,


In [25]:
results.head()

Unnamed: 0,values,validity
0,"{""[{name"":""US EPA AirNow"","" url"":""http://www.a...",True
1,"{""[{name"":""US EPA AirNow"","" url"":""http://www.a...",True
2,"{""[{name"":""US EPA AirNow"","" url"":""http://www.a...",True
3,"{""[{name"":""US EPA AirNow"","" url"":""http://www.a...",True
4,"{""[{name"":""US EPA AirNow"","" url"":""http://www.a...",True


In [26]:
# Keep the converted strings alongside the raw column.
df['cleanedattribution'] = test['test attribute conversion']

In [27]:
# Convert `averagingperiod`, re-validate the converted strings, and summarise
# any rows that still fail the JSON check.
test = correct_json_column(df['averagingperiod'], 'test averaging period conversion')
results = validate_json_column(test.loc[:, 'test averaging period conversion'])
results.loc[results['validity'].eq(False)].describe()

Unnamed: 0,values,validity
count,0.0,0.0
unique,0.0,0.0
top,,
freq,,


In [28]:
# Keep the converted strings alongside the raw column.
df['cleanedaveragingperiod'] = test['test averaging period conversion']

In [29]:
# Convert `coordinates`, re-validate the converted strings, and summarise any
# rows that still fail the JSON check.
test = correct_json_column(df['coordinates'], 'test coordinate conversion')
results = validate_json_column(test.loc[:, 'test coordinate conversion'])
results.loc[results['validity'].eq(False)].describe()

Unnamed: 0,values,validity
count,0.0,0.0
unique,0.0,0.0
top,,
freq,,


In [30]:
# Keep the converted strings alongside the raw column.
df['cleanedcoordinates'] = test['test coordinate conversion']

In [31]:
df.head()

Unnamed: 0,date,parameter,location,value,unit,city,attribution,averagingperiod,coordinates,country,sourcename,sourcetype,mobile,cleaneddate,cleanedattribution,cleanedaveragingperiod,cleanedcoordinates
0,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Fresno - Drummond St,0.063,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,AirNow,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.7056"","" longitude"":""-119.7414""}"
1,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",pm25,Huron,21.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.1994, longitude=-120.0991}",US,AirNow,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.1994"","" longitude"":""-120.0991""}"
2,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Tranquillity,0.06,ppm,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,AirNow,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}"
3,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",pm25,Tranquillity,21.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,AirNow,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}"
4,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Parlier,0.072,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.5975, longitude=-119.5036}",US,AirNow,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.5975"","" longitude"":""-119.5036""}"


In [32]:
# Start the transformed frame from the cleaned one. This is a plain
# assignment, so `df_trans` and `df` name the same object until `df_trans`
# is rebound by the concat in the next cell.
df_trans = df
# ujson is used by the following cells to parse the cleaned JSON strings.
import ujson as ujson

In [33]:
# Parse each cleaned coordinate string into a dict, flatten the dicts into
# columns, and append those columns to the working frame.
expanded_datascope = pd.json_normalize(df_trans['cleanedcoordinates'].apply(ujson.loads))
df_trans = pd.concat([df_trans, expanded_datascope], axis='columns')

In [34]:
df_trans.head()

Unnamed: 0,date,parameter,location,value,unit,city,attribution,averagingperiod,coordinates,country,sourcename,sourcetype,mobile,cleaneddate,cleanedattribution,cleanedaveragingperiod,cleanedcoordinates,latitude,longitude
0,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Fresno - Drummond St,0.063,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,AirNow,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.7056"","" longitude"":""-119.7414""}",36.7056,-119.7414
1,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",pm25,Huron,21.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.1994, longitude=-120.0991}",US,AirNow,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.1994"","" longitude"":""-120.0991""}",36.1994,-120.0991
2,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Tranquillity,0.06,ppm,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,AirNow,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822
3,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",pm25,Tranquillity,21.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,AirNow,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822
4,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Parlier,0.072,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.5975, longitude=-119.5036}",US,AirNow,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.5975"","" longitude"":""-119.5036""}",36.5975,-119.5036


In [35]:
# Parse each cleaned date string into a dict, flatten the dicts into columns,
# and append those columns to the working frame.
expanded_datascope = pd.json_normalize(df_trans['cleaneddate'].apply(ujson.loads))
df_trans = pd.concat([df_trans, expanded_datascope], axis='columns')

In [36]:
df_trans.head()

Unnamed: 0,date,parameter,location,value,unit,city,attribution,averagingperiod,coordinates,country,...,sourcetype,mobile,cleaneddate,cleanedattribution,cleanedaveragingperiod,cleanedcoordinates,latitude,longitude,utc,local
0,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Fresno - Drummond St,0.063,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,...,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.7056"","" longitude"":""-119.7414""}",36.7056,-119.7414,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00
1,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",pm25,Huron,21.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.1994, longitude=-120.0991}",US,...,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.1994"","" longitude"":""-120.0991""}",36.1994,-120.0991,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00
2,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Tranquillity,0.06,ppm,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,...,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00
3,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",pm25,Tranquillity,21.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,...,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00
4,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Parlier,0.072,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.5975, longitude=-119.5036}",US,...,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.5975"","" longitude"":""-119.5036""}",36.5975,-119.5036,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00


In [37]:
# Parse each cleaned attribution string into a dict, flatten the dicts into
# columns, and append those columns to the working frame.
# An earlier `.apply(json.loads).apply(pd.Series)` version of this step was
# extremely slow; ujson + json_normalize is the attempt to speed it up.
expanded_datascope = pd.json_normalize(df_trans['cleanedattribution'].apply(ujson.loads))
df_trans = pd.concat([df_trans, expanded_datascope], axis='columns')

In [38]:
df_trans.head()

Unnamed: 0,date,parameter,location,value,unit,city,attribution,averagingperiod,coordinates,country,...,cleanedattribution,cleanedaveragingperiod,cleanedcoordinates,latitude,longitude,utc,local,[{name,url,{name
0,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Fresno - Drummond St,0.063,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,...,"{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.7056"","" longitude"":""-119.7414""}",36.7056,-119.7414,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,San Joaquin Valley Unified APCD
1,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",pm25,Huron,21.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.1994, longitude=-120.0991}",US,...,"{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.1994"","" longitude"":""-120.0991""}",36.1994,-120.0991,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,San Joaquin Valley Unified APCD
2,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Tranquillity,0.06,ppm,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,...,"{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,San Joaquin Valley Unified APCD
3,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",pm25,Tranquillity,21.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,...,"{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,San Joaquin Valley Unified APCD
4,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Parlier,0.072,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.5975, longitude=-119.5036}",US,...,"{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.5975"","" longitude"":""-119.5036""}",36.5975,-119.5036,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,San Joaquin Valley Unified APCD


In [39]:
# Remove the ' {name' column that the attribution expansion created.
df_trans.drop(columns=' {name', inplace=True)

In [41]:
df_trans.head()

Unnamed: 0,date,parameter,location,value,unit,city,attribution,averagingperiod,coordinates,country,...,cleaneddate,cleanedattribution,cleanedaveragingperiod,cleanedcoordinates,latitude,longitude,utc,local,[{name,url
0,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Fresno - Drummond St,0.063,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,...,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.7056"","" longitude"":""-119.7414""}",36.7056,-119.7414,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,
1,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",pm25,Huron,21.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.1994, longitude=-120.0991}",US,...,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.1994"","" longitude"":""-120.0991""}",36.1994,-120.0991,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,
2,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Tranquillity,0.06,ppm,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,...,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,
3,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",pm25,Tranquillity,21.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,...,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,
4,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Parlier,0.072,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.5975, longitude=-119.5036}",US,...,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...","{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.5975"","" longitude"":""-119.5036""}",36.5975,-119.5036,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,


In [42]:
# Parse each cleaned averaging-period string into a dict, flatten the dicts
# into columns, and append those columns to the working frame.
expanded_datascope = pd.json_normalize(df_trans['cleanedaveragingperiod'].apply(ujson.loads))
df_trans = pd.concat([df_trans, expanded_datascope], axis='columns')

In [43]:
df_trans.head()

Unnamed: 0,date,parameter,location,value,unit,city,attribution,averagingperiod,coordinates,country,...,cleanedaveragingperiod,cleanedcoordinates,latitude,longitude,utc,local,[{name,url,unit.1,value.1
0,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Fresno - Drummond St,0.063,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,...,"{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.7056"","" longitude"":""-119.7414""}",36.7056,-119.7414,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
1,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",pm25,Huron,21.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.1994, longitude=-120.0991}",US,...,"{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.1994"","" longitude"":""-120.0991""}",36.1994,-120.0991,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
2,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Tranquillity,0.06,ppm,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,...,"{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
3,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",pm25,Tranquillity,21.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,...,"{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
4,"{utc=2017-08-10T22:00:00.000Z, local=2017-08-1...",o3,Parlier,0.072,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.5975, longitude=-119.5036}",US,...,"{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.5975"","" longitude"":""-119.5036""}",36.5975,-119.5036,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0


In [44]:
# The raw `date` text is no longer needed once it has been expanded.
df_trans.drop(columns='date', inplace=True)

In [45]:
df_trans.head()

Unnamed: 0,parameter,location,value,unit,city,attribution,averagingperiod,coordinates,country,sourcename,...,cleanedaveragingperiod,cleanedcoordinates,latitude,longitude,utc,local,[{name,url,unit.1,value.1
0,o3,Fresno - Drummond St,0.063,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.7056, longitude=-119.7414}",US,AirNow,...,"{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.7056"","" longitude"":""-119.7414""}",36.7056,-119.7414,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
1,pm25,Huron,21.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.1994, longitude=-120.0991}",US,AirNow,...,"{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.1994"","" longitude"":""-120.0991""}",36.1994,-120.0991,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
2,o3,Tranquillity,0.06,ppm,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,AirNow,...,"{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
3,pm25,Tranquillity,21.0,µg/m³,FRESNO,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.6324, longitude=-120.3822}",US,AirNow,...,"{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.6324"","" longitude"":""-120.3822""}",36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
4,o3,Parlier,0.072,ppm,Fresno,"[{name=US EPA AirNow, url=http://www.airnow.go...","{unit=hours, value=1.0}","{latitude=36.5975, longitude=-119.5036}",US,AirNow,...,"{""unit"":""hours"","" value"":""1.0""}","{""latitude"":""36.5975"","" longitude"":""-119.5036""}",36.5975,-119.5036,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0


In [46]:
# Remove the remaining raw composite columns and two of the cleaned JSON
# string columns that have already been expanded.
df_trans.drop(
    columns=[
        'attribution',
        'averagingperiod',
        'coordinates',
        'cleanedaveragingperiod',
        'cleanedcoordinates',
    ],
    inplace=True,
)

In [47]:
df_trans.head()

Unnamed: 0,parameter,location,value,unit,city,country,sourcename,sourcetype,mobile,cleaneddate,cleanedattribution,latitude,longitude,utc,local,[{name,url,unit.1,value.1
0,o3,Fresno - Drummond St,0.063,ppm,Fresno,US,AirNow,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...",36.7056,-119.7414,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
1,pm25,Huron,21.0,µg/m³,FRESNO,US,AirNow,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...",36.1994,-120.0991,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
2,o3,Tranquillity,0.06,ppm,FRESNO,US,AirNow,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...",36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
3,pm25,Tranquillity,21.0,µg/m³,FRESNO,US,AirNow,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...",36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
4,o3,Parlier,0.072,ppm,Fresno,US,AirNow,government,False,"{""utc"":""2017-08-10T22:00:00.000Z"","" local"":""20...","{""[{name"":""US EPA AirNow"","" url"":""http://www.a...",36.5975,-119.5036,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0


In [48]:
# Remove the last two cleaned JSON string columns.
df_trans.drop(columns=['cleaneddate', 'cleanedattribution'], inplace=True)

In [49]:
df_trans.head()

Unnamed: 0,parameter,location,value,unit,city,country,sourcename,sourcetype,mobile,latitude,longitude,utc,local,[{name,url,unit.1,value.1
0,o3,Fresno - Drummond St,0.063,ppm,Fresno,US,AirNow,government,False,36.7056,-119.7414,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
1,pm25,Huron,21.0,µg/m³,FRESNO,US,AirNow,government,False,36.1994,-120.0991,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
2,o3,Tranquillity,0.06,ppm,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
3,pm25,Tranquillity,21.0,µg/m³,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
4,o3,Parlier,0.072,ppm,Fresno,US,AirNow,government,False,36.5975,-119.5036,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0


In [50]:
# Rename by position: column 3 is the measurement's `unit` column.
# The labels are copied to a list and assigned back as a whole, because an
# Index is meant to be immutable and writing into `columns.values` changes
# its backing array without refreshing the Index's cached lookups.
column_labels = df_trans.columns.tolist()
column_labels[3] = 'concentration unit'
df_trans.columns = column_labels

In [51]:
df_trans.head()

Unnamed: 0,parameter,location,value,concentration unit,city,country,sourcename,sourcetype,mobile,latitude,longitude,utc,local,[{name,url,unit,value.1
0,o3,Fresno - Drummond St,0.063,ppm,Fresno,US,AirNow,government,False,36.7056,-119.7414,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
1,pm25,Huron,21.0,µg/m³,FRESNO,US,AirNow,government,False,36.1994,-120.0991,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
2,o3,Tranquillity,0.06,ppm,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
3,pm25,Tranquillity,21.0,µg/m³,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
4,o3,Parlier,0.072,ppm,Fresno,US,AirNow,government,False,36.5975,-119.5036,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0


In [52]:
# Rename by position: column 15 is the unit column produced by expanding the
# averaging period. The labels are copied to a list and assigned back as a
# whole, because an Index is meant to be immutable and writing into
# `columns.values` changes its backing array without refreshing the Index's
# cached lookups.
column_labels = df_trans.columns.tolist()
column_labels[15] = 'averaging time unit'
df_trans.columns = column_labels

In [53]:
# Rename by position: column 13 is the '[{name' column produced by expanding
# the attribution. The labels are copied to a list and assigned back as a
# whole, because an Index is meant to be immutable and writing into
# `columns.values` changes its backing array without refreshing the Index's
# cached lookups.
column_labels = df_trans.columns.tolist()
column_labels[13] = 'source'
df_trans.columns = column_labels

In [54]:
df_trans.head()

Unnamed: 0,parameter,location,value,concentration unit,city,country,sourcename,sourcetype,mobile,latitude,longitude,utc,local,source,url,averaging time unit,value.1
0,o3,Fresno - Drummond St,0.063,ppm,Fresno,US,AirNow,government,False,36.7056,-119.7414,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
1,pm25,Huron,21.0,µg/m³,FRESNO,US,AirNow,government,False,36.1994,-120.0991,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
2,o3,Tranquillity,0.06,ppm,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
3,pm25,Tranquillity,21.0,µg/m³,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
4,o3,Parlier,0.072,ppm,Fresno,US,AirNow,government,False,36.5975,-119.5036,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0


In [55]:
# Rename by position: column 16 is the value column produced by expanding the
# averaging period. The labels are copied to a list and assigned back as a
# whole, because an Index is meant to be immutable and writing into
# `columns.values` changes its backing array without refreshing the Index's
# cached lookups.
column_labels = df_trans.columns.tolist()
column_labels[16] = 'averaging time'
df_trans.columns = column_labels

In [56]:
df_trans.head()

Unnamed: 0,parameter,location,value,concentration unit,city,country,sourcename,sourcetype,mobile,latitude,longitude,utc,local,source,url,averaging time unit,averaging time
0,o3,Fresno - Drummond St,0.063,ppm,Fresno,US,AirNow,government,False,36.7056,-119.7414,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
1,pm25,Huron,21.0,µg/m³,FRESNO,US,AirNow,government,False,36.1994,-120.0991,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
2,o3,Tranquillity,0.06,ppm,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
3,pm25,Tranquillity,21.0,µg/m³,FRESNO,US,AirNow,government,False,36.6324,-120.3822,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0
4,o3,Parlier,0.072,ppm,Fresno,US,AirNow,government,False,36.5975,-119.5036,2017-08-10T22:00:00.000Z,2017-08-10T14:00:00-08:00,US EPA AirNow,,hours,1.0


In [57]:
# Write the flattened, renamed frame to disk; index=False leaves the row index out of the file.
df_trans.to_csv('2017 AQ Clean.csv', index=False)