In [1]:
import numpy as np
import pandas as pd

# Read the 1999 daily PM2.5 data.
# read_csv is the pandas reader intended for comma-separated files: its separator
# defaults to ',' whereas read_table defaults to tab and needs an explicit delimiter.
pm0 = pd.read_csv('daily_88101_1999.csv')

In [2]:
# DataFrame.info() prints its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
pm0.info()  # column names, non-null counts and dtypes
print(pm0.ndim)  # number of axes (2 for a DataFrame)
print(pm0.shape)  # (number of rows, number of columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 103210 entries, 0 to 103209
Data columns (total 29 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   State Code           103210 non-null  int64  
 1   County Code          103210 non-null  int64  
 2   Site Num             103210 non-null  int64  
 3   Parameter Code       103210 non-null  int64  
 4   POC                  103210 non-null  int64  
 5   Latitude             103210 non-null  float64
 6   Longitude            103210 non-null  float64
 7   Datum                103210 non-null  object 
 8   Parameter Name       103210 non-null  object 
 9   Sample Duration      103210 non-null  object 
 10  Pollutant Standard   103210 non-null  object 
 11  Date Local           103210 non-null  object 
 12  Units of Measure     103210 non-null  object 
 13  Event Type           103210 non-null  object 
 14  Observation Count    103210 non-null  int64  
 15  Observation Perce

In [3]:
# The frame has 103,210 rows and 29 columns, which is too wide to display at once,
# so preview the first five rows of the first ten columns (positions 0-9).
pm0.head(5).iloc[:, :10]


Unnamed: 0,State Code,County Code,Site Num,Parameter Code,POC,Latitude,Longitude,Datum,Parameter Name,Sample Duration
0,1,27,1,88101,1,33.284928,-85.803608,NAD83,PM2.5 - Local Conditions,24 HOUR
1,1,27,1,88101,1,33.284928,-85.803608,NAD83,PM2.5 - Local Conditions,24 HOUR
2,1,27,1,88101,1,33.284928,-85.803608,NAD83,PM2.5 - Local Conditions,24 HOUR
3,1,27,1,88101,1,33.284928,-85.803608,NAD83,PM2.5 - Local Conditions,24 HOUR
4,1,27,1,88101,1,33.284928,-85.803608,NAD83,PM2.5 - Local Conditions,24 HOUR


In [4]:
# Preview the first five rows of column positions 11 through 19 (nine columns).
# NOTE(review): the previous preview stopped at position 9, so position 10
# ("Pollutant Standard" in the info() listing) is not shown by either cell --
# confirm whether skipping it is intentional.
pm0.head(5).iloc[:, 11:20]

Unnamed: 0,Date Local,Units of Measure,Event Type,Observation Count,Observation Percent,Arithmetic Mean,1st Max Value,1st Max Hour,AQI
0,1999-01-12,Micrograms/cubic meter (LC),,1,100.0,8.8,8.8,0,37
1,1999-01-15,Micrograms/cubic meter (LC),,1,100.0,14.9,14.9,0,57
2,1999-01-18,Micrograms/cubic meter (LC),,1,100.0,3.8,3.8,0,16
3,1999-01-21,Micrograms/cubic meter (LC),,1,100.0,9.0,9.0,0,38
4,1999-01-24,Micrograms/cubic meter (LC),,1,100.0,5.4,5.4,0,23


In [5]:
# Preview the first five rows from column position 21 through the last column.
# NOTE(review): the preceding preview ended at position 19, so position 20 is
# not displayed by any cell -- confirm whether that is intentional.
pm0.head(5).iloc[:, 21:]

Unnamed: 0,Method Name,Local Site Name,Address,State Name,County Name,City Name,CBSA Name,Date of Last Change
0,Andersen RAAS2.5-300 PM2.5 SEQ w/WINS - GRAVIM...,ASHLAND,ASHLAND AIRPORT,Alabama,Clay,Ashland,,2014-06-11
1,Andersen RAAS2.5-300 PM2.5 SEQ w/WINS - GRAVIM...,ASHLAND,ASHLAND AIRPORT,Alabama,Clay,Ashland,,2014-06-11
2,Andersen RAAS2.5-300 PM2.5 SEQ w/WINS - GRAVIM...,ASHLAND,ASHLAND AIRPORT,Alabama,Clay,Ashland,,2014-06-11
3,Andersen RAAS2.5-300 PM2.5 SEQ w/WINS - GRAVIM...,ASHLAND,ASHLAND AIRPORT,Alabama,Clay,Ashland,,2014-06-11
4,Andersen RAAS2.5-300 PM2.5 SEQ w/WINS - GRAVIM...,ASHLAND,ASHLAND AIRPORT,Alabama,Clay,Ashland,,2014-06-11


In [6]:
# "Arithmetic Mean" is the variable analysed here: select it as a one-column
# DataFrame and compute its descriptive statistics.
x0 = pm0.loc[:, ["Arithmetic Mean"]].describe()
x0

Unnamed: 0,Arithmetic Mean
count,103210.0
mean,13.776391
std,9.425582
min,0.0
25%,7.2
50%,11.5
75%,17.9
max,157.1


In [None]:
#Compare results of data and summary with Data Analysis Case Study: Changes in Fine Particle Air Pollution in the U.S.
#Dr. Peng used raw text files from the EPA. The 1999 PM2.5 dataset had 117,421 rows. The dataset parameters were also
#slightly different. The parameter used for analysis was Sample Value. 
#"The sample value is the average atmospheric concentration of the parameter in the time window beginning at the sample
#begin time and lasting for the sample duration." 
#Source:https://aqs.epa.gov/aqsweb/documents/about_aqs_data.html#_the_aqs_data_set
#No cleaning of data was described.

#summary(x0) for Dr. Peng's 1999 Sample Value
#Min.     0.00
#1st Qu.  7.20 
#Median  11.50
#Mean    13.74
#3rd Qu. 17.90 
#Max.   157.10
#NA's    13217

#My dataset was a csv file of data summarized on a daily basis. This 1999 PM2.5 dataset had 103,210 rows. 
#There were no NA values. The number of NA values in Dr. Peng's raw text file (13,217) combined with the number of rows
#in the summarized dataset that I used is 116,427.  According to the EPA AQS website: "historical monitoring or 
#calculation methods may be found to be problematic and require that older data be changed." The number of observations
#between these two datasets differed by 994 after adjusting for removed NA rows. It's not unreasonable to assume 
#those 994 observations were also removed from the summarized file.
#No cleaning of data was performed by me.

#The summary (x0) for my 1999 Arithmetic Mean closely matches Dr. Peng's summary of the 1999 Sample Value, with the
#exception that there are no NA values in my dataset.
 


In [7]:
# Read the 2012 daily PM2.5 data.
# read_csv is the pandas reader intended for comma-separated files: its separator
# defaults to ',' whereas read_table defaults to tab and needs an explicit delimiter.
pm1 = pd.read_csv('daily_88101_2012.csv')

In [8]:
# DataFrame.info() prints its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
pm1.info()  # column names, non-null counts and dtypes
print(pm1.ndim)  # number of axes (2 for a DataFrame)
print(pm1.shape)  # (number of rows, number of columns)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 276671 entries, 0 to 276670
Data columns (total 29 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   State Code           276671 non-null  int64  
 1   County Code          276671 non-null  int64  
 2   Site Num             276671 non-null  int64  
 3   Parameter Code       276671 non-null  int64  
 4   POC                  276671 non-null  int64  
 5   Latitude             276671 non-null  float64
 6   Longitude            276671 non-null  float64
 7   Datum                276671 non-null  object 
 8   Parameter Name       276671 non-null  object 
 9   Sample Duration      276671 non-null  object 
 10  Pollutant Standard   191001 non-null  object 
 11  Date Local           276671 non-null  object 
 12  Units of Measure     276671 non-null  object 
 13  Event Type           276671 non-null  object 
 14  Observation Count    276671 non-null  int64  
 15  Observation Perce

In [9]:
# As with the 1999 data, "Arithmetic Mean" is the variable analysed: select it as
# a one-column DataFrame and compute its descriptive statistics.
x1 = pm1.loc[:, ["Arithmetic Mean"]].describe()
x1

Unnamed: 0,Arithmetic Mean
count,276671.0
mean,9.141456
std,6.520117
min,-6.3125
25%,5.1
50%,7.916667
75%,11.7
max,236.254167


In [None]:
#Compare results of data and summary with Data Analysis Case Study: Changes in Fine Particle Air Pollution in the U.S.
#Dr. Peng used raw text files from the EPA. The 2012 PM2.5 dataset had 1,304,290 rows. The parameter used for analysis 
#was Sample Value. Dr. Peng discussed the unusual negative minimum value but made no changes to the dataset. 
#He also commented on the very elevated maximum value. No cleaning of data was described.

#summary(x1) for Dr. Peng's 2012 Sample Value
#Min.   -10.00
#1st Qu.  4.00 
#Median   7.63
#Mean     9.14
#3rd Qu. 12.00
#Max.   909.00
#NA's    73133


#The dataset I used was a csv file of data summarized on a daily basis. The 2012 PM2.5 dataset had 276,671 rows. 
#There were no NA values. The difference in the number of observations between the raw text file and the summarized csv 
#file is 1,027,619. I have no idea why. No cleaning of data was performed by me.  

#The summary (x1) for my 2012 Arithmetic Mean does not align as closely to Dr. Peng's as the 1999 comparison did.
#The means agree to two decimal places (9.14), though. Interestingly, the minimum value in my summary was also negative.
#The maximum value in my summary was 236.25, a more reasonable value than 909. Presumably, the 909 observation row was
#removed from the summarized csv file.
