In [96]:
try:
    from bs4 import BeautifulSoup
    import pandas
    print ("BeautifulSoup and Pandas are already installed and imported")
except:
    import sys
    !conda install --yes --prefix {sys.prefix} bs4
    !conda install --yes --prefix {sys.prefix} pandas
    from bs4 import BeautifulSoup
    import pandas
    print ("BeautifulSoup and Pandas were not found. Installed them and imported")

import requests

BeautifulSoup and Pandas are already installed and imported


In [97]:
# Download the monthly archive report (the month is selected by the `date` query parameter).
# The timeout keeps the cell from hanging forever if the server does not answer.
opened_webpage = requests.get(
    "https://www.estesparkweather.net/archive_reports.php?date=200601",
    timeout=30,
)
# Fail loudly on 4xx/5xx responses so the success message below is only printed
# when a real page was returned, not an error page.
opened_webpage.raise_for_status()
print("Webpage opened successfully. . . ")

# Parse the raw response bytes with Python's built-in HTML parser.
bs = BeautifulSoup(opened_webpage.content, "html.parser")
print("Webpage laoded and parsed successfully . . .")


Webpage opened successfully. . . 
Webpage laoded and parsed successfully . . .


## Extract

Notes: 
- An HTML table is made up of header cells and data cells:
  - `<th>` defines a header cell
  - `<td>` defines a standard data cell
    

In [98]:
# Collect every <table> element of the parsed page ...
table = bs.find_all("table")

# ... and keep only the text content of each one, in page order.
raw_data = [table_element.text for table_element in table]

print(raw_data)

['\n\nJan 1 Average and Extremes\n\n\nAverage temperature36.2   °F (2.4 °C)\n\n\nAverage humidity44   %\n\n\nAverage dewpoint15.9   °F (-8.9 °C)\n\n\nAverage barometer29.491 in.\n\n\nAverage windspeed13.2   mph\n\n\nAverage gustspeed18.1   mph\n\n\nAverage direction250 ° (WSW)\n\n\nRainfall for month0.000  in. ( 0 mm)\n\n\nRainfall for year0.000  in. ( 0 mm)\n\n\nMaximum rain per minute0.000  in. ( 0 mm) on day 01 at time 00:06\n\n\nMaximum temperature41.5  °F (5.3 °C) on day 01 at time 00:09\n\n\nMinimum temperature29.7  °F (-1.3 °C) on day 01 at time 19:15\n\n\nMinimum pressure29.2  in. on day 01 at time 06:29\n\n\nMaximum pressure29.8  in. on day 01 at time 23:16\n\n\nMaximum windspeed34.0  mph from 254on day 01 at time 23:32\n\n\nMaximum gust speed45.0  mph from 254 on day 01 at time 23:32\n\n\nMaximum humidity62 % on day 01 at time 05:47\n\n\nMinimum humidity31 % on day 01 at time 21:46\n\n\nMaximum heat index41.5 °F (5.3 °C)on day 00 at time 00:09\n\n', '\n\nJan 2 Average and Ext

## Transform

Notes: 
- CSV file format

In [99]:
# Labels that prefix (or, for the date line, follow) each value in a table's text.
column_names = [
    "Average and Extremes",
    "Average temperature",
    "Average humidity",
    "Average dewpoint",
    "Average barometer",
    "Average windspeed",
    "Average gustspeed",
    "Average direction",
    "Rainfall for month",
    "Rainfall for year",
    "Maximum rain per minute",
    "Maximum temperature",
    "Minimum temperature",
    "Maximum humidity",
    "Minimum humidity",
    "Maximum pressure",
    "Minimum pressure",
    "Maximum windspeed",
    "Maximum gust speed",
    "Maximum heat index",
]

final_data = []

for table_text in raw_data:
    # One dictionary per table: label -> remaining text of the line it appears in.
    row = {}
    for entry in table_text.split("\n"):
        # Only the first label (in column_names order) found in the line is used.
        matched_name = next(
            (name for name in column_names if name in entry),
            None,
        )
        if matched_name is not None:
            row[matched_name] = entry.replace(matched_name, "")
    final_data.append(row)

print(final_data[:5])


[{'Average and Extremes': 'Jan 1 ', 'Average temperature': '36.2   °F (2.4 °C)', 'Average humidity': '44   %', 'Average dewpoint': '15.9   °F (-8.9 °C)', 'Average barometer': '29.491 in.', 'Average windspeed': '13.2   mph', 'Average gustspeed': '18.1   mph', 'Average direction': '250 ° (WSW)', 'Rainfall for month': '0.000  in. ( 0 mm)', 'Rainfall for year': '0.000  in. ( 0 mm)', 'Maximum rain per minute': '0.000  in. ( 0 mm) on day 01 at time 00:06', 'Maximum temperature': '41.5  °F (5.3 °C) on day 01 at time 00:09', 'Minimum temperature': '29.7  °F (-1.3 °C) on day 01 at time 19:15', 'Minimum pressure': '29.2  in. on day 01 at time 06:29', 'Maximum pressure': '29.8  in. on day 01 at time 23:16', 'Maximum windspeed': '34.0  mph from 254on day 01 at time 23:32', 'Maximum gust speed': '45.0  mph from 254 on day 01 at time 23:32', 'Maximum humidity': '62 % on day 01 at time 05:47', 'Minimum humidity': '31 % on day 01 at time 21:46', 'Maximum heat index': '41.5 °F (5.3 °C)on day 00 at time

#### Convert to Data Frame
- A DataFrame is a data structure that organizes data into a 2D table with rows and columns
- It is a commonly used data structure because it is flexible and intuitive

- Use **Pandas** to convert data into dataframe

In [100]:
# Turn the list of per-table dictionaries into a table: one row per dictionary,
# one column per key.
final_data = pandas.DataFrame(data=final_data)

# Preview the first rows of the dataframe

final_data.head()

Unnamed: 0,Average and Extremes,Average temperature,Average humidity,Average dewpoint,Average barometer,Average windspeed,Average gustspeed,Average direction,Rainfall for month,Rainfall for year,Maximum rain per minute,Maximum temperature,Minimum temperature,Minimum pressure,Maximum pressure,Maximum windspeed,Maximum gust speed,Maximum humidity,Minimum humidity,Maximum heat index
0,Jan 1,36.2 °F (2.4 °C),44 %,15.9 °F (-8.9 °C),29.491 in.,13.2 mph,18.1 mph,250 ° (WSW),0.000 in. ( 0 mm),0.000 in. ( 0 mm),0.000 in. ( 0 mm) on day 01 at time 00:06,41.5 °F (5.3 °C) on day 01 at time 00:09,29.7 °F (-1.3 °C) on day 01 at time 19:15,29.2 in. on day 01 at time 06:29,29.8 in. on day 01 at time 23:16,34.0 mph from 254on day 01 at time 23:32,45.0 mph from 254 on day 01 at time 23:32,62 % on day 01 at time 05:47,31 % on day 01 at time 21:46,41.5 °F (5.3 °C)on day 00 at time 00:09
1,Jan 2,40.0 °F (4.4 °C),35 %,14.1 °F (-10.0 °C),29.800 in.,16.0 mph,21.5 mph,251 ° (WSW),0.000 in. ( 0 mm),0.000 in. ( 0 mm),0.000 in. ( 0 mm) on day 02 at time 00:06,47.0 °F (8.3 °C) on day 02 at time 14:29,31.3 °F (-0.4 °C) on day 01 at time 00:09,29.7 in. on day 02 at time 13:01,29.8 in. on day 02 at time 18:31,43.0 mph from 256on day 02 at time 11:48,56.0 mph from 256 on day 02 at time 11:48,51 % on day 02 at time 00:06,22 % on day 02 at time 06:51,47.0 °F (8.3 °C)on day 02 at time 14:29
2,Jan 4,33.5 °F (0.8 °C),48 %,15.3 °F (-9.3 °C),30.039 in.,12.7 mph,17.3 mph,252 ° (WSW),0.070 in. ( 2 mm),0.070 in. ( 2 mm),0.010 in. ( 0 mm) on day 03 at time 18:07,43.3 °F (6.3 °C) on day 03 at time 03:22,23.8 °F (-4.6 °C) on day 04 at time 23:57,29.6 in. on day 03 at time 04:16,30.5 in. on day 04 at time 23:16,39.0 mph from 259on day 03 at time 17:58,62.0 mph from 254 on day 03 at time 17:56,89 % on day 03 at time 08:17,33 % on day 03 at time 03:12,43.3 °F (6.3 °C)on day 03 at time 03:22
3,Jan 5,34.4 °F (1.3 °C),31 %,6.2 °F (-14.3 °C),30.520 in.,15.5 mph,19.3 mph,228 ° ( SW),0.070 in. ( 2 mm),0.070 in. ( 2 mm),0.000 in. ( 0 mm) on day 05 at time 00:06,43.1 °F (6.2 °C) on day 05 at time 13:29,23.2 °F (-4.9 °C) on day 04 at time 00:48,30.3 in. on day 05 at time 23:46,30.6 in. on day 05 at time 09:01,30.0 mph from 215on day 05 at time 02:29,37.0 mph from 231 on day 05 at time 02:24,53 % on day 04 at time 00:21,22 % on day 05 at time 11:12,43.1 °F (6.2 °C)on day 05 at time 13:29
4,Jan 6,47.0 °F (8.3 °C),24 %,11.8 °F (-11.2 °C),30.102 in.,17.3 mph,24.6 mph,254 ° (WSW),0.070 in. ( 2 mm),0.070 in. ( 2 mm),0.000 in. ( 0 mm) on day 06 at time 00:06,54.5 °F (12.5 °C) on day 06 at time 13:24,37.9 °F (3.3 °C) on day 06 at time 01:39,29.8 in. on day 06 at time 23:46,30.3 in. on day 05 at time 00:16,31.0 mph from 266on day 06 at time 10:00,43.0 mph from 247 on day 06 at time 18:15,37 % on day 06 at time 01:40,15 % on day 06 at time 00:03,54.5 °F (12.5 °C)on day 06 at time 13:24


After the transformation, the data can either be cleaned further or stored.

Basic Cleaning: 
- remove duplicate rows, if any
- replace NULL data entries with default value, or remove row

#### Duplication Checks and Cleaning

In [101]:
# Count rows that are exact copies of an earlier row.
duplicates_before = final_data.duplicated().sum()
print(f"Number of duplicaes before : {duplicates_before}")

# Keep only the first occurrence of each row
final_data = final_data.drop_duplicates()

# Re-count to confirm that the cleaning worked.
number_of_duplicates = final_data.duplicated().sum()
print (f" Number of duplicates after removing : {number_of_duplicates}")

Number of duplicaes before : 7
 Number of duplicates after removing : 0


#### Inspecting Data and Checking Noisy Entries

In [102]:
# Print the column list with non-null counts and dtypes to spot missing entries.
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25 entries, 0 to 24
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Average and Extremes     24 non-null     object
 1   Average temperature      24 non-null     object
 2   Average humidity         24 non-null     object
 3   Average dewpoint         24 non-null     object
 4   Average barometer        24 non-null     object
 5   Average windspeed        24 non-null     object
 6   Average gustspeed        24 non-null     object
 7   Average direction        24 non-null     object
 8   Rainfall for month       24 non-null     object
 9   Rainfall for year        24 non-null     object
 10  Maximum rain per minute  24 non-null     object
 11  Maximum temperature      24 non-null     object
 12  Minimum temperature      24 non-null     object
 13  Minimum pressure         24 non-null     object
 14  Maximum pressure         24 non-null     obj

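The dataframe has 25 rows in total, but each column shown has only 24 non-null values, so there are rows with missing entries.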

Delete row with NULL entries

In [103]:
# Remove every row that has at least one missing value, then re-check the summary.
final_data = final_data.dropna(axis=0, how="any")
final_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24 entries, 0 to 23
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Average and Extremes     24 non-null     object
 1   Average temperature      24 non-null     object
 2   Average humidity         24 non-null     object
 3   Average dewpoint         24 non-null     object
 4   Average barometer        24 non-null     object
 5   Average windspeed        24 non-null     object
 6   Average gustspeed        24 non-null     object
 7   Average direction        24 non-null     object
 8   Rainfall for month       24 non-null     object
 9   Rainfall for year        24 non-null     object
 10  Maximum rain per minute  24 non-null     object
 11  Maximum temperature      24 non-null     object
 12  Minimum temperature      24 non-null     object
 13  Minimum pressure         24 non-null     object
 14  Maximum pressure         24 non-null     obj

**[Additional Transformations]** 
Remove the unit strings ("°F", "in.", "mph") and the "%" symbol, keep only the leading number of each value, and convert it to float.

In [104]:
def clean_string_and_convert(s):
    """Return the leading numeric reading of a scraped weather value as a float.

    The unit markers "%", "°F", "in." and "mph" are removed and the first
    whitespace-separated token that remains is converted, so
    "36.2   °F (2.4 °C)" becomes 36.2.

    Parameters
    ----------
    s : str or int or float
        Raw cell text. A value that is already an int or float is returned as
        a float, so applying the function to an already-converted column
        (e.g. when the cell is re-run) does not fail.

    Returns
    -------
    float
        The first number found in the text.

    Raises
    ------
    ValueError
        If nothing is left after removing the units, or if the first
        remaining token is not a valid number.
    """
    # Already numeric: nothing to clean.
    if isinstance(s, (int, float)):
        return float(s)

    # Replace units by a space (not by nothing) so a unit glued to the number,
    # as in "44%", still leaves the number as its own token.
    for unit in ("%", "°F", "in.", "mph"):
        s = s.replace(unit, " ")

    # split() without arguments ignores leading whitespace and repeated
    # separators, which split(" ")[0] would turn into an empty first token.
    tokens = s.split()
    if not tokens:
        raise ValueError("no numeric value found in the input string")

    return float(tokens[0])

# Columns that get converted from text to a plain float.
numeric_columns = [
    "Average temperature",
    "Average humidity",
    "Average dewpoint",
    "Average barometer",
    "Average windspeed",
    "Average gustspeed",
]

# Apply the same cleaning function to each listed column, one at a time.
for column in numeric_columns:
    final_data[column] = final_data[column].apply(clean_string_and_convert)

final_data.head()


Unnamed: 0,Average and Extremes,Average temperature,Average humidity,Average dewpoint,Average barometer,Average windspeed,Average gustspeed,Average direction,Rainfall for month,Rainfall for year,Maximum rain per minute,Maximum temperature,Minimum temperature,Minimum pressure,Maximum pressure,Maximum windspeed,Maximum gust speed,Maximum humidity,Minimum humidity,Maximum heat index
0,Jan 1,36.2,44.0,15.9,29.491,13.2,18.1,250 ° (WSW),0.000 in. ( 0 mm),0.000 in. ( 0 mm),0.000 in. ( 0 mm) on day 01 at time 00:06,41.5 °F (5.3 °C) on day 01 at time 00:09,29.7 °F (-1.3 °C) on day 01 at time 19:15,29.2 in. on day 01 at time 06:29,29.8 in. on day 01 at time 23:16,34.0 mph from 254on day 01 at time 23:32,45.0 mph from 254 on day 01 at time 23:32,62 % on day 01 at time 05:47,31 % on day 01 at time 21:46,41.5 °F (5.3 °C)on day 00 at time 00:09
1,Jan 2,40.0,35.0,14.1,29.8,16.0,21.5,251 ° (WSW),0.000 in. ( 0 mm),0.000 in. ( 0 mm),0.000 in. ( 0 mm) on day 02 at time 00:06,47.0 °F (8.3 °C) on day 02 at time 14:29,31.3 °F (-0.4 °C) on day 01 at time 00:09,29.7 in. on day 02 at time 13:01,29.8 in. on day 02 at time 18:31,43.0 mph from 256on day 02 at time 11:48,56.0 mph from 256 on day 02 at time 11:48,51 % on day 02 at time 00:06,22 % on day 02 at time 06:51,47.0 °F (8.3 °C)on day 02 at time 14:29
2,Jan 4,33.5,48.0,15.3,30.039,12.7,17.3,252 ° (WSW),0.070 in. ( 2 mm),0.070 in. ( 2 mm),0.010 in. ( 0 mm) on day 03 at time 18:07,43.3 °F (6.3 °C) on day 03 at time 03:22,23.8 °F (-4.6 °C) on day 04 at time 23:57,29.6 in. on day 03 at time 04:16,30.5 in. on day 04 at time 23:16,39.0 mph from 259on day 03 at time 17:58,62.0 mph from 254 on day 03 at time 17:56,89 % on day 03 at time 08:17,33 % on day 03 at time 03:12,43.3 °F (6.3 °C)on day 03 at time 03:22
3,Jan 5,34.4,31.0,6.2,30.52,15.5,19.3,228 ° ( SW),0.070 in. ( 2 mm),0.070 in. ( 2 mm),0.000 in. ( 0 mm) on day 05 at time 00:06,43.1 °F (6.2 °C) on day 05 at time 13:29,23.2 °F (-4.9 °C) on day 04 at time 00:48,30.3 in. on day 05 at time 23:46,30.6 in. on day 05 at time 09:01,30.0 mph from 215on day 05 at time 02:29,37.0 mph from 231 on day 05 at time 02:24,53 % on day 04 at time 00:21,22 % on day 05 at time 11:12,43.1 °F (6.2 °C)on day 05 at time 13:29
4,Jan 6,47.0,24.0,11.8,30.102,17.3,24.6,254 ° (WSW),0.070 in. ( 2 mm),0.070 in. ( 2 mm),0.000 in. ( 0 mm) on day 06 at time 00:06,54.5 °F (12.5 °C) on day 06 at time 13:24,37.9 °F (3.3 °C) on day 06 at time 01:39,29.8 in. on day 06 at time 23:46,30.3 in. on day 05 at time 00:16,31.0 mph from 266on day 06 at time 10:00,43.0 mph from 247 on day 06 at time 18:15,37 % on day 06 at time 01:40,15 % on day 06 at time 00:03,54.5 °F (12.5 °C)on day 06 at time 13:24


**[Describe Columns]**

In [105]:
# Summary statistics (count, mean, std, min, quartiles, max) of the converted column.
final_data["Average temperature"].describe()

count    24.000000
mean     30.712500
std       7.885833
min      16.700000
25%      25.550000
50%      30.750000
75%      35.000000
max      47.000000
Name: Average temperature, dtype: float64

**[Plot Histogram of Column]**

- store the dataframe table to local disk 
- convert to csv using **pandas**

In [1]:
# Write the cleaned dataframe to a CSV file in the current working directory.
# NOTE(review): the request cell in this notebook fetches date=200601 (January 2006),
# while this file name says January 2023 — confirm the name matches the month scraped.
# NOTE(review): the recorded run of this cell raised NameError, so it was executed
# before `final_data` existed — it needs the cells that build `final_data` to run first.
final_data.to_csv("EstesPark_Weather_January_2023.csv")

NameError: name 'final_data' is not defined

E1: 
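- Jan 2023 (24.9°F) had a lower average temperature than Jan 2006 (30.7°F). The maximum of the daily average temperatures was also lower in Jan 2023 (42°F) than in Jan 2006 (47°F). In conclusion, this comparison does not show global warming. Note, however, that two single months from one weather station are too little data to confirm or rule out a long-term trend.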

github: https://github.com/mk43275/minji_I310D
