# Data

## Scrap Data 

Collect weather data for years 2009-2022 for our precise location from World weather Online API in JSON format and save them in a file.

Define Data scrapping parameters.

In [19]:
import os
import json
import requests
from calendar import monthrange

api_key = os.environ.get('WEATHER_API_KEY')
location_coordinates = os.environ.get('WEATHER_APP_LOCATION')
location_name = os.environ.get('WEATHER_APP_LOCATION_NAME')
start_year = 2009
end_year = 2022

Loop through GET API calls to database to get all necessary data.
Save all .json files corresponding to each month in a 'weather_data' directory.

In [24]:
# Create the 'weather_data' directory if it does not already exist
if not os.path.exists('weather_data'):
    os.makedirs('weather_data')

raw_data = 'weather_data'

with open(raw_data, 'w') as combined_data_file:
    # Loop through all years and months
    for year in range(start_year, end_year+1):
        for month in range(1, 13):
            # Get the start and end dates for the month
            _, num_days = monthrange(year, month) 
            start_date = f'{year}-{month:02d}-01'
            end_date = f'{year}-{month:02d}-{num_days:02d}'
            
            # Construct the URL and make the request
            url = f'https://api.worldweatheronline.com/premium/v1/past-weather.ashx?key={api_key}&q={location_coordinates}&format=json&date={start_date}&enddate={end_date}'
            response = requests.get(url)
            data = response.json()
            
            # Get the filename and filepath for this month's data
            filename = f'{location_name}_{year}-{month:02d}.json'
            filepath = os.path.join('weather_data', filename)
            
            # Check if file exists and create it if it does not
            if not os.path.exists(filepath):
                with open(filepath, 'w') as f:
                    json.dump(data, f)
            else:
                print(f'File {filename} already exists.')


IsADirectoryError: [Errno 21] Is a directory: 'weather_data'

## Filter Data

Filter all .json files for the data, time, temperature, humidity, pressure, and weather description keys and create new parsed .json files in a new directory called 'parsed_weather_data'.

In [33]:

# create the directory for the parsed weather data if it doesn't exist
if not os.path.exists('parsed_weather_data'):
    os.makedirs('parsed_weather_data')

# loop through all the JSON files in the weather_data directory
for filename in os.listdir('weather_data'):
    if filename.endswith('.json'):
        # extract the location and month from the filename
        location_name = filename.split('_')[0]
        month = filename.split('_')[1].split('.')[0]

        # create a new dictionary to store the parsed data for this month
        parsed_month_data = {}

        # Open the file and load the JSON data
        with open(f'weather_data/{filename}', 'r') as f:
            file_contents = f.read()
            parsed_data = json.loads(file_contents)

        # loop through the hourly data for all the dates in the JSON file
        for weather_data in parsed_data['data']['weather']:
            # extract the date for this set of hourly data
            date = weather_data['date']

            # create a new list to store the parsed data for this day
            parsed_day_data = []

            # loop through the hourly data for this date
            for hourly_data in weather_data['hourly']:
                # extract the values we're interested in
                tempC = hourly_data['tempC']
                humidity = hourly_data['humidity']
                pressure = hourly_data['pressure']
                weatherCond = hourly_data['weatherDesc'][0]['value']
                time = hourly_data['time']

                # add the values to the parsed_day_data list
                parsed_day_data.append({
                    'time': time,
                    'tempC': tempC,
                    'humidity': humidity,
                    'pressure': pressure,
                    'weatherCond': weatherCond
                })

            # add the parsed day data to the parsed_month_data dictionary
            parsed_month_data[date] = parsed_day_data

        # write the parsed_month_data to a new JSON file
        with open(f'parsed_weather_data/{location_name}_{month}_parsed.json', 'w') as f:
            json.dump(parsed_month_data, f)


Combine all parsed .json files into a single .json file containing all data of interest.

In [34]:
# create the combined data dictionary
combined_data = {}

# loop over each file in the directory
for filename in os.listdir('parsed_weather_data'):
    if filename.endswith('.json'):
        # load the contents of the file into a dictionary
        with open(os.path.join('parsed_weather_data', filename)) as f:
            parsed_data = json.load(f)
        
        # loop over each date in the parsed data dictionary
        for date in parsed_data.keys():
            # if the date doesn't exist in the combined data dictionary, add it with an empty list as its value
            if date not in combined_data:
                combined_data[date] = []
            
            # add the weather data to the combined_data dictionary
            combined_data[date].extend(parsed_data[date])

# save the combined data to a new file
with open('combined_parsed_data.json', 'w') as f:
    json.dump(combined_data, f)


Convert .json data into .csv data.

In [50]:


csv_data_file = 'weather_data.csv'

# read the weather data from the JSON file
with open('combined_parsed_data.json', 'r') as f:
    weather_data = json.load(f)

# create a new .csv filew if it doesn't exist.
if not os.path.isfile(csv_data_file):
    with open(csv_data_file, "x") as f:
        pass

# write the weather data to a CSV file
with open('weather_data.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    
    # write the header row
    writer.writerow(['date', 'time', 'tempC', 'humidity', 'pressure', 'weatherCond'])
    
    # write the data rows
    for date in weather_data:
        for data in weather_data[date]:
            writer.writerow([date, data['time'], data['tempC'], data['humidity'], data['pressure'], data['weatherCond']])
