## Cleaning Data from Quebec Weather Stations

## Connect and set up SQL 

In [3]:
import sqlite3
con = sqlite3.connect("WEATHER.db") #create new database called Weather or connect to it if it already exists

cursor_obj = con.cursor() #create a cursor to fetch results from SQL queries

In [4]:
!pip install "sqlalchemy<2"

#allows use of sql magic command
%load_ext sql
#connects to database
%sql sqlite:///WEATHER.db


The sql extension is already loaded. To reload it, use:
  %reload_ext sql


'Connected: @WEATHER.db'

## Load data using Pandas

In [15]:
import csv, pandas as pd, numpy as np

df1 = pd.read_csv("/Users/maggiesullens/Library/Mobile Documents/com~apple~CloudDocs/Spruce Budworm/Retrieved weather/qc_weather.csv")
df1.rename(columns={"Unnamed: 0": "ID"}, inplace=True)
df1.to_sql("QC_WEATHER", con, if_exists='replace', index=False)


7805150

## Show glipse of data

In [18]:
%%sql 
SELECT *
FROM QC_WEATHER LIMIT 3;


 * sqlite:///WEATHER.db
Done.


ID,prov,station_name,station_id,lat,lon,elev,date,max_temp,mean_temp,min_temp,total_precip
1,QC,DALHOUSIE STATION,5199,45.3,-74.47,70.0,1974-09-01,21.1,13.4,5.6,0.0
2,QC,DALHOUSIE STATION,5199,45.3,-74.47,70.0,1974-09-02,16.1,13.1,10.0,8.6
3,QC,DALHOUSIE STATION,5199,45.3,-74.47,70.0,1974-09-03,15.6,12.5,9.4,3.3


## Make Daily Averages Dataset for Max Temp, Min Temp, Mean Temp, and Total Precip

In [185]:
%%sql
DROP TABLE IF EXISTS QC_DAY_AVERAGE_WEATHER;

CREATE TABLE QC_DAY_AVERAGE_WEATHER AS
SELECT 
  date,
  COUNT(*) AS group_size,
  ROUND(AVG(max_temp), 4) AS avg_max_temp,
  ROUND(AVG(min_temp), 4) AS avg_min_temp,
  ROUND(AVG(mean_temp), 4) AS avg_mean_temp,
  ROUND(AVG(total_precip), 4) AS avg_total_precip
FROM QC_WEATHER
GROUP BY date;
SELECT * FROM QC_DAY_AVERAGE_WEATHER ORDER BY date LIMIT 100;


 * sqlite:///WEATHER.db
Done.
Done.
Done.


date,group_size,avg_max_temp,avg_min_temp,avg_mean_temp,avg_total_precip
1871-07-01,1,28.3,13.3,20.8,0.0
1871-07-02,1,26.1,19.4,22.8,0.0
1871-07-03,1,28.9,11.7,20.3,0.0
1871-07-04,1,26.7,17.2,22.0,0.0
1871-07-05,1,27.8,16.7,22.3,19.6
1871-07-06,1,28.3,16.7,22.5,0.0
1871-07-07,1,23.3,20.6,22.0,19.1
1871-07-08,1,30.0,16.7,23.4,0.0
1871-07-09,1,26.7,21.1,23.9,1.0
1871-07-10,1,24.4,15.0,19.7,0.0


 * sqlite:///WEATHER.db
Done.


date_group,num_rows


## Check to see if there are any duplicate days

In [179]:
%%sql
SELECT date, COUNT(*) AS occurrences
FROM QC_DAY_AVERAGE_WEATHER
GROUP BY date
HAVING COUNT(*) > 1;

 * sqlite:///WEATHER.db
Done.


date,occurrences


## Make Monthly Information

In [232]:
%%sql
DROP TABLE IF EXISTS QC_MONTHLY_WEATHER_VARS;

CREATE TABLE QC_MONTHLY_WEATHER_VARS AS
SELECT 
  strftime('%Y', date) AS year,
  strftime('%m', date) AS month,
  COUNT(*) AS group_size,
  ROUND(AVG(avg_max_temp), 2) AS month_avg_max_temp,
  ROUND(AVG(avg_min_temp), 2) AS month_avg_min_temp,
  ROUND(AVG(avg_mean_temp), 2) AS month_avg_mean_temp,
  MAX(avg_max_temp) AS month_max_temp,
  MIN(avg_min_temp) AS month_min_temp,
  ROUND(AVG(avg_max_temp - avg_min_temp), 2) AS mean_diurnal_range,
  ROUND(AVG(avg_total_precip), 2) AS month_avg_total_precip
FROM QC_DAY_AVERAGE_WEATHER
GROUP BY year, month;
SELECT * FROM QC_MONTHLY_WEATHER_VARS ORDER BY year, month LIMIT 10;

 * sqlite:///WEATHER.db
Done.
Done.
Done.


year,month,group_size,month_avg_max_temp,month_avg_min_temp,month_avg_mean_temp,month_max_temp,month_min_temp,mean_diurnal_range,month_avg_total_precip
1871,7,31,26.19,16.92,21.57,35.0,11.7,9.27,4.07
1871,8,31,26.51,17.02,21.79,32.2,11.7,9.48,3.08
1871,9,30,21.28,10.68,16.0,32.8,3.3,10.6,1.1
1871,10,31,14.08,6.4,10.26,28.3,-1.1,7.68,2.75
1871,11,30,3.52,-3.78,-0.12,11.1,-21.7,7.3,2.48
1871,12,31,-4.42,-11.04,-7.75,7.8,-30.6,6.61,2.43
1872,1,31,-3.11,-11.43,-7.28,5.6,-23.3,8.32,1.59
1872,2,29,-0.37,-12.1,-6.26,7.2,-18.9,11.74,2.87
1872,3,31,-0.47,-11.55,-6.02,11.1,-28.9,11.08,2.48
1872,4,30,13.32,1.11,7.23,26.7,-3.9,12.2,1.2


## Make Yearly variable Dataset

In [237]:
%%sql
DROP TABLE IF EXISTS QC_YEARLY_WEATHER;

CREATE TABLE QC_YEARLY_WEATHER AS
SELECT 
  strftime('%Y', date) AS year,
  COUNT(*) AS group_size,
  ROUND(AVG(avg_max_temp), 2) AS year_avg_max_temp,
  ROUND(AVG(avg_min_temp), 2) AS year_avg_min_temp,
  ROUND(AVG(avg_mean_temp), 2) AS year_avg_mean_temp,
  ROUND(AVG(avg_total_precip), 2) AS year_avg_total_precip
FROM QC_DAY_AVERAGE_WEATHER
GROUP BY year;

ALTER TABLE QC_YEARLY_WEATHER
ADD COLUMN max_temp_of_warmest_month REAL;
ALTER TABLE QC_YEARLY_WEATHER
ADD COLUMN min_temp_of_coldest_month REAL;
ALTER TABLE QC_YEARLY_WEATHER
ADD COLUMN temp_annual_range REAL;
ALTER TABLE QC_YEARLY_WEATHER
ADD COLUMN precip_wettest_month;
ALTER TABLE QC_YEARLY_WEATHER
ADD COLUMN precip_driest_month;
ALTER TABLE QC_YEARLY_WEATHER
ADD COLUMN isothermality;



UPDATE QC_YEARLY_WEATHER
SET 
  max_temp_of_warmest_month = (
    SELECT MAX(m.month_avg_max_temp)
    FROM QC_MONTHLY_WEATHER_VARS AS m
    WHERE m.year = QC_YEARLY_WEATHER.year
  ),
  min_temp_of_coldest_month = (
    SELECT MIN(m.month_avg_min_temp)
    FROM QC_MONTHLY_WEATHER_VARS AS m
    WHERE m.year = QC_YEARLY_WEATHER.year
  ),
  precip_wettest_month = (
    SELECT MAX(m.month_avg_total_precip)
    FROM QC_MONTHLY_WEATHER_VARS AS m
    WHERE m.year = QC_YEARLY_WEATHER.year
  ),
  precip_driest_month = (
    SELECT MIN(m.month_avg_total_precip)
    FROM QC_MONTHLY_WEATHER_VARS AS m
    WHERE m.year = QC_YEARLY_WEATHER.year
  );
UPDATE QC_YEARLY_WEATHER
SET 
  temp_annual_range = ROUND(max_temp_of_warmest_month - min_temp_of_coldest_month,2);

UPDATE QC_YEARLY_WEATHER
SET 
  isothermality = (
    SELECT ROUND((m.mean_diurnal_range/temp_annual_range)*100,2)
    FROM QC_MONTHLY_WEATHER_VARS AS m
    WHERE m.year = QC_YEARLY_WEATHER.year
  );

SELECT * FROM QC_YEARLY_WEATHER ORDER BY year LIMIT 10;

 * sqlite:///WEATHER.db
Done.
Done.
Done.
Done.
Done.
Done.
Done.
Done.
155 rows affected.
155 rows affected.
155 rows affected.
Done.


year,group_size,year_avg_max_temp,year_avg_min_temp,year_avg_mean_temp,year_avg_total_precip,max_temp_of_warmest_month,min_temp_of_coldest_month,temp_annual_range,precip_wettest_month,precip_driest_month,isothermality
1871,184,14.55,6.06,10.32,2.66,26.51,-11.04,37.55,4.07,1.1,24.69
1872,366,11.95,2.44,7.2,2.87,27.45,-14.11,41.56,5.43,1.2,20.02
1873,365,10.84,1.68,6.27,2.87,26.48,-13.45,39.93,5.44,1.16,18.31
1874,365,9.59,0.71,5.16,2.69,25.07,-14.72,39.79,4.43,1.35,16.56
1875,365,7.69,-1.54,3.08,2.74,24.64,-18.59,43.23,5.12,0.93,18.2
1876,366,8.86,-0.25,4.31,2.6,25.52,-15.85,41.37,3.71,1.43,19.39
1877,365,9.75,0.49,5.13,2.43,25.13,-18.2,43.33,3.79,1.18,21.42
1878,365,10.04,1.12,5.59,2.88,25.13,-16.48,41.61,4.05,1.01,22.71
1879,365,8.35,-1.48,3.44,2.75,23.32,-16.93,40.25,3.73,1.45,22.34
1880,366,9.25,-0.96,4.15,2.5,23.92,-14.57,38.49,4.37,1.43,31.15


## Add Degree Days to daily averages table

In [194]:
%%sql
ALTER TABLE QC_DAY_AVERAGE_WEATHER ADD COLUMN degree_days REAL;

 * sqlite:///WEATHER.db
(sqlite3.OperationalError) near "EXISTS": syntax error
[SQL: ALTER TABLE QC_DAY_AVERAGE_WEATHER
DROP COLUMN IF EXISTS degree_days;]
(Background on this error at: https://sqlalche.me/e/14/e3q8)


In [201]:
%%sql

UPDATE QC_DAY_AVERAGE_WEATHER
SET degree_days = ROUND(((avg_max_temp + avg_min_temp)/2)-8, 2);

SELECT * FROM QC_DAY_AVERAGE_WEATHER LIMIT 10

 * sqlite:///WEATHER.db
56166 rows affected.
Done.


date,group_size,avg_max_temp,avg_min_temp,avg_mean_temp,avg_total_precip,degree_days
1871-07-01,1,28.3,13.3,20.8,0.0,12.8
1871-07-02,1,26.1,19.4,22.8,0.0,14.75
1871-07-03,1,28.9,11.7,20.3,0.0,12.3
1871-07-04,1,26.7,17.2,22.0,0.0,13.95
1871-07-05,1,27.8,16.7,22.3,19.6,14.25
1871-07-06,1,28.3,16.7,22.5,0.0,14.5
1871-07-07,1,23.3,20.6,22.0,19.1,13.95
1871-07-08,1,30.0,16.7,23.4,0.0,15.35
1871-07-09,1,26.7,21.1,23.9,1.0,15.9
1871-07-10,1,24.4,15.0,19.7,0.0,11.7


## Save Average Dataset to CSV

In [202]:
query1 = "SELECT * FROM QC_DAY_AVERAGE_WEATHER"
data_frame1 = pd.read_sql(query1, con)

csv_file_name1 = 'QC_daily_average_data.csv'
data_frame1.to_csv(csv_file_name1, index=False)
print(f"Data exported to '{csv_file_name1}' successfully.")

# query2 = "SELECT * FROM QC_YEARLY_DAY_AVERAGES_WEATHER"
# data_frame2 = pd.read_sql(query2, con)

# csv_file_name2 = 'QC_yearly_daily_averages_data.csv'
# data_frame2.to_csv(csv_file_name2, index=False)
# print(f"Data exported to '{csv_file_name2}' successfully.")

Data exported to 'QC_daily_average_data.csv' successfully.
