## Cleaning Data from Quebec Weather Stations

## Connect and set up SQL 

In [3]:
import sqlite3

# Open WEATHER.db in the working directory; the file is created on the first
# run and simply reopened on later runs.
con = sqlite3.connect("WEATHER.db")

# Cursor used to fetch results from SQL queries issued through this connection.
cursor_obj = con.cursor()

In [4]:
!pip install "sqlalchemy<2"

#allows use of sql magic command
%load_ext sql
#connects to database
%sql sqlite:///WEATHER.db


The sql extension is already loaded. To reload it, use:
  %reload_ext sql


'Connected: @WEATHER.db'

## Load data using Pandas

In [15]:
import csv
import os

import numpy as np
import pandas as pd

# Location of the raw station export. The default is the path the notebook was
# originally run with; set the QC_WEATHER_CSV environment variable to point at
# the file on any other machine instead of editing this cell.
QC_WEATHER_CSV = os.environ.get(
    "QC_WEATHER_CSV",
    "/Users/maggiesullens/Library/Mobile Documents/com~apple~CloudDocs/Spruce Budworm/Retrieved weather/qc_weather.csv",
)

# "Unnamed: 0" is the label pandas gives a column whose CSV header is blank
# (presumably a saved row index - confirm against the file); expose it as ID.
# rename() returns a new frame, so re-running this cell is idempotent.
df1 = pd.read_csv(QC_WEATHER_CSV).rename(columns={"Unnamed: 0": "ID"})

# Rebuild the QC_WEATHER table from scratch on every run (if_exists='replace')
# and do not write the pandas index as an extra column. As the last expression
# of the cell, the value returned by to_sql is displayed as the cell output.
df1.to_sql("QC_WEATHER", con, if_exists='replace', index=False)


7805150

## Show a glimpse of the data

In [18]:
%%sql
SELECT *
FROM QC_WEATHER
-- show just the first three rows as a quick look at the columns and values
LIMIT 3;


 * sqlite:///WEATHER.db
Done.


ID,prov,station_name,station_id,lat,lon,elev,date,max_temp,mean_temp,min_temp,total_precip
1,QC,DALHOUSIE STATION,5199,45.3,-74.47,70.0,1974-09-01,21.1,13.4,5.6,0.0
2,QC,DALHOUSIE STATION,5199,45.3,-74.47,70.0,1974-09-02,16.1,13.1,10.0,8.6
3,QC,DALHOUSIE STATION,5199,45.3,-74.47,70.0,1974-09-03,15.6,12.5,9.4,3.3


## Make Daily Averages Dataset for Max Temp, Min Temp, Mean Temp, and Total Precip

In [185]:
%%sql
DROP TABLE IF EXISTS QC_DAY_AVERAGE_WEATHER;

-- One row per date, averaging every QC_WEATHER record that shares that date.
-- group_size is the number of records behind each average, values rounded to 4 decimals.
CREATE TABLE QC_DAY_AVERAGE_WEATHER AS
SELECT
    date,
    COUNT(*)                    AS group_size,
    ROUND(AVG(max_temp), 4)     AS avg_max_temp,
    ROUND(AVG(min_temp), 4)     AS avg_min_temp,
    ROUND(AVG(mean_temp), 4)    AS avg_mean_temp,
    ROUND(AVG(total_precip), 4) AS avg_total_precip
FROM QC_WEATHER
GROUP BY date;

-- Preview the earliest dates of the freshly built table.
SELECT *
FROM QC_DAY_AVERAGE_WEATHER
ORDER BY date
LIMIT 100;


 * sqlite:///WEATHER.db
Done.
Done.
Done.


date,group_size,avg_max_temp,avg_min_temp,avg_mean_temp,avg_total_precip
1871-07-01,1,28.3,13.3,20.8,0.0
1871-07-02,1,26.1,19.4,22.8,0.0
1871-07-03,1,28.9,11.7,20.3,0.0
1871-07-04,1,26.7,17.2,22.0,0.0
1871-07-05,1,27.8,16.7,22.3,19.6
1871-07-06,1,28.3,16.7,22.5,0.0
1871-07-07,1,23.3,20.6,22.0,19.1
1871-07-08,1,30.0,16.7,23.4,0.0
1871-07-09,1,26.7,21.1,23.9,1.0
1871-07-10,1,24.4,15.0,19.7,0.0


 * sqlite:///WEATHER.db
Done.


date_group,num_rows


## Check to see if there are any duplicate days

In [179]:
%%sql
SELECT
    date,
    COUNT(*) AS occurrences
FROM QC_DAY_AVERAGE_WEATHER
GROUP BY date
-- keep only dates that occur more than once. The table is built with GROUP BY date, so an empty result is expected
HAVING COUNT(*) > 1;

 * sqlite:///WEATHER.db
Done.


date,occurrences


## Save Average Dataset to CSV

In [180]:
# Export the full daily-average table to a CSV file. The file name is relative,
# so the file is written to the current working directory.
csv_file_name = 'QC_daily_average_data.csv'
query = "SELECT * FROM QC_DAY_AVERAGE_WEATHER"

# Read the table through the existing sqlite3 connection into a DataFrame.
data_frame = pd.read_sql(sql=query, con=con)

# index=False keeps the pandas row index out of the exported file.
data_frame.to_csv(csv_file_name, index=False)

print(f"Data exported to '{csv_file_name}' successfully.")

Data exported to 'QC_daily_average_data.csv' successfully.


## Make Yearly Averages Dataset of the Daily Averages for Max Temp, Min Temp, Mean Temp, and Total Precip

In [186]:
%%sql
DROP TABLE IF EXISTS QC_YEARLY_DAY_AVERAGES_WEATHER;

-- One row per calendar year, averaging the daily averages in QC_DAY_AVERAGE_WEATHER.
-- year is the four-digit year extracted from date as text, and group_size is the
-- number of daily rows in that year, values rounded to 2 decimals.
CREATE TABLE QC_YEARLY_DAY_AVERAGES_WEATHER AS
SELECT
    strftime('%Y', date)            AS year,
    COUNT(*)                        AS group_size,
    ROUND(AVG(avg_max_temp), 2)     AS year_avg_max_temp,
    ROUND(AVG(avg_min_temp), 2)     AS year_avg_min_temp,
    ROUND(AVG(avg_mean_temp), 2)    AS year_avg_mean_temp,
    ROUND(AVG(avg_total_precip), 2) AS year_avg_total_precip
FROM QC_DAY_AVERAGE_WEATHER
GROUP BY year;

-- Preview the earliest years of the freshly built table.
SELECT *
FROM QC_YEARLY_DAY_AVERAGES_WEATHER
ORDER BY year
LIMIT 10;

 * sqlite:///WEATHER.db
Done.
Done.
Done.


year,group_size,year_avg_max_temp,year_avg_min_temp,year_avg_mean_temp,year_avg_total_precip
1871,184,14.55,6.06,10.32,2.66
1872,366,11.95,2.44,7.2,2.87
1873,365,10.84,1.68,6.27,2.87
1874,365,9.59,0.71,5.16,2.69
1875,365,7.69,-1.54,3.08,2.74
1876,366,8.86,-0.25,4.31,2.6
1877,365,9.75,0.49,5.13,2.43
1878,365,10.04,1.12,5.59,2.88
1879,365,8.35,-1.48,3.44,2.75
1880,366,9.25,-0.96,4.15,2.5


In [187]:
# Export the full yearly-average table to a CSV file. The file name is relative,
# so the file is written to the current working directory.
csv_file_name = 'QC_yearly_daily_averages_data.csv'
query = "SELECT * FROM QC_YEARLY_DAY_AVERAGES_WEATHER"

# Read the table through the existing sqlite3 connection into a DataFrame.
data_frame = pd.read_sql(sql=query, con=con)

# index=False keeps the pandas row index out of the exported file.
data_frame.to_csv(csv_file_name, index=False)

print(f"Data exported to '{csv_file_name}' successfully.")

Data exported to 'QC_yearly_daily_averages_data.csv' successfully.
