In [1]:
!pip install duckdb -q

[0m

In [2]:
import pandas as pd
import duckdb

In [3]:
weather = pd.read_csv('/kaggle/input/did-it-rain-in-seattle-19482017/seattleWeather_1948-2017.csv')

In [4]:
weather['DATE'] = pd.to_datetime(weather['DATE'])

## 1 - Select all rows from December 1st, 2000 to December 15th, 2000 (inclusive)

In [5]:
duckdb.query("""
            SELECT * FROM weather
            WHERE DATE BETWEEN '2000-12-01' AND '2000-12-15'
""").df()

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN
0,2000-12-01,0.04,55,39,True
1,2000-12-02,0.18,51,37,True
2,2000-12-03,0.0,44,34,False
3,2000-12-04,0.0,51,37,False
4,2000-12-05,0.0,50,36,False
5,2000-12-06,0.0,50,35,False
6,2000-12-07,0.0,40,34,False
7,2000-12-08,0.02,45,30,True
8,2000-12-09,0.06,43,36,True
9,2000-12-10,0.0,40,30,False


## 2 - Get the average maximum temperature for every year from the year 2000 onward. Order the results by year (ascending)

In [6]:
duckdb.query("""
           SELECT EXTRACT (YEAR FROM DATE) AS YEAR, AVG(TMAX) AS AVG_MAX_TEMPERATURE  
           FROM weather
           GROUP BY EXTRACT (YEAR FROM DATE)
           HAVING YEAR >= 2000
           ORDER BY YEAR
""").df()

Unnamed: 0,YEAR,AVG_MAX_TEMPERATURE
0,2000,58.674863
1,2001,58.473973
2,2002,58.893151
3,2003,60.441096
4,2004,60.622951
5,2005,60.175342
6,2006,61.038356
7,2007,59.20274
8,2008,58.494536
9,2009,59.912329


## 3 - Get the standard deviation of the maximum temperature per year, from 2000 onward. Order by year (ascending)

In [7]:
duckdb.query("""
           SELECT EXTRACT (YEAR FROM DATE) AS YEAR, STDDEV(TMAX) AS AVG_MAX_TEMPERATURE  
           FROM weather
           GROUP BY EXTRACT (YEAR FROM DATE)
           HAVING YEAR >= 2000
           ORDER BY YEAR
""").df()

Unnamed: 0,YEAR,AVG_MAX_TEMPERATURE
0,2000,11.4863
1,2001,11.175302
2,2002,12.306171
3,2003,12.872151
4,2004,12.611146
5,2005,11.887843
6,2006,13.048248
7,2007,12.917
8,2008,12.997101
9,2009,14.232874


## 4 - What are the 10 hottest days on record? Take hottest to mean 'highest maximum temperature'.

In [8]:
duckdb.query("""
           WITH t1 AS (SELECT DATE, PRCP, TMAX, TMIN, RAIN, DENSE_RANK() OVER (ORDER BY TMAX DESC) AS RNK FROM weather)
           SELECT DATE, PRCP, TMAX, TMIN, RAIN FROM t1
           WHERE RNK <=10
 """).df()

Unnamed: 0,DATE,PRCP,TMAX,TMIN,RAIN
0,2009-07-29,0.0,103,71,False
1,1994-07-20,0.0,100,65,False
2,1960-08-09,0.0,99,59,False
3,1981-08-09,0.0,99,68,False
4,1991-07-23,0.0,99,65,False
...,...,...,...,...,...
121,2015-07-03,0.0,92,64,False
122,2015-07-04,0.0,92,59,False
123,2015-07-18,0.0,92,64,False
124,2015-08-01,0.0,92,60,False


## 5 - In 2016, what fraction of days did it rain?

In [9]:
duckdb.query("""
            WITH t1 AS (SELECT DATE, EXTRACT (YEAR FROM DATE) AS YEAR, PRCP, TMAX, TMIN, RAIN, 
                        COUNT(CASE WHEN RAIN = 'True' THEN 1 END) AS RAIN_DAYS,
                        COUNT(CASE WHEN RAIN = 'False' THEN 1 END) AS NO_RAIN_DAYS,
                        FROM weather
                        GROUP BY DATE,PRCP, TMAX, TMIN, RAIN
                        HAVING YEAR = '2016')
                        
            SELECT CONCAT(ROUND(SUM(RAIN_DAYS) / (SUM(RAIN_DAYS) + SUM(NO_RAIN_DAYS)) * 100, 2), ' %') AS RAINY_DAYS_PCT FROM t1   
 """).df()

Unnamed: 0,RAINY_DAYS_PCT
0,46.99 %


## 6 - What is the 75th percentile for the amount of rain that fell on a day where there was some rain in 2016?

In [10]:
duckdb.query("""
            WITH t1 AS (SELECT DATE, EXTRACT(YEAR FROM DATE) AS YEAR, PRCP, TMAX, TMIN, RAIN, PERCENT_RANK() OVER(PARTITION BY YEAR ORDER BY PRCP DESC) AS PCT_RNK FROM weather
                        WHERE RAIN = 'True' and YEAR = '2016'
                        ORDER BY PCT_RNK)
            SELECT AVG(PRCP) AS RAIN_INCHES FROM t1
            WHERE PCT_RNK <= 0.75                
""").df()

Unnamed: 0,RAIN_INCHES
0,0.333609


## 7 - Get the 10 years with the hottest average maximum temperature in July. Order from hottest to coolest

In [72]:
duckdb.query("""
            SELECT EXTRACT (YEAR FROM DATE) AS YEAR, EXTRACT (MONTH FROM DATE) AS MONTH, AVG(TMAX) AS AVG_JULY_MAX_TEMP,
            FROM weather
            GROUP BY YEAR, MONTH
            HAVING MONTH = 7
            ORDER BY AVG_JULY_MAX_TEMP DESC                       
""").df()

Unnamed: 0,YEAR,MONTH,AVG_JULY_MAX_TEMP
0,2015,7,82.580645
1,1958,7,81.419355
2,2009,7,80.967742
3,1985,7,80.935484
4,2014,7,80.419355
...,...,...,...
65,1966,7,70.741935
66,1986,7,70.322581
67,1954,7,69.483871
68,1993,7,68.677419


## 8 - Get the 10 years with the coldest average minimum temperature in December. Order from coolest to hottest

In [77]:
duckdb.query("""
            SELECT EXTRACT (YEAR FROM DATE) AS YEAR, EXTRACT (MONTH FROM DATE) AS MONTH, AVG(TMIN) AS AVG_DEC_MIN_TEMP,
            FROM weather
            GROUP BY YEAR, MONTH
            HAVING MONTH = 12
            ORDER BY AVG_DEC_MIN_TEMP                        
""").df()

Unnamed: 0,YEAR,MONTH,AVG_DEC_MIN_TEMP
0,1990,12,30.387097
1,1948,12,30.806452
2,1985,12,30.935484
3,1951,12,31.225806
4,1964,12,31.483871
...,...,...,...
65,1958,12,40.064516
66,1973,12,40.225806
67,2014,12,40.258065
68,1950,12,40.419355


## 9 - Repeat the last question, but round the temperatures to 3 decimal places

In [79]:
duckdb.query("""
            SELECT EXTRACT (YEAR FROM DATE) AS YEAR, EXTRACT (MONTH FROM DATE) AS MONTH, ROUND(AVG(TMIN), 3) AS AVG_DEC_MIN_TEMP,
            FROM weather
            GROUP BY YEAR, MONTH
            HAVING MONTH = 12
            ORDER BY AVG_DEC_MIN_TEMP                        
""").df()

Unnamed: 0,YEAR,MONTH,AVG_DEC_MIN_TEMP
0,1990,12,30.387
1,1948,12,30.806
2,1985,12,30.935
3,1951,12,31.226
4,1964,12,31.484
...,...,...,...
65,1958,12,40.065
66,1973,12,40.226
67,2014,12,40.258
68,1950,12,40.419


## 10 - Given the results of the previous queries, would it be fair to use this data to claim that 2015 had the "hottest July on record"? Why or why not?

### I could say yes, from 1948 to 2017, the hottest year was 2015

In [102]:
duckdb.query("""
            SELECT EXTRACT (YEAR FROM DATE) AS YEAR, EXTRACT (MONTH FROM DATE) AS MONTH, AVG(TMAX) AS AVG_JULY_MAX_TEMP,
            FROM weather
            GROUP BY YEAR, MONTH
            HAVING MONTH = 7
            ORDER BY AVG_JULY_MAX_TEMP DESC
            LIMIT 1
""").df()

Unnamed: 0,YEAR,MONTH,AVG_JULY_MAX_TEMP
0,2015,7,82.580645


## 11 - Give the average inches of rain that fell per day for each month, where the average is taken over 2000 - 2010 (inclusive).

In [101]:
duckdb.query("""
            WITH t1 AS (SELECT EXTRACT (YEAR FROM DATE) AS YEAR, EXTRACT (MONTH FROM DATE) AS MONTH, PRCP
                        FROM weather
                        WHERE YEAR BETWEEN '2000' AND '2010')
            SELECT MONTH, AVG(PRCP) AS AVG_MONTHLY_INCHES_RAIN
            FROM t1
            GROUP BY MONTH
""").df()

Unnamed: 0,MONTH,AVG_MONTHLY_INCHES_RAIN
0,1,0.191613
1,2,0.094277
2,3,0.113578
3,4,0.085364
4,5,0.068035
5,6,0.050182
6,7,0.016129
7,8,0.03437
8,9,0.05693
9,10,0.115543
