In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mticks

import missingno as miss

import seaborn as sns
import numpy as np
# import statsmodels.tsa.api as tsa

# Show floats in displayed frames with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

In [2]:
# Global plot styling: seaborn "notebook" context with slightly smaller fonts,
# followed by matplotlib's ggplot style sheet.
# NOTE(review): the call order is kept as-is because both calls may touch the
# same rcParams - confirm before reordering.
sns.set_context(context='notebook', font_scale=0.9)
plt.style.use('ggplot')

In [3]:
# Standard-library date/time module, imported under the alias dt.
# The datetime class is the one intended for use because it is the most flexible.
# NOTE(review): dt is not referenced in any cell visible here, and this import
# sits outside the import cell at the top - confirm it is still needed.
import datetime as dt

In [4]:
# Load the raw weather CSV (path is relative to the working directory) and
# display the resulting frame.
csv_path = "Data/london_weather_MODIFIED - london_weather_MODIFIED.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,date,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth
0,19790101,2.00,7.00,52.00,2.30,-4.10,-7.50,0.40,101900.00,9.00
1,19790102,6.00,1.70,27.00,1.60,-2.60,-7.50,0.00,102530.00,8.00
2,19790103,5.00,0.00,13.00,1.30,-2.80,-7.20,0.00,102050.00,4.00
3,19790104,8.00,0.00,13.00,-0.30,-2.60,-6.50,0.00,100840.00,2.00
4,19790105,6.00,2.00,29.00,5.60,-0.80,-1.40,0.00,102250.00,1.00
...,...,...,...,...,...,...,...,...,...,...
15336,20201227,1.00,0.90,32.00,7.50,7.50,7.60,2.00,98000.00,
15337,20201228,7.00,3.70,38.00,3.60,1.10,-1.30,0.20,97370.00,
15338,20201229,7.00,0.00,21.00,4.10,2.60,1.10,0.00,98830.00,
15339,20201230,6.00,0.40,22.00,5.60,2.70,-0.10,0.00,100200.00,


In [5]:
# Check column dtypes; note that 'date' is loaded as int64, not as a datetime.
df.dtypes

date                  int64
cloud_cover         float64
sunshine            float64
global_radiation    float64
max_temp            float64
mean_temp           float64
min_temp            float64
precipitation       float64
pressure            float64
snow_depth          float64
dtype: object

In [6]:
# (rows, columns) of the frame as loaded.
df.shape

(15341, 10)

In [8]:
# Preview the first five rows.
df.head()

Unnamed: 0,date,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth
0,19790101,2.0,7.0,52.0,2.3,-4.1,-7.5,0.4,101900.0,9.0
1,19790102,6.0,1.7,27.0,1.6,-2.6,-7.5,0.0,102530.0,8.0
2,19790103,5.0,0.0,13.0,1.3,-2.8,-7.2,0.0,102050.0,4.0
3,19790104,8.0,0.0,13.0,-0.3,-2.6,-6.5,0.0,100840.0,2.0
4,19790105,6.0,2.0,29.0,5.6,-0.8,-1.4,0.0,102250.0,1.0


# (Part 1) #1- Convert the date column to datetime dtype

In [10]:
# INCORRECT - without an explicit format the integers are treated as nanosecond
# offsets from the Unix epoch, so every value lands on 1970-01-01 (see output)
# instead of being read as a YYYYMMDD date. Shown for comparison only; the
# result is not assigned.
pd.to_datetime(df['date'])

0       1970-01-01 00:00:00.019790101
1       1970-01-01 00:00:00.019790102
2       1970-01-01 00:00:00.019790103
3       1970-01-01 00:00:00.019790104
4       1970-01-01 00:00:00.019790105
                     ...             
15336   1970-01-01 00:00:00.020201227
15337   1970-01-01 00:00:00.020201228
15338   1970-01-01 00:00:00.020201229
15339   1970-01-01 00:00:00.020201230
15340   1970-01-01 00:00:00.020201231
Name: date, Length: 15341, dtype: datetime64[ns]

In [11]:
# CORRECT - an explicit format makes pandas read each integer as a YYYYMMDD
# calendar date rather than as an offset from the Unix epoch.
parsed_dates = pd.to_datetime(df['date'], format='%Y%m%d')
df['date'] = parsed_dates
df['date']

0       1979-01-01
1       1979-01-02
2       1979-01-03
3       1979-01-04
4       1979-01-05
           ...    
15336   2020-12-27
15337   2020-12-28
15338   2020-12-29
15339   2020-12-30
15340   2020-12-31
Name: date, Length: 15341, dtype: datetime64[ns]

# (Part 1) #2- Set the date column as the index

In [12]:
# Make the parsed 'date' column the index, then display the frame to inspect it.
# NOTE(review): this cell is not re-runnable on its own - after the first run
# 'date' is no longer a column, so a second set_index('date') would fail.
df = df.set_index('date')
df

Unnamed: 0_level_0,cloud_cover,sunshine,global_radiation,max_temp,mean_temp,min_temp,precipitation,pressure,snow_depth
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1979-01-01,2.00,7.00,52.00,2.30,-4.10,-7.50,0.40,101900.00,9.00
1979-01-02,6.00,1.70,27.00,1.60,-2.60,-7.50,0.00,102530.00,8.00
1979-01-03,5.00,0.00,13.00,1.30,-2.80,-7.20,0.00,102050.00,4.00
1979-01-04,8.00,0.00,13.00,-0.30,-2.60,-6.50,0.00,100840.00,2.00
1979-01-05,6.00,2.00,29.00,5.60,-0.80,-1.40,0.00,102250.00,1.00
...,...,...,...,...,...,...,...,...,...
2020-12-27,1.00,0.90,32.00,7.50,7.50,7.60,2.00,98000.00,
2020-12-28,7.00,3.70,38.00,3.60,1.10,-1.30,0.20,97370.00,
2020-12-29,7.00,0.00,21.00,4.10,2.60,1.10,0.00,98830.00,
2020-12-30,6.00,0.40,22.00,5.60,2.70,-0.10,0.00,100200.00,


In [13]:
# Confirm the index is a DatetimeIndex (the output shows no frequency set: freq=None).
df.index

DatetimeIndex(['1979-01-01', '1979-01-02', '1979-01-03', '1979-01-04',
               '1979-01-05', '1979-01-06', '1979-01-07', '1979-01-08',
               '1979-01-09', '1979-01-10',
               ...
               '2020-12-22', '2020-12-23', '2020-12-24', '2020-12-25',
               '2020-12-26', '2020-12-27', '2020-12-28', '2020-12-29',
               '2020-12-30', '2020-12-31'],
              dtype='datetime64[ns]', name='date', length=15341, freq=None)

# (Part 1) #3- Filter out only the required data:

![image.png](attachment:d7657ce1-e5a1-49fd-8c07-30cbe81db653.png)

In [15]:
# Keep rows from 2000 onward (label-based slice on the datetime index) and only
# the five measurement columns listed below.
start_year = '2000'
keep_cols = ['precipitation', 'mean_temp', 'min_temp', 'max_temp', 'snow_depth']
df = df.loc[start_year:, keep_cols]

df

Unnamed: 0_level_0,precipitation,mean_temp,min_temp,max_temp,snow_depth
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2000-01-01,0.00,7.00,4.90,10.80,0.00
2000-01-02,0.20,7.90,5.00,11.50,0.00
2000-01-03,6.00,9.40,7.20,9.50,0.00
2000-01-04,0.20,7.00,4.40,11.00,0.00
2000-01-05,0.80,6.40,1.90,10.80,0.00
...,...,...,...,...,...
2020-12-27,2.00,7.50,7.60,7.50,
2020-12-28,0.20,1.10,-1.30,3.60,
2020-12-29,0.00,2.60,1.10,4.10,
2020-12-30,0.00,2.70,-0.10,5.60,


# (Part 1) #4- Impute any missing values

In [16]:
# Count missing values per column before any imputation.
df.isna().sum()

precipitation     390
mean_temp         419
min_temp          386
max_temp          390
snow_depth       1752
dtype: int64

In [17]:
## Fill gaps in each temperature column by interpolating between neighbouring
## observations (pandas' default interpolation method).
for temp_col in ('mean_temp', 'max_temp', 'min_temp'):
    df[temp_col] = df[temp_col].interpolate()

In [18]:
# Re-check: the temperature columns should now report zero missing values.
df.isna().sum()

precipitation     390
mean_temp           0
min_temp            0
max_temp            0
snow_depth       1752
dtype: int64

In [19]:
# Treat a missing precipitation or snow-depth reading as "none recorded":
# replace the gaps in both columns with 0.
zero_fill = {'precipitation': 0, 'snow_depth': 0}
df = df.fillna(value=zero_fill)

In [20]:
# Final check: no column should have any missing values left.
df.isna().sum()

precipitation    0
mean_temp        0
min_temp         0
max_temp         0
snow_depth       0
dtype: int64

___

# (Part 2)
- Answer the Questions with Visualizations (Using the Correct Frequency)

![image.png](attachment:098ed000-8d80-4314-b74d-954dc6aa24e8.png)

___

# (Part 2) #Q1: What month had the most precipitation from 2000 through 2010?

![image.png](attachment:14bd2cdd-bdba-4029-9d5e-d697cdbe4cb3.png)

![image.png](attachment:26a83fca-0328-406f-bfea-e0db80ddfa0d.png)

![image.png](attachment:6f9a6fdf-9d80-47cb-8062-4882d9d10a15.png)

In [None]:
# TODO: Q1 is unfinished - no analysis code has been written for it yet.
# Revisit the practice material, then answer the question in this cell.

# (Part 2) #Q2: Which year between 2000-2020 had the coolest average temperature?

![image.png](attachment:88b76ba9-4846-4897-8296-320d4a74387f.png)

![image.png](attachment:527b3dab-809f-4d03-a879-d851e9e1ef8a.png)