# Weather forecasting

## 1. Overview

### 1.1 Project Description
(content)


In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
#import pandas_datareader as web
import datetime as dt
#import seaborn as sns



### 1.2 Dataset overview
link: https://www.kaggle.com/datasets/vanviethieuanh/vietnam-weather-data?fbclid=IwAR0wTzEzC_QOh10_F4su1WL9ccU70RFFIFaUuJkr-p7FszG3KS3t9AhbHT4 
(content)


In [None]:
# Locate the dataset relative to the current working directory
# (expects ./data/weather.csv).
BASE_DIR = os.getcwd()
data_path = os.path.join(BASE_DIR,'data','weather.csv')
# Load the CSV into a DataFrame and preview its first rows.
df = pd.read_csv(data_path)
df.head()

In [None]:
# Summarize the columns, their dtypes and non-null counts.
df.info()

In [None]:
# number of province and the amount of entity on each   
provinces = df["province"].value_counts()
provinces

In [None]:
#amount of province
print(len(provinces),type(provinces))

In [None]:
# Plain Python list of the province names, in value_counts() order.
province_list = provinces.index.tolist()

In [None]:
# Sanity check: confirm province_list is a plain Python list.
type(province_list)

## 1. Data Exploration


### 1.1 Extract weather data by province name 

In [None]:
def extract_data_by_province(df, province_name):
    """Return the rows of ``df`` for one province as a date-indexed frame.

    Args:
        df: DataFrame with at least ``province`` and ``date`` columns.
        province_name: Value of the ``province`` column to select.

    Returns:
        A new DataFrame holding only the matching rows, without the
        ``province`` column, indexed by ``date`` and sorted by that index.
    """
    return (
        df.query("province == @province_name")
        .drop(columns="province")
        .set_index("date")
        .sort_index()
    )

In [None]:
# Build a lookup from province name to its date-indexed weather table.
data_extracted = {
    name: extract_data_by_province(df, name) for name in province_list
}

len(data_extracted)

In [None]:
# Spot check: show the extracted table for a single province.
print(data_extracted['Bac Lieu'])

### 1.2 Data visualization

Data visualization

In [None]:
# NOTE(review): DataFrame.squeeze('index') only collapses the row axis when the
# frame has exactly one row; otherwise the frame is returned unchanged. If the
# goal was to get the dates, `.index` is probably what was meant - confirm.
index = data_extracted['Bac Lieu'].squeeze('index')

In [None]:
# Display the value produced by the squeeze in the previous cell.
index

In [None]:
# Preview one province: print its summary statistics and plot its "min" column.
for name, data in data_extracted.items():
    print(data.describe())
    # Draw on a single, explicitly created axes so the series, the axis label
    # and the title all land on the same figure and no empty figure is left
    # behind.
    fig, ax = plt.subplots(figsize=(10, 5))
    data["min"].plot(ax=ax)
    ax.set_xlabel("date")
    # Title the plot with the province it shows.
    ax.set_title(str(name))
    plt.show()
    # Only the first province is previewed; remove the break to plot them all.
    break

 

In [None]:
# Descriptive statistics for the whole data set.
df.describe()

## Clean data

- Remove redundant columns
- Remove duplicates
- Handle missing values
- Convert data types to the correct data type

In [None]:
# Drop columns whose name starts with "Unnamed" (the pattern is anchored with ^).
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
# Drop columns that contain only missing values.
# NOTE(review): duplicate rows are NOT removed here, although the checklist
# above mentions it.
df = df.dropna(axis='columns',how='all')

In [None]:
def fill_nan_cell(df):
    """Fill missing values in every numeric column with that column's mean.

    Non-numeric columns (for example text or date strings) are left untouched,
    because a mean is not defined for them.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, so the call can be used in an assignment.
    """
    numeric_cols = df.select_dtypes(include="number").columns
    for col in numeric_cols:
        # Assign the filled column back instead of calling
        # fillna(inplace=True) on df[col]: the in-place form operates on an
        # intermediate Series and is not guaranteed to update the frame.
        df[col] = df[col].fillna(df[col].mean())
    return df

In [None]:
# Apply the mean imputation to the cleaned frame.
df = fill_nan_cell(df)

In [None]:
# Re-inspect the columns and non-null counts after cleaning.
df.info()

## Visualization

In [None]:
# Line plot of the raw "Temparature" column against the row index.
df["Temparature"].plot(figsize=(10,5), title="Temparature")


In [None]:
# Same plot as the previous cell (redundant repeat).
df["Temparature"].plot(figsize=(10,5), title="Temparature")


## Standardize

In [None]:
# Z-score the temperature column: subtract its mean, divide by its standard
# deviation.
temperature = df["Temparature"]
df["Temparature_std"] = (temperature - temperature.mean()) / temperature.std()

In [None]:
# Line plot of the standardized temperature against the row index.
df["Temparature_std"].plot(figsize=(20,10))

## Wavelet Transform



<img src="/home/nghia/Work/code/DBM301/Assigment/illustration/The_Wavelet_Transform_for_Beginners.gif" style="width:1000px;height:700px;">

Reference:
- https://ataspinar.com/2018/12/21/a-guide-for-using-the-wavelet-transform-in-machine-learning/ 
- https://youtu.be/jnxqHcObNK4 
- https://youtu.be/eJLF9HeZA8I 

Wavelet-transformed data can be truncated, which is useful for data reduction: if we keep only a small fraction of the strongest wavelet coefficients, we obtain a compressed approximation of the original data.

In [None]:
import pywt
# Standardized temperature as a NumPy array; squeeze() removes any length-1 axes.
signal = df["Temparature_std"].values.squeeze()
# Row index of df as a NumPy array, used as the x-axis in the plots further down.
# NOTE(review): the name `time` would shadow the stdlib module if it is ever imported.
time = df.index.to_numpy()


In [None]:
# Check the shape of the signal array.
print(signal.shape)

In [None]:

# Plot the untouched signal first for reference.
fig, ax = plt.subplots(figsize=(6, 1))
ax.plot(signal)
ax.set_title("Original Signal: ")
plt.show()

waveletname = 'db5'
n_levels = 5
data = signal

# Apply a single-level DWT repeatedly, each time to the approximation
# coefficients of the previous pass. One row of axes per level:
# approximation on the left, detail on the right.
fig, axarr = plt.subplots(nrows=n_levels, ncols=2, figsize=(6, 6))
for ii, (ax_approx, ax_detail) in enumerate(axarr):
    data, coeff_d = pywt.dwt(data, waveletname)
    print(data.shape, coeff_d.shape)

    ax_approx.plot(data, 'r')
    ax_detail.plot(coeff_d, 'g')

    ax_approx.set_ylabel(f"Level {ii + 1}", fontsize=14, rotation=90)
    ax_approx.set_yticklabels([])
    ax_detail.set_yticklabels([])

    # Column headers only on the top row.
    if ii == 0:
        ax_approx.set_title("Approximation coefficients", fontsize=14)
        ax_detail.set_title("Detail coefficients", fontsize=14)
plt.tight_layout()
plt.show()

- The DWT returns two sets of coefficients: the approximation coefficients and the detail coefficients.
- The approximation coefficients represent the output of the low pass filter (averaging filter) of the DWT.
- The detail coefficients represent the output of the high pass filter (difference filter) of the DWT.

The length of the coefficient arrays depends on the selected mode. For all modes except periodization:

len(cA) == len(cD) == floor((len(data) + wavelet.dec_len - 1) / 2)

For periodization mode (“per”):

len(cA) == len(cD) == ceil(len(data) / 2)

In [None]:
## Wavelet compression
# Keep only the largest-magnitude wavelet coefficients, zero the rest, and
# reconstruct the signal to see how much of it survives at each keep ratio.
level = 5
waveletname = "db5"

# Show the original signal for comparison.
fig, ax = plt.subplots(figsize=(8, 4))
ax.set_title("Original Signal: ")
ax.plot(time, signal)
plt.show()

# Multilevel decomposition, flattened into one array so that all coefficients
# can be thresholded together.
coeffs = pywt.wavedec(signal, waveletname, level=level)
coeff_arr, coeff_slices = pywt.coeffs_to_array(coeffs)

# Coefficient magnitudes in ascending order, used to look up the thresholds.
Csort = np.sort(np.abs(coeff_arr.reshape(-1)))
print(Csort.shape)

for keep in (0.5, 0.1, 0.05, 0.001):
    # Position in the sorted magnitudes where the kept fraction starts.
    cutoff = int(np.floor((1 - keep) * len(Csort)))
    thresh = Csort[cutoff]
    ind = np.abs(coeff_arr) > thresh
    cfilt = coeff_arr * ind

    coeffs_filt = pywt.array_to_coeffs(cfilt, coeff_slices, output_format='wavedec')

    # Plot the reconstruction. waverec returns one extra sample when the input
    # length is odd, so trim it to the signal length to keep it aligned with
    # `time`.
    compress = pywt.waverec(coeffs_filt, wavelet=waveletname)[:len(signal)]
    plt.figure(figsize=(8, 4))
    plt.plot(time, compress)
    plt.title("keep = " + str(keep))
    plt.show()

    # Size of the kept fraction (an upper bound on the non-zero coefficients,
    # because the comparison against the threshold is strict).
    print(len(Csort) - cutoff)



In [None]:

# Single-level DWT round trip: decompose, reconstruct, and overlay both curves.
# Use one signal-extension mode for the forward and the inverse transform so
# the two are guaranteed to be consistent ('symmetric' is pywt's default).
mode = 'symmetric'
(cA1, cD1) = pywt.dwt(signal, 'db2', mode)
# idwt yields one extra sample for an odd-length input; trim it to the signal
# length so it can be plotted against `time`.
reconstructed_signal = pywt.idwt(cA1, cD1, 'db2', mode)[:len(signal)]

fig, ax = plt.subplots(figsize=(20, 10))
ax.plot(time, signal, label='signal')
ax.plot(time, reconstructed_signal, label='reconstructed signal', linestyle='--')
ax.legend(loc='upper left')
plt.show()