In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
# LOADING THE DATA
# Read the raw forecast export into a DataFrame (path is relative to the notebook).
weather_forecast_df = pd.read_csv(
    filepath_or_buffer="../raw_data/weather_forecast_2018_2023.csv"
)

In [3]:
# DROPPING UNNECESSARY COLUMNS
# .drop returns a new DataFrame, so the raw frame is left untouched.
forecast_cleaned_df = weather_forecast_df.drop(
    columns=[
        "convective",
        "slice dt unixtime",
        "lat",
        "lon",
        "dew_point",
        "snow_depth",
        "snow",
        "pressure",
        "ground_pressure",
        "ice",
        "fr_rain",
    ]
)

In [4]:
# FORMATTING THE DATE COLUMNS
# For both timestamp columns: keep only the first 19 characters of the string
# (the length of a "YYYY-MM-DD HH:MM:SS" stamp) and parse the result as a
# timezone-aware UTC datetime.
for date_col in ("forecast dt iso", "slice dt iso"):
    forecast_cleaned_df[date_col] = pd.to_datetime(
        forecast_cleaned_df[date_col].str[:19],
        utc=True,
    )

In [5]:
# ROUNDING THE FLOATS AND TURNING THEM TO INTEGERS
# Same round-then-cast treatment for each of the three columns.
for int_col in ("clouds", "humidity", "wind_deg"):
    forecast_cleaned_df[int_col] = forecast_cleaned_df[int_col].round().astype(int)

In [6]:
# SPLITTING THE DATASET BASED ON THE PREDICTION STEPS
# Keep only the rows whose "forecast dt iso" time-of-day is exactly 00:00:00.
midnight = pd.Timestamp("00:00:00+00:00").time()
is_midnight_forecast = forecast_cleaned_df["forecast dt iso"].dt.time == midnight
# .copy() turns the boolean-mask selection into an independent DataFrame, so
# assigning columns on it later does not raise SettingWithCopyWarning.
df_midnight_predictions = forecast_cleaned_df[is_midnight_forecast].copy()
df_midnight_predictions


Unnamed: 0,forecast dt unixtime,forecast dt iso,slice dt iso,temperature,humidity,clouds,wind_speed,wind_deg,rain,accumulated,hours,rate,probability
0,1507334400,2017-10-07 00:00:00+00:00,2017-10-07 00:00:00+00:00,9.07,92,0,3.78,278,0.0,0.0,1.0,0.000000,0.00
1,1507334400,2017-10-07 00:00:00+00:00,2017-10-07 01:00:00+00:00,9.49,89,33,4.64,281,0.0,0.0,1.0,0.000000,0.05
2,1507334400,2017-10-07 00:00:00+00:00,2017-10-07 02:00:00+00:00,9.64,88,54,5.09,280,0.0,0.0,1.0,0.000000,0.04
3,1507334400,2017-10-07 00:00:00+00:00,2017-10-07 03:00:00+00:00,9.63,89,64,5.25,276,0.0,0.0,1.0,0.000000,0.00
4,1507334400,2017-10-07 00:00:00+00:00,2017-10-07 04:00:00+00:00,9.61,90,68,5.26,272,0.0,0.0,1.0,0.000008,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2618504,1672444800,2022-12-31 00:00:00+00:00,2023-01-15 20:00:00+00:00,1.92,76,45,4.96,271,0.0,0.0,1.0,0.000000,0.00
2618505,1672444800,2022-12-31 00:00:00+00:00,2023-01-15 21:00:00+00:00,1.57,77,19,5.20,269,0.0,0.0,1.0,0.000000,0.00
2618506,1672444800,2022-12-31 00:00:00+00:00,2023-01-15 22:00:00+00:00,1.30,77,0,5.45,268,0.0,0.0,1.0,0.000000,0.00
2618507,1672444800,2022-12-31 00:00:00+00:00,2023-01-15 23:00:00+00:00,1.12,76,0,5.68,269,0.0,0.0,1.0,0.000000,0.00


In [7]:
# 1-Filter the rows between 8 am and 10 pm
# 'slice dt iso' was already parsed to datetime in the date-formatting cell,
# so it is read directly instead of being re-parsed and written back onto a
# filtered frame (that write-back is what raised SettingWithCopyWarning).
slice_hour = df_midnight_predictions['slice dt iso'].dt.hour
# between() is inclusive on both ends: hours 8, 9, ..., 22 are kept.
# .copy() makes df_filtered an independent frame that is safe to modify later.
df_filtered = df_midnight_predictions[slice_hour.between(8, 22)].copy()
df_filtered.head(17)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_midnight_predictions['slice dt iso'] = pd.to_datetime(df_midnight_predictions['slice dt iso'], format='%Y-%m-%d %H:%M:%S')


Unnamed: 0,forecast dt unixtime,forecast dt iso,slice dt iso,temperature,humidity,clouds,wind_speed,wind_deg,rain,accumulated,hours,rate,probability
8,1507334400,2017-10-07 00:00:00+00:00,2017-10-07 08:00:00+00:00,11.52,75,82,5.95,266,0.13,0.13,1.0,3.6e-05,0.24
9,1507334400,2017-10-07 00:00:00+00:00,2017-10-07 09:00:00+00:00,12.22,69,88,6.35,266,0.12,0.12,1.0,3.3e-05,0.3
10,1507334400,2017-10-07 00:00:00+00:00,2017-10-07 10:00:00+00:00,12.63,65,91,6.67,263,0.11,0.11,1.0,3.1e-05,0.26
11,1507334400,2017-10-07 00:00:00+00:00,2017-10-07 11:00:00+00:00,12.78,64,93,6.88,257,0.0,0.0,1.0,2.5e-05,0.19
12,1507334400,2017-10-07 00:00:00+00:00,2017-10-07 12:00:00+00:00,12.74,63,94,6.95,250,0.0,0.0,1.0,2.2e-05,0.21
13,1507334400,2017-10-07 00:00:00+00:00,2017-10-07 13:00:00+00:00,12.58,64,96,6.88,242,0.0,0.0,1.0,2.5e-05,0.4
14,1507334400,2017-10-07 00:00:00+00:00,2017-10-07 14:00:00+00:00,12.29,65,98,6.8,235,0.12,0.12,1.0,3.3e-05,0.67
15,1507334400,2017-10-07 00:00:00+00:00,2017-10-07 15:00:00+00:00,11.84,67,100,6.87,230,0.19,0.19,1.0,5.3e-05,0.9
16,1507334400,2017-10-07 00:00:00+00:00,2017-10-07 16:00:00+00:00,11.23,71,100,7.2,228,0.29,0.29,1.0,8.1e-05,1.0
17,1507334400,2017-10-07 00:00:00+00:00,2017-10-07 17:00:00+00:00,10.66,76,100,7.63,227,0.4,0.4,1.0,0.000111,1.0


In [81]:
# 2-create the rain*probability column and reduce the timestamps to dates
# Rebind to a freshly indexed frame first: reset_index returns a new DataFrame,
# so the column assignments below act on an independent object rather than on
# a slice of another frame (which raised SettingWithCopyWarning), and no
# in-place mutation is needed.
df_filtered = df_filtered.reset_index(drop=True)

# rain amount multiplied by its probability, row by row
df_filtered['rain*probability'] = df_filtered['rain'] * df_filtered['probability']

# Remove the hours from the dates: .dt.date yields plain date objects, and
# pd.to_datetime turns them back into a datetime64 column (time 00:00:00,
# no timezone).
for date_col in ('slice dt iso', 'forecast dt iso'):
    df_filtered[date_col] = pd.to_datetime(df_filtered[date_col].dt.date)

df_filtered.head(200)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['rain*probability'] = df_filtered['rain'] * df_filtered['probability']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['slice dt iso'] = pd.to_datetime(df_filtered['slice dt iso'], format='%Y-%m-%d').dt.date
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered['forecast

Unnamed: 0,forecast dt unixtime,forecast dt iso,slice dt iso,temperature,humidity,clouds,wind_speed,wind_deg,rain,accumulated,hours,rate,probability,rain*probability
0,1507334400,2017-10-07,2017-10-07,11.52,75,82,5.95,266,0.13,0.13,1.0,0.000036,0.24,0.0312
1,1507334400,2017-10-07,2017-10-07,12.22,69,88,6.35,266,0.12,0.12,1.0,0.000033,0.30,0.0360
2,1507334400,2017-10-07,2017-10-07,12.63,65,91,6.67,263,0.11,0.11,1.0,0.000031,0.26,0.0286
3,1507334400,2017-10-07,2017-10-07,12.78,64,93,6.88,257,0.00,0.00,1.0,0.000025,0.19,0.0000
4,1507334400,2017-10-07,2017-10-07,12.74,63,94,6.95,250,0.00,0.00,1.0,0.000022,0.21,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1507420800,2017-10-08,2017-10-10,13.69,78,100,5.56,242,0.37,0.37,1.0,0.000103,1.00,0.3700
196,1507420800,2017-10-08,2017-10-10,13.55,80,100,6.32,247,0.42,0.42,1.0,0.000117,1.00,0.4200
197,1507420800,2017-10-08,2017-10-10,13.53,83,100,6.54,253,0.42,0.42,1.0,0.000117,0.91,0.3822
198,1507420800,2017-10-08,2017-10-10,13.57,85,100,6.34,258,0.39,0.39,1.0,0.000108,0.80,0.3120


In [83]:
# Aggregate every row that shares the same 'slice dt iso' calendar date:
# first value for the two date columns, mean for the weather measurements,
# sum for rain*probability. The group key is then discarded from the index.
df_final = (
    df_filtered
    .groupby(df_filtered['slice dt iso'].dt.date)
    .agg({
        'forecast dt iso': 'first',
        'slice dt iso': 'first',
        'temperature': 'mean',
        'humidity': 'mean',
        'clouds': 'mean',
        'wind_speed': 'mean',
        'wind_deg': 'mean',
        'accumulated': 'mean',
        'rain*probability': 'sum',
    })
    .reset_index(drop=True)
)
df_final.tail(50)

    


Unnamed: 0,forecast dt iso,slice dt iso,temperature,humidity,clouds,wind_speed,wind_deg,accumulated,rain*probability
1877,2022-11-12,2022-11-27,5.555625,80.866667,78.2625,3.075958,151.941667,0.008,1.1152
1878,2022-11-13,2022-11-28,5.065792,80.291667,83.925,3.793458,159.541667,0.034167,7.06
1879,2022-11-14,2022-11-29,4.685542,81.445833,88.7,2.89975,154.145833,0.057375,7.573
1880,2022-11-15,2022-11-30,4.522667,77.454167,83.604167,3.146625,127.2125,0.019333,2.6731
1881,2022-11-16,2022-12-01,3.176417,77.729167,70.85,3.357083,117.9875,0.004625,0.4965
1882,2022-11-17,2022-12-02,2.064708,75.529167,70.25,3.16175,116.875,0.004708,0.7462
1883,2022-11-18,2022-12-03,1.5995,79.066667,82.954167,3.040583,121.470833,0.019333,1.8344
1884,2022-11-19,2022-12-04,1.670167,85.2125,76.491667,2.772708,97.941667,0.01,0.0685
1885,2022-11-20,2022-12-05,1.913833,87.3625,84.0375,2.831167,155.366667,0.045083,2.3808
1886,2022-11-21,2022-12-06,1.554375,85.095833,78.7875,3.296667,173.354167,0.078708,5.5862


In [82]:
# Day-by-day check: aggregate a single forecast date ('2017-10-07') per
# 'slice dt iso' calendar date to inspect what one forecast run looks like.
import numpy as np

(
    df_filtered
    .loc[df_filtered["forecast dt iso"] == '2017-10-07']
    .groupby(df_filtered['slice dt iso'].dt.date)
    .agg({
        'forecast dt iso': 'first',
        'slice dt iso': 'first',
        'temperature': 'mean',
        'humidity': 'mean',
        'clouds': 'mean',
        'wind_speed': 'mean',
        'wind_deg': 'mean',
        'accumulated': 'mean',
        'rain*probability': 'sum',
    })
)

Unnamed: 0_level_0,forecast dt iso,slice dt iso,temperature,humidity,clouds,wind_speed,wind_deg,accumulated,rain*probability
slice dt iso,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2017-10-07,2017-10-07,2017-10-07,11.623333,72.866667,96.0,7.092,243.733333,0.282,3.8806
2017-10-08,2017-10-07,2017-10-08,12.103333,58.0,24.133333,4.310667,313.866667,0.008667,0.0741
2017-10-09,2017-10-07,2017-10-09,11.486667,64.4,44.066667,3.703333,271.0,0.0,0.0
2017-10-10,2017-10-07,2017-10-10,13.195333,78.066667,98.533333,6.192667,242.466667,0.502667,7.0928
2017-10-11,2017-10-07,2017-10-11,11.415333,77.133333,92.066667,6.812667,294.8,0.237333,2.565
2017-10-12,2017-10-07,2017-10-12,12.585333,82.666667,93.866667,4.808,250.933333,0.043333,0.2476
2017-10-13,2017-10-07,2017-10-13,14.324667,78.4,73.933333,3.118667,266.533333,0.0,0.0
2017-10-14,2017-10-07,2017-10-14,14.774,84.466667,48.533333,1.928667,290.466667,0.0,0.0
2017-10-15,2017-10-07,2017-10-15,15.944,73.2,0.0,2.134667,113.933333,0.0,0.0
2017-10-16,2017-10-07,2017-10-16,14.585333,63.2,0.266667,4.928,117.933333,0.0,0.0


In [93]:
# Extract the distinct forecast dates (in order of first appearance) so that a
# boolean mask can be built for each of them.

dates_elements = df_filtered.loc[:, "forecast dt iso"].unique()

numpy.datetime64('2017-10-07T00:00:00.000000000')

In [115]:
# Build the list of dataframes to concatenate: one per forecast date, each
# aggregated per 'slice dt iso' calendar date.
#
# The frame is split once with groupby instead of being rescanned with a
# boolean mask (and having .dt.date recomputed over the whole column) for every
# forecast date. sort=False yields the groups in order of first appearance,
# which is the same order Series.unique() returns, and rows inside each group
# keep their original order, so the 'first' aggregations are unaffected.
daily_aggregations = {
    'forecast dt iso': 'first',
    'slice dt iso': 'first',
    'temperature': 'mean',
    'humidity': 'mean',
    'clouds': 'mean',
    'wind_speed': 'mean',
    'wind_deg': 'mean',
    'accumulated': 'mean',
    'rain*probability': 'sum',
}

df_list = [
    forecast_rows
    .groupby(forecast_rows['slice dt iso'].dt.date)
    .agg(daily_aggregations)
    for _, forecast_rows in df_filtered.groupby("forecast dt iso", sort=False)
]



In [133]:
# Create the final dataframe: stack the per-forecast frames, drop the
# 'accumulated' column, reset the index and rename the columns.
final_df = (
    pd.concat(df_list, axis=0)
    .drop(columns='accumulated')
    .reset_index(drop=True)
    .rename(columns={'slice dt iso': 'ds', 'temperature': 'temp', 'rain*probability': 'rain'})
)

# Set the number of decimals kept for each column
decimals_per_column = {
    'temp': 2,
    'humidity': 0,
    'clouds': 0,
    'wind_speed': 2,
    'wind_deg': 0,
    'rain': 2,
}
for col_name, n_decimals in decimals_per_column.items():
    final_df[col_name] = final_df[col_name].round(n_decimals)

# Save it as csv (without the index column)
final_df.to_csv('finall_pred_weather.csv', index=False)



