In [1]:
import pandas as pd
import numpy as np
from api_utils import init_settings

# init_settings() is a project-local helper (api_utils); its two return values
# are used as the date range embedded in the input and output CSV file names.
start_date,end_date = init_settings()

In [2]:
# Load the raw extract for the configured date range and preview it.
raw_csv_path = f'data/01_raw/{start_date}_{end_date}.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,date,lat,long,country,city,mintemp_c,maxtemp_c,avgtemp_c,avghumidity,totalprecip_mm,description
0,2023-04-01,-34.59,-58.67,Argentina,Buenos Aires,11.3,23.8,17.0,54.0,0.0,Partly cloudy
1,2023-04-02,-34.59,-58.67,Argentina,Buenos Aires,14.6,28.1,20.8,53.0,0.0,Partly cloudy
2,2023-04-03,-34.59,-58.67,Argentina,Buenos Aires,15.2,19.9,17.6,88.0,26.0,Light rain shower
3,2023-04-04,-34.59,-58.67,Argentina,Buenos Aires,14.9,24.8,19.1,78.0,0.0,Fog
4,2023-04-05,-34.59,-58.67,Argentina,Buenos Aires,17.6,24.7,20.5,71.0,0.1,Patchy rain possible


### *Transformations*:
#### Derived columns
1. *cloud_coverage*: [boolean] whether the sky was cloudy or visibility was low (any description that is not "sunny")
2. *rain*: [boolean] whether it rained that day (`totalprecip_mm` > 0)
3. *rain_tomorrow*: [boolean] whether it rained on the following day
4. *mean_avghumidity_3days*: [float] mean of `avghumidity` over the previous 3 days (moving average)
5. *humidity_ratio*: [float] relative change in `avghumidity` versus the previous day

#### 1. cloud_coverage

In [3]:
# List the distinct weather descriptions and how often each occurs; the
# cloud_coverage flag is derived from this text column.
df["description"].value_counts()

Sunny                   16
Partly cloudy           11
Light rain shower        4
Cloudy                   3
Overcast                 3
Patchy rain possible     2
Fog                      1
Light rain               1
Moderate rain            1
Heavy rain               1
Mist                     1
Name: description, dtype: int64

In [4]:
# Normalise case so the match below does not depend on capitalisation.
df["description"] = df["description"].str.lower()

# cloud_coverage is 0 only when the description mentions "sunny", 1 otherwise.
# na=False makes a missing description count as "not sunny" explicitly: on
# object-dtype text str.contains yields NaN for missing values, and NaN is
# truthy inside np.where, so such rows would silently be flagged as sunny.
# regex=False because "sunny" is a literal substring, not a pattern.
is_sunny = df["description"].str.contains("sunny", na=False, regex=False)
df["cloud_coverage"] = np.where(is_sunny, 0, 1)


In [5]:
# Sanity check: the number of 0s should match the number of "sunny" descriptions.
df["cloud_coverage"].value_counts()

1    28
0    16
Name: cloud_coverage, dtype: int64

#### 2. rain

In [6]:
# Flag a day as rainy (1) when any precipitation was recorded, otherwise 0.
had_precipitation = df["totalprecip_mm"].gt(0)
df["rain"] = np.where(had_precipitation, 1, 0)
df["rain"].value_counts()

0    33
1    11
Name: rain, dtype: int64

#### 3. rain_tomorrow

In [7]:
# rain_tomorrow is the following day's rain flag, so the shift has to run over
# rows in date order. Shifting a date-sorted view and assigning the result back
# aligns on the index, which gives each row the right value whatever order the
# raw CSV rows arrive in (df itself is not reordered here).
# The latest date has no following day and stays NaN, hence the float column.
# NOTE(review): assumes one row per date (a single location) — confirm.
df['rain_tomorrow'] = df.sort_values('date')['rain'].shift(-1)
df[['rain_tomorrow','rain',"date"]].head()

Unnamed: 0,rain_tomorrow,rain,date
0,0.0,0,2023-04-01
1,1.0,0,2023-04-02
2,0.0,1,2023-04-03
3,1.0,0,2023-04-04
4,0.0,1,2023-04-05


#### 4. mean_avghumidity_3days

In [9]:
def calculate_mean_avg_humidity(data):
    """Add the mean of the previous three days' average humidity.

    The frame is sorted by 'date' in place, then a 'mean_avghumidity_3days'
    column is added. For each row the value is the mean of 'avghumidity' over
    the three preceding rows (the current day is excluded). The first three
    rows have no full history and are left as NaN. Missing humidity readings
    inside a window are skipped when averaging.

    The computation is positional, so it does not depend on the index labels
    being 0..n-1 or on the frame already being in date order.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'date' and 'avghumidity' columns. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, sorted and with the new column.
    """
    window = 3

    # Sort the data by date in ascending order (in place: callers rely on the
    # passed frame itself being updated).
    data.sort_values('date', inplace=True)

    # Shift by one so the window ends on the previous day, not the current one.
    previous_days = data['avghumidity'].shift(1)

    # min_periods=1 averages whatever readings are present in the window,
    # skipping NaN the same way Series.mean() does.
    trailing_mean = previous_days.rolling(window=window, min_periods=1).mean()

    # Rows without three full preceding days do not get a value.
    trailing_mean.iloc[:window] = float('nan')

    data['mean_avghumidity_3days'] = trailing_mean
    return data
    
# The helper updates df in place and returns the same frame; rebinding makes
# the data flow explicit.
df = calculate_mean_avg_humidity(df)
df.loc[:, ["date", "avghumidity", "mean_avghumidity_3days"]].head()

Unnamed: 0,date,avghumidity,mean_avghumidity_3days
0,2023-04-01,54.0,
1,2023-04-02,53.0,
2,2023-04-03,88.0,
3,2023-04-04,78.0,65.0
4,2023-04-05,71.0,73.0


#### 5. humidity_ratio

In [10]:
def calculate_humidity_ratio(data):
    """Add the day-over-day relative change in average humidity.

    The frame is sorted by 'date' in place, then a 'humidity_ratio' column is
    added holding the fractional change of 'avghumidity' versus the previous
    row, rounded to two decimals (e.g. 0.66 means +66%). The first row has no
    previous day and is NaN.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'date' and 'avghumidity' columns. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, sorted and with the new column.
    """
    # Sort the data by date in ascending order (in place: callers rely on the
    # passed frame itself being updated).
    data.sort_values('date', inplace=True)

    # Read from the frame that was passed in, not from a notebook-level global,
    # so the function works on any DataFrame.
    data['humidity_ratio'] = data['avghumidity'].pct_change().round(decimals=2)

    return data
    
# The helper updates df in place and returns the same frame; rebinding makes
# the data flow explicit.
df = calculate_humidity_ratio(df)
df.head()

Unnamed: 0,date,lat,long,country,city,mintemp_c,maxtemp_c,avgtemp_c,avghumidity,totalprecip_mm,description,cloud_coverage,rain,rain_tomorrow,mean_avghumidity_3days,humidity_ratio
0,2023-04-01,-34.59,-58.67,Argentina,Buenos Aires,11.3,23.8,17.0,54.0,0.0,partly cloudy,1,0,0.0,,
1,2023-04-02,-34.59,-58.67,Argentina,Buenos Aires,14.6,28.1,20.8,53.0,0.0,partly cloudy,1,0,1.0,,-0.02
2,2023-04-03,-34.59,-58.67,Argentina,Buenos Aires,15.2,19.9,17.6,88.0,26.0,light rain shower,1,1,0.0,,0.66
3,2023-04-04,-34.59,-58.67,Argentina,Buenos Aires,14.9,24.8,19.1,78.0,0.0,fog,1,0,1.0,65.0,-0.11
4,2023-04-05,-34.59,-58.67,Argentina,Buenos Aires,17.6,24.7,20.5,71.0,0.1,patchy rain possible,1,1,0.0,73.0,-0.09


#### Store transformed data

In [11]:
# Persist the enriched frame next to the raw extract, keyed by the same date range.
transformed_csv_path = f'data/02_transformed/{start_date}_{end_date}.csv'
df.to_csv(transformed_csv_path, index=False)