In [1]:
import pandas as pd
import numpy as np
from api_utils import init_settings

# init_settings (imported from the local api_utils module) supplies the two
# values that are interpolated into the raw and transformed CSV file names.
start_date,end_date = init_settings()

In [2]:
# Read the raw extract for the configured date range, then preview the first rows.
raw_csv_path = f'data/01_raw/{start_date}_{end_date}.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,date,lat,long,country,city,mintemp_c,maxtemp_c,avgtemp_c,avghumidity,totalprecip_mm,description
0,2023-01-01,-34.59,-58.67,Argentina,Buenos Aires,21.1,27.0,24.2,52.0,8.7,Moderate or heavy rain shower
1,2023-01-02,-34.59,-58.67,Argentina,Buenos Aires,16.3,33.2,24.5,45.0,0.0,Sunny
2,2023-01-03,-34.59,-58.67,Argentina,Buenos Aires,18.8,33.5,26.7,26.0,0.1,Patchy rain possible
3,2023-01-04,-34.59,-58.67,Argentina,Buenos Aires,18.3,31.9,25.0,42.0,0.0,Sunny
4,2023-01-05,-34.59,-58.67,Argentina,Buenos Aires,19.2,34.0,26.3,42.0,0.0,Sunny


In [3]:
# Summary statistics of the numeric columns, as a quick sanity check of value ranges.
df.describe()

Unnamed: 0,lat,long,mintemp_c,maxtemp_c,avgtemp_c,avghumidity,totalprecip_mm
count,134.0,134.0,134.0,134.0,134.0,134.0,134.0
mean,-34.59,-58.67,18.846269,30.41791,23.954478,53.365672,2.349701
std,6.418881e-14,5.705672e-14,4.789096,6.340037,5.227682,14.452692,5.699239
min,-34.59,-58.67,9.5,16.0,14.0,24.0,0.0
25%,-34.59,-58.67,14.95,24.925,19.675,43.25,0.0
50%,-34.59,-58.67,19.2,31.2,24.65,53.0,0.0
75%,-34.59,-58.67,22.5,35.175,28.1,63.75,0.675
max,-34.59,-58.67,29.5,43.9,35.1,94.0,31.3


### *Transformations*:
#### Derived columns
1. *cloud_coverage*: [boolean, stored as 0/1] the sky was not sunny (cloudy, rainy or low visibility)
2. *rain*: [boolean, stored as 0/1] it rained that day (totalprecip_mm > 0)
3. *rain_tomorrow*: [boolean] it rained on the following day (empty for the last date)
4. *mean_avghumidity_3days*: [float] mean of avghumidity over the previous 3 days (moving average)
5. *humidity_ratio*: [float] relative change in avghumidity versus the previous day (e.g. -0.13 = 13 % lower)

#### 1. cloud_coverage

In [4]:
# How often each weather description occurs; used to decide which labels mean a clear sky.
df["description"].value_counts()

Sunny                             51
Partly cloudy                     21
Patchy rain possible              18
Moderate or heavy rain shower      8
Overcast                           8
Cloudy                             7
Light rain shower                  7
Moderate rain at times             5
Moderate rain                      2
Patchy light rain with thunder     1
Patchy light rain                  1
Light drizzle                      1
Fog                                1
Light rain                         1
Heavy rain                         1
Mist                               1
Name: description, dtype: int64

In [5]:
# Normalise the text to lower case, then flag every day whose description
# does not mention "sunny" (0 = sunny, 1 = anything else).
description_lower = df["description"].str.lower()
df["description"] = description_lower
is_sunny = description_lower.str.contains("sunny")
df["cloud_coverage"] = np.where(is_sunny, 0, 1)


In [6]:
# Class balance of the new flag (0 = sunny, 1 = not sunny).
df["cloud_coverage"].value_counts()

1    83
0    51
Name: cloud_coverage, dtype: int64

#### 2. rain

In [7]:
# A day counts as rainy (1) when any precipitation at all was recorded, else 0.
had_precipitation = df["totalprecip_mm"].gt(0)
df["rain"] = np.where(had_precipitation, 1, 0)
df["rain"].value_counts()

0    86
1    48
Name: rain, dtype: int64

#### 3. rain_tomorrow

In [8]:
# "Tomorrow" must mean the next calendar day, not the next row of the file, so
# order the rain flags by date before shifting. The assignment realigns on the
# row index, so df itself keeps its current row order. The last date has no
# following day and stays NaN.
# NOTE(review): assumes one row per date (a single location) - confirm before
# loading several cities into the same frame.
rain_by_date = df.sort_values('date', kind='stable')['rain']
df['rain_tomorrow'] = rain_by_date.shift(-1)
df[['rain_tomorrow','rain',"date"]].head()

Unnamed: 0,rain_tomorrow,rain,date
0,0.0,1,2023-01-01
1,1.0,0,2023-01-02
2,0.0,1,2023-01-03
3,0.0,0,2023-01-04
4,0.0,0,2023-01-05


#### 4. mean_avghumidity_3days

In [9]:
def calculate_mean_avg_humidity(data, window=3):
    """Mean of ``avghumidity`` over the ``window`` days preceding each row.

    Rows are ordered by ``date`` and the window is taken by position in that
    order, so the result does not depend on the frame's index labels or on
    the row order of the input. The current day is never part of its own
    window.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``date`` and ``avghumidity``. It is not
        modified.
    window : int, default 3
        Number of preceding rows to average.

    Returns
    -------
    pandas.Series
        Float series in date order, carrying the original index labels and
        named ``mean_avghumidity_{window}days``. The first ``window`` rows
        are NaN because they lack a full history. Missing humidity values
        inside a window are skipped when averaging.

    Raises
    ------
    ValueError
        If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive number of days")

    # Sort a view of the single column we need instead of re-ordering the
    # caller's frame; a stable sort keeps tied dates in their input order.
    humidity_by_date = data.sort_values('date', kind='stable')['avghumidity']

    # shift(1) moves every value one row down, so the rolling window that
    # ends on a given row covers the `window` rows strictly before it.
    trailing_mean = (
        humidity_by_date
        .shift(1)
        .rolling(window, min_periods=1)
        .mean()
    )

    # min_periods=1 is only there so NaNs inside a window are skipped; rows
    # without `window` predecessors must still be reported as missing.
    trailing_mean.iloc[:window] = float('nan')

    return trailing_mean.rename(f'mean_avghumidity_{window}days')
    
# Attach the trailing-humidity feature (the assignment aligns on the row index)
# and preview it next to the column it is derived from.
df['mean_avghumidity_3days'] = calculate_mean_avg_humidity(df)
df[["date","avghumidity","mean_avghumidity_3days"]].head()

Unnamed: 0,date,avghumidity,mean_avghumidity_3days
0,2023-01-01,52.0,
1,2023-01-02,45.0,
2,2023-01-03,26.0,
3,2023-01-04,42.0,41.0
4,2023-01-05,42.0,37.666667


#### 5. humidity_ratio

In [10]:
def calculate_humidity_ratio(data):
    """Day-over-day relative change of ``avghumidity``.

    Orders ``data`` by ``date`` in place (the caller's frame is re-sorted as
    well) and returns, for every row, the fractional change of
    ``avghumidity`` with respect to the preceding row, rounded to two
    decimals. The first row has no predecessor and is therefore NaN.
    """
    data.sort_values(by='date', inplace=True)

    humidity = data['avghumidity']
    day_over_day_change = humidity.pct_change()
    return day_over_day_change.round(2)
    
# Attach the day-over-day humidity change and preview the frame with all derived columns.
df['humidity_ratio'] = calculate_humidity_ratio(df)
df.head()

Unnamed: 0,date,lat,long,country,city,mintemp_c,maxtemp_c,avgtemp_c,avghumidity,totalprecip_mm,description,cloud_coverage,rain,rain_tomorrow,mean_avghumidity_3days,humidity_ratio
0,2023-01-01,-34.59,-58.67,Argentina,Buenos Aires,21.1,27.0,24.2,52.0,8.7,moderate or heavy rain shower,1,1,0.0,,
1,2023-01-02,-34.59,-58.67,Argentina,Buenos Aires,16.3,33.2,24.5,45.0,0.0,sunny,0,0,1.0,,-0.13
2,2023-01-03,-34.59,-58.67,Argentina,Buenos Aires,18.8,33.5,26.7,26.0,0.1,patchy rain possible,1,1,0.0,,-0.42
3,2023-01-04,-34.59,-58.67,Argentina,Buenos Aires,18.3,31.9,25.0,42.0,0.0,sunny,0,0,0.0,41.0,0.62
4,2023-01-05,-34.59,-58.67,Argentina,Buenos Aires,19.2,34.0,26.3,42.0,0.0,sunny,0,0,0.0,37.666667,0.0


In [11]:
# Final look at the transformed frame before it is written to disk
# (same preview as at the end of the previous cell).
df.head()

Unnamed: 0,date,lat,long,country,city,mintemp_c,maxtemp_c,avgtemp_c,avghumidity,totalprecip_mm,description,cloud_coverage,rain,rain_tomorrow,mean_avghumidity_3days,humidity_ratio
0,2023-01-01,-34.59,-58.67,Argentina,Buenos Aires,21.1,27.0,24.2,52.0,8.7,moderate or heavy rain shower,1,1,0.0,,
1,2023-01-02,-34.59,-58.67,Argentina,Buenos Aires,16.3,33.2,24.5,45.0,0.0,sunny,0,0,1.0,,-0.13
2,2023-01-03,-34.59,-58.67,Argentina,Buenos Aires,18.8,33.5,26.7,26.0,0.1,patchy rain possible,1,1,0.0,,-0.42
3,2023-01-04,-34.59,-58.67,Argentina,Buenos Aires,18.3,31.9,25.0,42.0,0.0,sunny,0,0,0.0,41.0,0.62
4,2023-01-05,-34.59,-58.67,Argentina,Buenos Aires,19.2,34.0,26.3,42.0,0.0,sunny,0,0,0.0,37.666667,0.0


#### Store transformed data

In [12]:
# Persist the enriched frame next to the raw extract, without the row index.
transformed_csv_path = f'data/02_transformed/{start_date}_{end_date}.csv'
df.to_csv(transformed_csv_path, index=False)