In [1]:
import pandas as pd

In [2]:
# Read the April and October CSV exports from the Kaggle input dataset.
df_april, df_october = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/clean-aws-month/final_data/filled_data_april.csv',
        '/kaggle/input/clean-aws-month/final_data/filled_data_october.csv',
    )
)

In [3]:
# Parse each frame's 'datetime' column with pd.to_datetime so it can be
# compared against timestamp bounds when the data is split.
df_april['datetime'] = df_april['datetime'].pipe(pd.to_datetime)
df_october['datetime'] = df_october['datetime'].pipe(pd.to_datetime)

In [4]:
def convert_rain_label(df):
    """Binarize the 'AWS' column into a 0/1 label.

    Values strictly greater than 0 become 1; everything else (zero,
    negative values, NaN) becomes 0.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a numeric 'AWS' column. The column is overwritten in
        place on this same object.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with 'AWS' stored as int64 0/1.
    """
    # Vectorized comparison instead of a per-row Python lambda. NaN > 0 is
    # False, so missing values still map to 0. The explicit int64 cast also
    # keeps an integer dtype when the frame has no rows.
    df['AWS'] = df['AWS'].gt(0).astype('int64')
    return df

In [5]:
# Replace the 'AWS' values with the binary 0/1 label in both monthly frames.
df_april, df_october = map(convert_rain_label, (df_april, df_october))

In [6]:
def _mask_for_ranges(df, ranges):
    """Return a boolean Series marking rows whose 'datetime' falls in any range."""
    # Start from an explicit all-False Series rather than the scalar False,
    # so an empty `ranges` still produces a valid row mask (df[False] would
    # raise KeyError).
    mask = pd.Series(False, index=df.index)
    for start, end in ranges:
        # Series.between is inclusive on both ends by default, i.e.
        # start <= datetime <= end.
        mask |= df['datetime'].between(start, end)
    return mask


def split_data_by_train_test_ranges(df, train_ranges, test_ranges):
    """Split ``df`` into train and test frames by 'datetime' windows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'datetime' column that can be compared against the
        range bounds.
    train_ranges : iterable of (start, end)
        Inclusive windows whose rows go to the train frame. Bounds may be
        anything comparable with the 'datetime' column (this notebook
        passes timestamp strings). May be empty.
    test_ranges : iterable of (start, end)
        Inclusive windows whose rows go to the test frame. May be empty.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)``. A row is selected when it falls in at
        least one of the corresponding windows; an empty list of windows
        yields an empty frame with the same columns. Overlap between train
        and test windows is not checked.
    """
    train_df = df[_mask_for_ranges(df, train_ranges)]
    test_df = df[_mask_for_ranges(df, test_ranges)]
    return train_df, test_df

# Huấn luyện tổng quát

### Train tháng 4 2019, tháng 4 2020 và tháng 10 2019 
### Test tháng 10 2020

In [7]:
# Stack the April rows on top of the October rows and renumber the index.
df_all = pd.concat((df_april, df_october), axis=0, ignore_index=True)

In [8]:
# Display (rows, columns) of the combined April + October frame.
df_all.shape

(657666, 33)

In [9]:
# (start, end) timestamp windows for the general model; the splitter treats
# both bounds as inclusive.
# Train: April 2019, October 2019 and April 2020.
train_ranges = [
    ("2019-04-01 00:00:00", "2019-04-30 23:00:00"),
    ("2019-10-01 00:00:00", "2019-10-31 23:00:00"),
    ("2020-04-01 00:00:00", "2020-04-30 23:00:00"),
]

# Test: October 2020.
test_ranges = [("2020-10-01 00:00:00", "2020-10-31 23:00:00")]

In [10]:
# Split the combined frame into train/test using the windows defined above.
train_df, test_df = split_data_by_train_test_ranges(
    df=df_all,
    train_ranges=train_ranges,
    test_ranges=test_ranges,
)

In [11]:
# Report (rows, columns) of each split, one per line.
print(
    f"Train set: {train_df.shape}",
    f"Test set: {test_df.shape}",
    sep="\n",
)

Train set: (444013, 33)
Test set: (213653, 33)


# Huấn luyện theo mùa

### Tháng 4: Train gồm 1-4-2019 đến 30-4-2019 và 1-4-2020 đến 18-4-2020, Test từ 19-4-2020 đến 30-4-2020

In [12]:
# Display (rows, columns) of the April frame.
df_april.shape

(305846, 33)

In [13]:
# Display (rows, columns) of the October frame.
df_october.shape

(351820, 33)

In [14]:
# (start, end) timestamp windows for the April model; the splitter treats
# both bounds as inclusive.
# Train: all of April 2019 plus 1-18 April 2020.
train_april_ranges = [
    ("2019-04-01 00:00:00", "2019-04-30 23:00:00"),
    ("2020-04-01 00:00:00", "2020-04-18 23:00:00"),
]

# Test: 19-30 April 2020.
test_april_ranges = [("2020-04-19 00:00:00", "2020-04-30 23:00:00")]

In [15]:
# Split the April frame into train/test using the April windows defined above.
train_april, test_april = split_data_by_train_test_ranges(
    df=df_april,
    train_ranges=train_april_ranges,
    test_ranges=test_april_ranges,
)

In [16]:
# Report (rows, columns) of the April train and test splits, one per line.
print(
    f"Train set: {train_april.shape}",
    f"Test set: {test_april.shape}",
    sep="\n",
)

Train set: (249931, 33)
Test set: (55915, 33)


### Tháng 10: Train gồm 1-10-2019 đến 31-10-2019 và 1-10-2020 đến 18-10-2020, Test từ 19-10-2020 đến 31-10-2020

In [17]:
# (start, end) timestamp windows for the October model; the splitter treats
# both bounds as inclusive.
# Train: all of October 2019 plus 1-18 October 2020.
train_october_ranges = [
    ("2019-10-01 00:00:00", "2019-10-31 23:00:00"),
    ("2020-10-01 00:00:00", "2020-10-18 23:00:00"),
]

# Test: 19-31 October 2020.
test_october_ranges = [("2020-10-19 00:00:00", "2020-10-31 23:00:00")]

In [18]:
# Split the October frame into train/test using the October windows defined above.
train_october, test_october = split_data_by_train_test_ranges(
    df=df_october,
    train_ranges=train_october_ranges,
    test_ranges=test_october_ranges,
)

In [19]:
# Report (rows, columns) of the October train and test splits, one per line.
print(
    f"Train set: {train_october.shape}",
    f"Test set: {test_october.shape}",
    sep="\n",
)

Train set: (262501, 33)
Test set: (89319, 33)
