In [1]:
from experiment.DataLoader import DataLoader
# import matplotlib.pyplot as plt
from datetime import date
import polars as pl
import altair as alt


# Load the BODMAS metadata and keep only the two columns this part of the
# notebook works with: the sample date and the malware flag.
data_loader = DataLoader()
metadata_df = data_loader.load_bodmas_metadata().select("date", "is_malware")

# malware_dates = metadata_df.filter(pl.col("is_malware")).select("date").to_series()
# goodware_dates = metadata_df.filter(pl.col("is_malware").not_()).select("date").to_series()

# plt.figure(figsize=(12, 6))
# plt.hist([malware_dates.to_numpy(), goodware_dates.to_numpy()], bins=30, label=['Malware', 'Goodware'])
# plt.xlabel("Date")
# plt.ylabel("Frequency")
# plt.legend()
# plt.xticks(rotation=45)
# plt.tight_layout()
# plt.savefig("timestamp_histogram.png")
# plt.show()

In [2]:
# Reusable polars predicates over the "date" and "is_malware" columns.
#
# Every period uses an inclusive lower bound (ge) and an exclusive upper bound
# (lt). With strict comparisons on both sides, rows dated exactly on a cutoff
# (2020-01-01, 2020-06-01, 2020-03-01) matched no period at all and silently
# dropped out of the counts and of the plot.
#
# NOTE(review): despite its name, expr_first_six_months covers 2020-01-01 up to
# (but not including) 2020-06-01 — confirm that 06-01 is the intended cutoff.
expr_before_first_months = pl.col("date").lt(date(2020, 1, 1))
expr_first_six_months = pl.col("date").ge(date(2020, 1, 1)) & pl.col("date").lt(date(2020, 6, 1))
expr_last_six_months = pl.col("date").ge(date(2020, 6, 1))

# Class predicates; "is_malware" is used directly as a filter mask.
expr_is_malware = pl.col("is_malware")
expr_is_not_malware = pl.col("is_malware").not_()

# Complementary split around 2020-03-01: the cutoff day belongs to "after".
expr_before_march = pl.col("date").lt(date(2020, 3, 1))
expr_after_march = pl.col("date").ge(date(2020, 3, 1))

In [3]:
def _count_rows(period_expr, class_expr):
    """Return how many rows of metadata_df satisfy both predicates."""
    return metadata_df.filter(period_expr & class_expr).height


def _ratio(numerator, denominator):
    """Return numerator / denominator rounded to 2 decimals, or NaN when the denominator is 0."""
    # An empty period (or an empty frame) must not abort the cell with ZeroDivisionError.
    return round(numerator / denominator, 2) if denominator else float("nan")


# Malware (m) and goodware (g) counts for each of the three periods.
t1_m = _count_rows(expr_before_first_months, expr_is_malware)
t1_g = _count_rows(expr_before_first_months, expr_is_not_malware)
t2_m = _count_rows(expr_first_six_months, expr_is_malware)
t2_g = _count_rows(expr_first_six_months, expr_is_not_malware)
t3_m = _count_rows(expr_last_six_months, expr_is_malware)
t3_g = _count_rows(expr_last_six_months, expr_is_not_malware)

# Total over the three periods only; rows matching none of the period
# predicates are not part of this denominator.
tot = t1_m + t1_g + t2_m + t2_g + t3_m + t3_g

# m_ratio: share of malware inside the period; percentage: share of the period in tot.
for label, m, g in (("T1", t1_m, t1_g), ("T2", t2_m, t2_g), ("T3", t3_m, t3_g)):
    print(f"{label}: m = {m}, g = {g}, m_ratio = {_ratio(m, m + g)}, percentage = {_ratio(g + m, tot)}")

T1: m = 15823, g = 31328, m_ratio = 0.34, percentage = 0.37
T2: m = 23748, g = 18651, m_ratio = 0.56, percentage = 0.33
T3: m = 17454, g = 21103, m_ratio = 0.45, percentage = 0.3


In [4]:
# Keep only the rows selected by expr_after_march (the side of the 2020-03-01
# cutoff that the plotting section looks at).
plot_df = metadata_df.filter(expr_after_march)

In [5]:
# Weekly malware / goodware counts for the rows selected by expr_after_march.
#
# The aggregation starts from metadata_df rather than from plot_df itself:
# overwriting plot_df with its own aggregate made this cell fail on a second
# run, because the aggregated frame no longer has an "is_malware" column.
plot_df = (
    metadata_df.filter(expr_after_march)
    .sort("date")  # group_by_dynamic expects the index column to be sorted
    .group_by_dynamic("date", every="1w")
    .agg(
        [
            # Summing the boolean flag counts the malware rows of the week ...
            pl.col("is_malware").sum().alias("malware"),
            # ... and summing its negation counts the goodware rows.
            pl.col("is_malware").not_().sum().alias("goodware"),
        ]
    )
)

In [6]:
# Two line layers over a shared temporal x axis: weekly malware counts in red,
# weekly goodware counts in blue.
base = alt.Chart(plot_df).encode(x='date:T')

malware_layer, goodware_layer = (
    base.mark_line(color=line_color).encode(y=count_field)
    for line_color, count_field in (('red', 'malware:Q'), ('blue', 'goodware:Q'))
)

# Stack the two layers into one chart and render it.
alt.layer(malware_layer, goodware_layer).show()

# alt.Chart(plot_df).mark_point().encode(
#     x="date", 
#     y=["malware", "goodware"], 
# ).show()

In [7]:
from experiment.DataLoader import DataLoader


# Reload the metadata (the earlier cells reduced metadata_df to two columns)
# together with the feature matrix and labels.
data_loader = DataLoader()
metadata_df = data_loader.load_bodmas_metadata()
X, y = data_loader.load_bodmas_data()

# split_data returns the train / validation / calibration partitions followed
# by the collection of test sets.
(
    X_train, y_train,
    X_val, y_val,
    X_cal, y_cal,
    test_sets,
) = data_loader.split_data(metadata_df, X, y)

In [8]:
# Shapes of the train / validation / calibration partitions, in that order.
arrays = [X_train, y_train, X_val, y_val, X_cal, y_cal]

print("Arrays shape:")
for array in arrays:
    print(array.shape)

print("\nTest weeks:")
# Show the keys of one test set; skipped when there are no test sets, where
# indexing test_sets[0] would raise IndexError.
if test_sets:
    print(test_sets[0].keys())

dates = [set_['week_start'].date() for set_ in test_sets]

# Print the first and last few week starts. Slicing replaces the hard-coded
# indices, so a short list prints in full and no longer raises IndexError.
preview = 3
if len(dates) >= 2 * preview:
    for week_start in dates[:preview]:
        print(week_start)
    print("...")
    for week_start in dates[-preview:]:
        print(week_start)
else:
    for week_start in dates:
        print(week_start)

Arrays shape:
(31854, 2381)
(31854,)
(6826, 2381)
(6826,)
(6827, 2381)
(6827,)

Test weeks:
dict_keys(['week_start', 'X_test', 'y_test'])
2020-02-24
2020-03-02
2020-03-09
...
2020-09-14
2020-09-21
2020-09-28


In [9]:
import numpy as np

# Sum of the labels divided by the partition size, rounded to two decimals,
# for the train, validation and calibration partitions (printed in that order).
print("Check y ratio across partitions...")
label_ratios = [
    round(np.sum(labels) / labels.shape[0], 2)
    for labels in (y_train, y_val, y_cal)
]
print(*label_ratios)

Check y ratio across partitions...
0.1 0.1 0.1
