# Prepare dataset of non-failing HDDs

In [1]:
# Imports
import pandas as pd
import numpy as np
import glob
from pathlib import Path
import pyarrow as pa
import pyarrow.parquet as pq
import dask.dataframe as ddf
from datetime import datetime, timedelta

Get the list of serial numbers for all drives that have failed. This list is used to exclude those HDDs when collecting non-failing drives.

In [5]:
# Load the day_minus_0 failures file (one record per failure in the dataset)
df_day_minus_0 = pd.read_csv(
    'C:/com748/code/com748/data/processed/day_minus_0/failures.csv'
)

# Per-model failure totals: the reference counts for balancing against healthy drives
model_failure_counts = df_day_minus_0['model'].value_counts()

# Number of failure records per file_date value
failure_days = df_day_minus_0['file_date'].value_counts()

# Distinct serial numbers of failed drives, used to exclude them from the healthy sample
failed_drives = df_day_minus_0['serial_number'].unique()

print(model_failure_counts)

model
ST4000DM000      5602
ST12000NM0007    2106
ST8000NM0055     1718
ST3000DM001      1454
ST12000NM0008    1349
ST8000DM002      1037
ST14000NM001G     418
Name: count, dtype: int64


Model ST4000DM000

In [21]:
# Build a random sample of non-failing drive records for a single model.
#
# For every daily parquet file: load only the rows of `model_name`, drop every
# drive whose serial number appears in `failed_drives`, draw up to
# `daily_sample_count` rows at random, and finally write the combined sample
# to CSV.
model_name = 'ST4000DM000'

# Failure count printed for this model by model_failure_counts.
# NOTE(review): this value is not applied anywhere in this cell, so the size of
# the sample is daily_sample_count x (number of non-empty daily files) - confirm
# whether balancing against this target happens in a later step.
model_target_value = 5602

# Upper bound on the number of rows drawn from each daily file.
daily_sample_count = 20

# One seeded generator shared by every draw: a fresh run reproduces the same
# sample, while each daily file still gets its own independent draw (passing
# the same integer seed to every .sample() call would repeat the same
# positional pick for files of equal length).
random_seed = 42
rng = np.random.default_rng(random_seed)

# glob.glob() returns paths in arbitrary order, so sort first; reversed() then
# gives a stable descending-by-filename traversal.
# NOTE(review): presumably the file stems are dates, which would make this
# newest-first - confirm against the naming of the daily files.
parquet_files = sorted(glob.glob('C:/com748/data/processed/daily/*.parquet'))

rows_list = []

for file in reversed(parquet_files):

    # Only rows of the target model are requested from the parquet file.
    df = pd.read_parquet(file, engine='pyarrow', filters=[("model", "==", model_name)])

    # Exclude every drive that is present in the failures list.
    df_filtered = df[~df['serial_number'].isin(failed_drives)]

    # Covers both "model absent from this file" and "all of its drives failed".
    if df_filtered.empty:
        continue

    # Never ask for more rows than are available on this day.
    sample_size = min(daily_sample_count, len(df_filtered.index))
    df_random_sample = df_filtered.sample(n=sample_size, random_state=rng, ignore_index=True)

    rows_list.extend(df_random_sample.to_dict(orient='records'))

df_non_failing_drives = pd.DataFrame(rows_list)
df_non_failing_drives.to_csv(
    f'C:/com748/code/com748/data/processed/non_failing_drives/random_{model_name}.csv',
    index=False,
)
