In [None]:
import pandas as pd
# Location of the raw CSV export
file_path = './data/dataset/disk_data.csv'
# Only these columns are read from the file; everything else is skipped
cols = [
    'date',
    'serial_number',
    'model',
    'failure',
    'vault_id',
    's9_power_on_hours',
]

df = pd.read_csv(file_path, usecols=cols)


In [None]:
# Print column names and their NaN count
def print_nan_count_per_column(df):
    """Print, for every column of ``df``, how many of its values are missing.

    Args:
        df: DataFrame to inspect.
    """
    missing_mask = df.isna()
    print(missing_mask.sum())

def print_wrong_vault_id_count(df):
    """Print how many rows carry the literal text 'vault_id' in the vault_id column.

    Args:
        df: DataFrame that has a 'vault_id' column.
    """
    is_placeholder = df['vault_id'] == 'vault_id'
    bad_rows = df[is_placeholder]
    print('Wrong vault id count', len(bad_rows))

# Print one line per column with its name and dtype
def print_col_types(df):
    """Print the dtype of every column in ``df``, one column per line.

    Args:
        df: DataFrame to inspect.
    """
    for column, dtype in zip(df.columns, df.dtypes):
        print(f"Column: {column}, Dtype: {dtype}")

def print_wrong_serial_count(df):
    """Print how many serial numbers contain a '-' character.

    Args:
        df: DataFrame that has a string-valued 'serial_number' column.
    """
    has_dash = df['serial_number'].str.contains('-')
    print('Wrong serial number count', has_dash.sum())
        


In [None]:
# Run every diagnostic helper on the freshly loaded frame, in a fixed order
for report in (
    print_wrong_vault_id_count,
    print_nan_count_per_column,
    print_col_types,
    print_wrong_serial_count,
):
    report(df)

In [None]:
# Remove unusable rows before any type conversion.
df = df.dropna(axis=0)  # drop every row that contains at least one NaN
# Drop rows whose vault_id holds the literal text 'vault_id'
# (presumably header lines repeated inside the CSV; verify against the data).
# The explicit copy makes the result an independent frame, so later column
# assignments on `df` do not raise SettingWithCopyWarning.
df = df[df['vault_id'] != 'vault_id'].copy()

In [None]:
# Re-run the diagnostics on the current state of the frame
print_nan_count_per_column(df)
# This must be a call: a bare function name is a no-op expression and prints nothing
print_wrong_vault_id_count(df)
print_col_types(df)

In [None]:
def can_convert_to_int(value):
    """Report whether ``int(value)`` succeeds.

    Args:
        value: Any object to test.

    Returns:
        bool: True when ``int(value)`` works. False otherwise, in which case
        the offending value is also printed.
    """
    try:
        int(value)
    except (ValueError, TypeError, OverflowError):
        # ValueError: unparsable text or NaN; TypeError: None and other
        # unsupported objects; OverflowError: infinite floats.
        print('Cannot convert to int: ', value)
        return False
    return True
    
# Name of the column whose values are probed
column_to_check = 'vault_id'

# Positions (0-based, as produced by enumerate, not index labels) of the
# values in that column that int() accepts
convertible_indices = [
    idx
    for idx, is_convertible in enumerate(map(can_convert_to_int, df[column_to_check]))
    if is_convertible
]

In [None]:
# Parse the dates and reset any time-of-day part to midnight. This replaces the
# slower to_datetime -> strftime('%Y-%m-%d') -> to_datetime round trip.
# NOTE(review): assumes the parsed dates are timezone-naive; confirm.
df['date'] = pd.to_datetime(df['date']).dt.normalize()
df['serial_number'] = df['serial_number'].astype(str)
df['model'] = df['model'].astype(str)
# Convert to a number first: casting text straight to bool marks every
# non-empty string as True, so a textual '0' would be recorded as a failure.
# NOTE(review): assumes failure is encoded as 0/1; confirm against the data.
df['failure'] = pd.to_numeric(df['failure']).astype(bool)
df['vault_id'] = df['vault_id'].astype(int)
# Cast through float before int, as the original two-step conversion did
# (presumably because values can look like '123.0'; verify against the data).
df['s9_power_on_hours'] = df['s9_power_on_hours'].astype(float).astype(int)

In [None]:
# Print each column's dtype for the current state of df
print_col_types(df)

In [None]:
import pyarrow.parquet as pq
import pandas as pd

def is_parquet_file(file_path):
    """Tell whether Parquet metadata can be read from ``file_path``.

    Args:
        file_path: Location handed to ``pq.read_metadata``.

    Returns:
        bool: True when the metadata is read without error, False when the
        read raises ``pq.lib.ArrowIOError`` or ``ValueError``.
    """
    try:
        # Only the metadata is requested here
        pq.read_metadata(file_path)
    except (pq.lib.ArrowIOError, ValueError):
        # A failed metadata read is treated as "not a Parquet file"
        return False
    else:
        return True

# Example usage: check one path and report the outcome
file_path = '~/Desktop/disk_data.parquet'
message = (
    f"{file_path} is a Parquet file."
    if is_parquet_file(file_path)
    else f"{file_path} is not a Parquet file."
)
print(message)
    


In [None]:
# Read the Parquet file at file_path into a DataFrame
dfp = pd.read_parquet(file_path)
# Preview the first rows; as the cell's last expression it is displayed
dfp.head()

In [None]:
# Print each column's dtype for the frame read from Parquet
print_col_types(dfp)