# Data Preprocessing Tools

## Importing the libraries

In [44]:
from pathlib import Path

import numpy as np
import pandas as pd

## Importing the dataset

In [45]:
# Read the raw utilization CSV and remember its column labels so a labelled
# frame can be rebuilt from the array later in the notebook.
dataset = pd.read_csv("../../raw/utilization.csv")
headers = dataset.columns.tolist()
# All rows and all columns of the frame as a single NumPy array.
X = dataset.to_numpy()

In [46]:
# Print the array built from the loaded dataset.
print(X)

[['JMAN_2429072' '10-02-2023' 2.0 ... 1.0 0 0]
 ['JMAN_9043466' '13-01-2023' 9.0 ... 1.0 0 0]
 ['JMAN_2452556' '24-02-2023' 1.0 ... 1.0 0 0]
 ...
 ['JMAN_9014565' '17-03-2023' 1.0 ... 1.0 0 0]
 ['JMAN_18362858' '24-03-2023' 1.0 ... 0.0 0 0]
 ['JMAN_2456281' '24-03-2023' 10.0 ... 0.8 0 0]]


## Formatting the date column

In [None]:
# Column 1 holds day-month-year strings (e.g. '10-02-2023'); parse them with
# an explicit format and store the parsed values back in the same column.
date_strings = X[:, 1]
X[:, 1] = pd.to_datetime(date_strings, format="%d-%m-%Y")

In [48]:
# Print X after the date conversion of column 1.
print(X)

[['JMAN_2429072' Timestamp('2023-02-10 00:00:00') 2.0 ... 1.0 0 0]
 ['JMAN_9043466' Timestamp('2023-01-13 00:00:00') 9.0 ... 1.0 0 0]
 ['JMAN_2452556' Timestamp('2023-02-24 00:00:00') 1.0 ... 1.0 0 0]
 ...
 ['JMAN_9014565' Timestamp('2023-03-17 00:00:00') 1.0 ... 1.0 0 0]
 ['JMAN_18362858' Timestamp('2023-03-24 00:00:00') 1.0 ... 0.0 0 0]
 ['JMAN_2456281' Timestamp('2023-03-24 00:00:00') 10.0 ... 0.8 0 0]]


## Taking care of missing data

In [49]:
from sklearn.impute import SimpleImputer

# Replace NaN entries in every column from index 2 onward with the mean of
# that column, writing the imputed values back into X.
# NOTE(review): the means are learned here, before duplicate rows are dropped
# in a later cell, so duplicated rows contribute to them — confirm intended.
feature_block = X[:, 2:]
imputer = SimpleImputer(missing_values=np.nan, strategy="mean")
X[:, 2:] = imputer.fit_transform(feature_block)

In [50]:
# Print X after the imputation step.
print(X)

[['JMAN_2429072' Timestamp('2023-02-10 00:00:00') 2.0 ... 1.0 0.0 0.0]
 ['JMAN_9043466' Timestamp('2023-01-13 00:00:00') 9.0 ... 1.0 0.0 0.0]
 ['JMAN_2452556' Timestamp('2023-02-24 00:00:00') 1.0 ... 1.0 0.0 0.0]
 ...
 ['JMAN_9014565' Timestamp('2023-03-17 00:00:00') 1.0 ... 1.0 0.0 0.0]
 ['JMAN_18362858' Timestamp('2023-03-24 00:00:00') 1.0 ... 0.0 0.0 0.0]
 ['JMAN_2456281' Timestamp('2023-03-24 00:00:00') 10.0 ... 0.8 0.0 0.0]]


## Convert to DataFrame and drop duplicates

In [51]:
# Rebuild a labelled frame from the processed array using the original column
# names, then drop rows that exactly repeat an earlier row (first one is kept).
df_imputed = pd.DataFrame(data=X, columns=headers)
df_cleaned = df_imputed.drop_duplicates(keep="first")

## Convert back to CSV

In [52]:
# Write the de-duplicated frame to CSV without the index column.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise a fresh checkout fails on this cell.
output_path = Path("../data_set/cleaned_utilization.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df_cleaned.to_csv(output_path, index=False)