In [1]:
import pandas as pd
import glob
import os

In [2]:
def parse_fault_info(file_name):
    """
    Parse fault type and intensity from filename.
    Example: FCU_Fouling_Cooling_Waterside_Moderate.csv should return:
    fault_type = "Fouling_Cooling_Waterside", fault_intensity = "Moderate"
    """
    
    # Remove file extension and FCU prefix
    parts = file_name.replace(".csv", "").split("_")
    if "FCU" in parts:
        parts.remove("FCU")
    
    # Special case handling based on the documentation in Table 3
    if parts[0] == "FaultFree":
        fault_type = "Fault_free"
        fault_intensity = "NA"
        
    elif parts[0] == "Fouling":
        # Handle patterns like Fouling_Cooling_Waterside_Moderate
        if len(parts) >= 4:
            fault_type = f"{parts[0]}_{parts[1]}_{parts[2]}"
            fault_intensity = "_".join(parts[3:])
        elif len(parts) == 3:
            fault_type = f"{parts[0]}_{parts[1]}"
            fault_intensity = parts[2]
        else:
            fault_type = parts[0]
            fault_intensity = "_".join(parts[1:]) if len(parts) > 1 else "NA"
    
    elif parts[0] == "VLVLeak" or parts[0] == "VLVStuck":
        # Handle patterns like VLVLeak_Heating_20 or VLVStuck_Heating_20
        if len(parts) >= 3:
            fault_type = f"{parts[0]}_{parts[1]}"
            fault_intensity = "_".join(parts[2:])
        else:
            fault_type = parts[0]
            fault_intensity = parts[1] if len(parts) > 1 else "NA"
    
    elif parts[0] == "SensorBias":
        # Handle patterns like SensorBias_RMTemp_+2C
        if len(parts) >= 3:
            fault_type = f"{parts[0]}_{parts[1]}"
            fault_intensity = "_".join(parts[2:])
        else:
            fault_type = parts[0]
            fault_intensity = parts[1] if len(parts) > 1 else "NA"
    
    elif parts[0] == "Control":
        # Handle patterns like Control_CoolingReverse
        fault_type = f"{parts[0]}_{parts[1]}"
        fault_intensity = "NA" # These don't have intensity in the documentation
    
    elif parts[0] == "OADMPRLeak" or parts[0] == "OADMPRStuck":
        # Handle patterns like OADMPRLeak_20
        fault_type = parts[0]
        fault_intensity = "_".join(parts[1:]) if len(parts) > 1 else "NA"
    
    elif parts[0] == "FilterRestriction":
        # Handle patterns like FilterRestriction_10%
        fault_type = parts[0]
        fault_intensity = "_".join(parts[1:]) if len(parts) > 1 else "NA"
    
    elif parts[0] == "OABlockage" or parts[0] == "FanOutletBlockage":
        fault_type = parts[0]
        fault_intensity = "NA"  # These don't have intensity in the documentation
    
    else:
        # Generic fallback
        fault_type = parts[0]
        fault_intensity = "_".join(parts[1:]) if len(parts) > 1 else "NA"
    
    return fault_type, fault_intensity

In [42]:
def load_datasets(data_dir):
    """
    Load all CSV files from the data directory and combine them into a single DataFrame
    with fault labels derived from filenames.
    """
    all_data = []
    
    # List of files 
    fault_files = glob.glob(os.path.join(data_dir, "FCU_*.csv"))
    # fault_files.append(os.path.join(data_dir, "Fault_free.csv"))
    
    for file_path in fault_files:
        file_name = os.path.basename(file_path)
        
        # Extract fault type and fault intensity from dault parsing function
        fault_type, fault_intensity = parse_fault_info(file_name)
        
        # Load data
        try:
            df = pd.read_csv(file_path)

            # Ensure datetime is datetime type
            df['Datetime'] = pd.to_datetime(df['Datetime'], format = '%m/%d/%Y %H:%M')
            
            # Set datetime as index
            df.set_index('Datetime', inplace = True)

            # Resample to 10-minute intervals and aggregate
            # Choose appropriate aggregation methods for each column
            resampled_df = df.resample('10min').agg({
                'FCU_CTRL': lambda x: x.mode().iloc[0] if len(x.mode()) > 1 else x.mode(),   # Most common control mode in the interval
                'FAN_CTRL': lambda x: x.mode().iloc[0] if len(x.mode()) > 1 else x.mode(),   # Most common fan mode
                'RM_TEMP': 'mean',            # Average room temperature
                'RMCLGSPT': 'mean',           # Average cooling setpoint
                'RMHTGSPT': 'mean',           # Average heating setpoint
                'FCU_MAT': 'mean',            # Average mixed air temperature
                'FCU_DAT': 'mean',            # Average discharge air temperature
                'FCU_RAT': 'mean',            # Average return air temperature
                'FCU_CVLV': 'mean',           # Average cooling valve position
                'FCU_CVLV_DM': 'mean',        # Average cooling valve command
                'FCU_CLG_GPM': 'mean',        # Average cooling flow rate
                'FCU_CLG_EWT': 'mean',        # Average entering water temperature
                'FCU_CLG_RWT': 'mean',        # Average return water temperature
                'FCU_HVLV': 'mean',           # Average heating valve position
                'FCU_HVLV_DM': 'mean',        # Average heating valve command
                'FCU_HTG_GPM': 'mean',        # Average heating flow rate
                'FCU_HTG_EWT': 'mean',        # Average entering water temperature
                'FCU_HTG_RWT': 'mean',        # Average return water temperature
                'FCU_DA_CFM': 'mean',         # Average discharge air flow rate
                'FCU_OA_CFM': 'mean',         # Average outside air flow rate
                'FCU_DMPR': 'mean',           # Average damper position
                'FCU_DMPR_DM': 'mean',        # Average damper command
                'FCU_SPD': 'mean',            # Average fan speed
                'FCU_OAT': 'mean',            # Average outside air temperature
                'FCU_WAT': 'mean',            # Average power consumption
                'FCU_MA_HUMD': 'mean',        # Average mixed air humidity
                'FCU_OA_HUMD': 'mean',        # Average outside air humidity
                'FCU_DA_HUMD': 'mean',        # Average discharge air humidity
                'FCU_RA_HUMD': 'mean'         # Average return air humidity
            })

            # Reset index to get timestamp back as a column
            resampled_df.reset_index(inplace = True)
            
            # Add fault labels
            resampled_df['fault_type'] = fault_type
            resampled_df['fault_intensity'] = fault_intensity
            
            # Add to list
            all_data.append(resampled_df)
            print(f"Loaded {file_name}: {fault_type}, {fault_intensity}")
        except Exception as e:
            print(f"Error loading {file_path}: {e}")
    
    # Combine all datasets
    if all_data:
        combined_data = pd.concat(all_data, ignore_index=True)
        return combined_data
    else:
        raise ValueError("No data files were loaded successfully")

In [43]:
data = load_datasets(r'.\datasets')

Loaded FCU_Control_CoolingReverse.csv: Control_CoolingReverse, NA
Loaded FCU_Control_HeatingReverse.csv: Control_HeatingReverse, NA
Loaded FCU_Control_Unstable.csv: Control_Unstable, NA
Loaded FCU_FanOutletBlockage.csv: FanOutletBlockage, NA
Loaded FCU_FaultFree.csv: Fault_free, NA
Loaded FCU_FilterRestriction_10%.csv: FilterRestriction, 10%
Loaded FCU_FilterRestriction_20%.csv: FilterRestriction, 20%
Loaded FCU_FilterRestriction_50%.csv: FilterRestriction, 50%
Loaded FCU_Fouling_Cooling_Airside_Minor.csv: Fouling_Cooling_Airside, Minor
Loaded FCU_Fouling_Cooling_Airside_Moderate.csv: Fouling_Cooling_Airside, Moderate
Loaded FCU_Fouling_Cooling_Airside_Severe.csv: Fouling_Cooling_Airside, Severe
Loaded FCU_Fouling_Cooling_Waterside_Minor.csv: Fouling_Cooling_Waterside, Minor
Loaded FCU_Fouling_Cooling_Waterside_Moderate.csv: Fouling_Cooling_Waterside, Moderate
Loaded FCU_Fouling_Cooling_Waterside_Severe.csv: Fouling_Cooling_Waterside, Severe
Loaded FCU_Fouling_Heating_Airside_Minor.csv

In [201]:
fault_data = data.copy()

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "C:\Users\boluwatife_3po\anaconda3\envs\mnist_classification\lib\site-packages\IPython\core\interactiveshell.py", line 3577, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\boluwatife_3po\AppData\Local\Temp\ipykernel_12128\1795045922.py", line 1, in <module>
    fault_data = data.copy()
  File "C:\Users\boluwatife_3po\anaconda3\envs\mnist_classification\lib\site-packages\pandas\core\generic.py", line 6811, in copy
    data = self._mgr.copy(deep=deep)
  File "C:\Users\boluwatife_3po\anaconda3\envs\mnist_classification\lib\site-packages\pandas\core\internals\managers.py", line 593, in copy
    res = self.apply("copy", deep=deep)
  File "C:\Users\boluwatife_3po\anaconda3\envs\mnist_classification\lib\site-packages\pandas\core\internals\managers.py", line 363, in apply
    applied = getattr(b, f)(**kwargs)
  File "C:\Users\boluwatife_3po\anaconda3\envs\mnist_classification\lib\site-packages\pandas\core\internals\b

In [2]:
fault_data = pd.read_csv('fault_data_2.csv')

In [None]:
columns = list(fault_data.dtypes[fault_data.dtypes == 'object'].index)
for col in columns:
    fault_data[col] = fault_data[col].str.lower().str.replace(' ','_')

In [None]:
fcu_ctrl_map = {
    0: 'shutdown',
    1: 'operate',
    2: 'setback'
}
fault_data['fcu_ctrl'] = fault_data['fcu_ctrl'].map(fcu_ctrl_map)

fan_ctrl_map = {
    1: 'auto',
    2: 'off'
}
fault_data['fan_ctrl'] = fault_data['fan_ctrl'].map(fan_ctrl_map)

In [49]:
fault_data

Unnamed: 0,datetime,fcu_ctrl,fan_ctrl,rm_temp,rmclgspt,rmhtgspt,fcu_mat,fcu_dat,fcu_rat,fcu_cvlv,...,fcu_dmpr_dm,fcu_spd,fcu_oat,fcu_wat,fcu_ma_humd,fcu_oa_humd,fcu_da_humd,fcu_ra_humd,fault_type,fault_intensity
0,2018-01-01 00:00:00,setback,auto,73.224,85.0,55.0,49.609,73.028,73.224,0.922,...,0.0,0.0,25.996,0.0,20.249,56.156,42.528,8.712,control_coolingreverse,na
1,2018-01-01 00:10:00,setback,auto,72.243,85.0,55.0,49.066,70.720,72.243,1.000,...,0.0,0.0,25.889,0.0,20.585,56.416,46.028,8.998,control_coolingreverse,na
2,2018-01-01 00:20:00,setback,auto,71.847,85.0,55.0,48.843,68.574,71.847,1.000,...,0.0,0.0,25.838,0.0,20.724,56.533,49.813,9.120,control_coolingreverse,na
3,2018-01-01 00:30:00,setback,auto,71.649,85.0,55.0,48.748,66.606,71.649,1.000,...,0.0,0.0,25.846,0.0,20.794,56.516,53.179,9.182,control_coolingreverse,na
4,2018-01-01 00:40:00,setback,auto,71.531,85.0,55.0,48.714,64.803,71.531,1.000,...,0.0,0.0,25.900,0.0,20.830,56.386,56.369,9.218,control_coolingreverse,na
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2575435,2018-12-31 23:10:00,setback,auto,61.119,85.0,55.0,35.429,119.940,61.119,0.000,...,0.0,0.0,9.741,0.0,100.000,56.711,10.210,100.000,vlvstuck_heating,80
2575436,2018-12-31 23:20:00,setback,auto,60.898,85.0,55.0,35.145,119.940,60.898,0.000,...,0.0,0.0,9.397,0.0,100.000,55.950,10.210,100.000,vlvstuck_heating,80
2575437,2018-12-31 23:30:00,setback,auto,60.705,85.0,55.0,34.816,119.940,60.705,0.000,...,0.0,0.0,8.925,0.0,100.000,55.368,10.210,100.000,vlvstuck_heating,80
2575438,2018-12-31 23:40:00,setback,auto,60.547,85.0,55.0,34.427,119.938,60.547,0.000,...,0.0,0.0,8.308,0.0,100.000,55.011,10.210,100.000,vlvstuck_heating,80


In [50]:
fault_data.describe()

Unnamed: 0,datetime,rm_temp,rmclgspt,rmhtgspt,fcu_mat,fcu_dat,fcu_rat,fcu_cvlv,fcu_cvlv_dm,fcu_clg_gpm,...,fcu_oa_cfm,fcu_dmpr,fcu_dmpr_dm,fcu_spd,fcu_oat,fcu_wat,fcu_ma_humd,fcu_oa_humd,fcu_da_humd,fcu_ra_humd
count,2575440,2575440.0,2575440.0,2575440.0,2575440.0,2575440.0,2575440.0,2575440.0,2575440.0,2575440.0,...,2575440.0,2575440.0,2575440.0,2575440.0,2575440.0,2575440.0,2575440.0,2575440.0,2575440.0,2575440.0
mean,2018-07-02 11:54:59.999997184,70.63071,80.26401,59.73599,62.94292,66.97278,70.62525,0.1443971,0.08284803,0.9633887,...,32.54562,0.1490973,0.1056959,3.891722,50.46472,1.775386,57.31652,64.85543,60.5657,50.08624
min,2018-01-01 00:00:00,14.0,72.0,55.0,-6.885,-7.366,-5.034,0.0,0.0,0.0,...,-85.394,0.0,0.0,0.0,-9.909,0.0,7.563,15.979,10.21,6.22
25%,2018-04-02 05:57:30,67.931,72.0,55.0,53.785,55.096,67.844,0.0,0.0,0.0,...,0.88,0.0,0.0,0.0,33.4335,0.0,42.764,52.892,45.421,36.842
50%,2018-07-02 11:55:00,71.901,85.0,55.0,65.772,63.226,71.872,0.0,0.0,0.0,...,0.88,0.0,0.0,0.0,51.711,0.0,57.606,65.937,59.531,48.065
75%,2018-10-01 17:52:30,74.756,85.0,68.0,72.508,71.093,74.992,0.167,0.0,0.79,...,31.6,0.3,0.3,9.75,69.8975,2.63,70.431,78.481,79.516,62.499
max,2018-12-31 23:50:00,103.477,85.0,68.0,103.466,119.96,103.477,1.0,1.0,5.72,...,5670.474,1.0,0.3,18.08,96.955,22.27,100.0,100.0,100.0,100.0
std,,7.94871,6.241067,6.241067,13.49292,20.16939,8.075227,0.2897609,0.2185449,1.831617,...,243.373,0.2165941,0.142996,5.520355,22.79752,3.843851,19.31104,17.57694,25.83409,18.15363


In [51]:
fault_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2575440 entries, 0 to 2575439
Data columns (total 32 columns):
 #   Column           Dtype         
---  ------           -----         
 0   datetime         datetime64[ns]
 1   fcu_ctrl         object        
 2   fan_ctrl         object        
 3   rm_temp          float64       
 4   rmclgspt         float64       
 5   rmhtgspt         float64       
 6   fcu_mat          float64       
 7   fcu_dat          float64       
 8   fcu_rat          float64       
 9   fcu_cvlv         float64       
 10  fcu_cvlv_dm      float64       
 11  fcu_clg_gpm      float64       
 12  fcu_clg_ewt      float64       
 13  fcu_clg_rwt      float64       
 14  fcu_hvlv         float64       
 15  fcu_hvlv_dm      float64       
 16  fcu_htg_gpm      float64       
 17  fcu_htg_ewt      float64       
 18  fcu_htg_rwt      float64       
 19  fcu_da_cfm       float64       
 20  fcu_oa_cfm       float64       
 21  fcu_dmpr         float64       

In [52]:
fault_data.isnull().sum()

datetime           0
fcu_ctrl           0
fan_ctrl           0
rm_temp            0
rmclgspt           0
rmhtgspt           0
fcu_mat            0
fcu_dat            0
fcu_rat            0
fcu_cvlv           0
fcu_cvlv_dm        0
fcu_clg_gpm        0
fcu_clg_ewt        0
fcu_clg_rwt        0
fcu_hvlv           0
fcu_hvlv_dm        0
fcu_htg_gpm        0
fcu_htg_ewt        0
fcu_htg_rwt        0
fcu_da_cfm         0
fcu_oa_cfm         0
fcu_dmpr           0
fcu_dmpr_dm        0
fcu_spd            0
fcu_oat            0
fcu_wat            0
fcu_ma_humd        0
fcu_oa_humd        0
fcu_da_humd        0
fcu_ra_humd        0
fault_type         0
fault_intensity    0
dtype: int64

In [53]:
categorical = list((fault_data.dtypes[fault_data.dtypes == object]).index)
numerical = list((fault_data.dtypes[fault_data.dtypes != object]).index)

for category in categorical:
    print('\n{} \n\n'.format(fault_data[category].value_counts()))


fcu_ctrl
setback     1636894
operate      907293
shutdown      31253
Name: count, dtype: int64 



fan_ctrl
auto    2544788
off       30652
Name: count, dtype: int64 



fault_type
vlvstuck_heating             262800
vlvstuck_cooling             262800
oadmprstuck                  262800
sensorbias_rmtemp            210240
fouling_heating_airside      157680
vlvleak_heating              157680
vlvleak_cooling              157680
oadmprleak                   157680
fouling_heating_waterside    157680
fouling_cooling_waterside    157680
fouling_cooling_airside      157680
filterrestriction            157680
control_heatingreverse        52560
oablockage                    52560
fault_free                    52560
fanoutletblockage             52560
control_unstable              52560
control_coolingreverse        52560
Name: count, dtype: int64 



fault_intensity
na          315360
80          315360
50          315360
20          262800
minor       210240
moderate    210240
severe    

In [54]:
def engineer_features(df):
    """
    Create engineered features that may help with fault detection
    """
    # Create time-based features
    if 'datetime' in df.columns:
        # df['datetime'] = pd.to_datetime(df['datetime'], format = '%m/%d/%Y_%H:%M')
        df['hour'] = df['datetime'].dt.hour
        df['day_of_week'] = df['datetime'].dt.dayofweek
        df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)

        # Drop the original timestamp as it's repeated across fault scenarios
        df = df.drop('datetime', axis=1)
    
    return df

In [55]:
fault_data = engineer_features(fault_data)

The scope of this project is to detect fault type only.

In [57]:
del fault_data['fault_intensity']

In [59]:
fault_type_map = {
    'fault_free': 0,
    'oadmprleak': 1,
    'oadmprstuck': 2,
    'vlvleak_heating': 3,
    'vlvstuck_heating': 4,
    'vlvleak_cooling': 5,
    'vlvstuck_cooling': 6,
    'sensorbias_rmtemp': 7,
    'fouling_cooling_airside': 8,
    'fouling_cooling_waterside': 9,
    'fouling_heating_airside': 10,
    'fouling_heating_waterside': 11,
    'filterrestriction': 12,
    'oablockage': 13,
    'fanoutletblockage': 14,
    'control_coolingreverse': 15,
    'control_heatingreverse': 16,
    'control_unstable': 17,
}
fault_data.fault_type = fault_data.fault_type.map(fault_type_map)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
df_full_train, df_test = train_test_split(fault_data, test_size = 0.15, random_state = 1)

In [7]:
df_train, df_val = train_test_split(df_full_train, test_size = 0.18, random_state = 1)

In [8]:
print(len(df_train))
print(len(df_val))
print(len(df_test))

1795081
394043
386316


In [9]:
df_train = df_train.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

In [10]:
y_train = df_train.fault_type.values
y_val = df_val.fault_type.values
y_test = df_test.fault_type.values

In [11]:
del df_train['fault_type']
del df_val['fault_type']
del df_test['fault_type']

In [12]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction import DictVectorizer

In [13]:
train_dicts = df_train.to_dict(orient='records')

In [14]:
dv = DictVectorizer(sparse=False)

In [15]:
X_train = dv.fit_transform(train_dicts)

In [16]:
val_dicts = df_val.to_dict(orient='records')

In [17]:
X_val = dv.transform(val_dicts)

## Model Selection

In [18]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, ConfusionMatrixDisplay, precision_score, recall_score, f1_score, roc_curve, roc_auc_score

### SGD Classifier

In [128]:
from sklearn.linear_model import SGDClassifier

In [129]:
sgd_clf = SGDClassifier()
sgd_clf.fit(X_train, y_train)

In [154]:
y_pred_sgd = sgd_clf.predict(X_val)

In [155]:
accuracy_score(y_val, y_pred_sgd)

0.3995985209735993

In [163]:
precision_score(y_val, y_pred_sgd, average=None)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


array([0.        , 0.11135857, 0.96603233, 0.88321812, 0.68304044,
       0.999832  , 0.76931711, 0.        , 0.        , 0.08543417,
       0.2747858 , 0.108595  , 0.88167939, 0.        , 0.12628195,
       0.51184834, 0.35747434, 0.06254125])

In [165]:
recall_score(y_val, y_pred_sgd, average=None)

array([0.        , 0.00417484, 0.70469698, 0.99445226, 0.64178251,
       0.98229833, 0.65119884, 0.        , 0.        , 0.00251671,
       0.05599435, 0.74702578, 0.02885937, 0.        , 0.66358367,
       0.01333992, 0.31030273, 0.14289305])

In [166]:
f1_score(y_val, y_pred_sgd, average=None)

array([0.        , 0.00804797, 0.81492563, 0.93554041, 0.66176904,
       0.99098762, 0.70534711, 0.        , 0.        , 0.00488939,
       0.09303129, 0.18962435, 0.05588935, 0.        , 0.21218454,
       0.02600217, 0.33222244, 0.0870031 ])

### kNN

In [135]:
from sklearn.neighbors import KNeighborsClassifier

In [136]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)

In [174]:
y_pred_knn = knn_clf.predict(X_val)

In [175]:
accuracy_score(y_val, y_pred_knn)

0.5691840738193548

In [176]:
precision_score(y_val, y_pred_knn, average = None)

array([0.04843606, 0.71809955, 0.78689252, 0.85639697, 0.71624516,
       0.65755666, 0.62371767, 0.99374174, 0.20796886, 0.20643586,
       0.2020913 , 0.19238745, 0.75852169, 0.57312482, 0.93918735,
       0.11685393, 0.08020833, 0.79034624])

In [177]:
recall_score(y_val, y_pred_knn, average = None)

array([0.04562407, 0.79505699, 0.66223007, 0.86242444, 0.74018352,
       0.6631731 , 0.67340051, 0.97482647, 0.21594934, 0.36974998,
       0.16469904, 0.2376074 , 0.51709491, 0.49686166, 0.84942656,
       0.03853755, 0.02819824, 0.67701395])

In [178]:
f1_score(y_val, y_pred_knn, average = None)

array([0.04698803, 0.75462128, 0.71919918, 0.85940014, 0.72801761,
       0.66035294, 0.6476076 , 0.98419323, 0.21188398, 0.26494804,
       0.1814892 , 0.21261967, 0.61496174, 0.53227542, 0.89205465,
       0.05796025, 0.04172688, 0.72930346])

In [187]:
fault_data.to_csv('fault_data_2.csv', index = False)

### Decision Tree

In [138]:
from sklearn.tree import DecisionTreeClassifier

In [139]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)

In [179]:
y_pred_dt = dt_clf.predict(X_val)

In [180]:
accuracy_score(y_val, y_pred_dt)

0.704577926774489

In [181]:
precision_score(y_val, y_pred_dt, average = None)

array([0.9913523 , 0.89278062, 0.85888972, 0.99941986, 0.70353771,
       0.99788776, 0.79352854, 0.99780193, 0.14584213, 0.5540724 ,
       0.12137799, 0.50331826, 0.82953291, 0.61549296, 0.94058652,
       0.19780518, 0.11393283, 0.99937178])

In [182]:
recall_score(y_val, y_pred_dt, average = None)

array([0.98334162, 0.85237757, 0.85448801, 0.99850956, 0.73389202,
       0.99418197, 0.79340951, 0.99429277, 0.1727428 , 0.60623814,
       0.11614566, 0.49186219, 0.71961021, 0.60343962, 0.91367616,
       0.18033597, 0.10021973, 0.99962297])

In [183]:
f1_score(y_val, y_pred_dt, average = None)

array([0.98733071, 0.8721114 , 0.85668321, 0.9989645 , 0.71839437,
       0.99603142, 0.79346902, 0.99604425, 0.15815675, 0.57898262,
       0.1187042 , 0.49752429, 0.77067166, 0.60940669, 0.92693607,
       0.18866705, 0.10663723, 0.99949736])

### Random Forest

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)

In [21]:
y_pred_rf = rf_clf.predict(X_val)

In [22]:
accuracy_score(y_val, y_pred_rf)

0.6913991620203886

In [23]:
precision_score(y_val, y_pred_rf, average = None)

array([0.9933665 , 0.90192749, 0.86536635, 0.99991712, 0.74676373,
       0.99983385, 0.80531408, 0.99966068, 0.0840447 , 0.51303555,
       0.08747027, 0.48879749, 0.75645348, 0.54362242, 0.97085031,
       0.18041106, 0.15098795, 0.99962307])

In [24]:
recall_score(y_val, y_pred_rf, average = None)

array([0.96805072, 0.88690352, 0.83886456, 0.99900638, 0.72444234,
       0.99323293, 0.77823337, 0.9997532 , 0.09024624, 0.58049344,
       0.09477885, 0.5596497 , 0.59796777, 0.52485564, 0.94055987,
       0.17564229, 0.14831543, 0.99987432])

In [25]:
f1_score(y_val, y_pred_rf, average = None)

array([0.98054524, 0.89435241, 0.8519094 , 0.99946154, 0.73543371,
       0.99652246, 0.79154217, 0.99970694, 0.08703514, 0.54468382,
       0.09097801, 0.52182956, 0.66793813, 0.53407422, 0.95546508,
       0.17799474, 0.14963976, 0.99974868])

### Gradient Boosting

In [26]:
from sklearn.ensemble import GradientBoostingClassifier

In [27]:
gb_clf = GradientBoostingClassifier()
gb_clf.fit(X_train, y_train)

In [28]:
y_pred_gb = gb_clf.predict(X_val)

In [29]:
accuracy_score(y_val, y_pred_gb)

0.6657141479483204

In [30]:
precision_score(y_val, y_pred_gb, average = None)

array([0.99471854, 0.38116627, 0.90869612, 0.99966806, 0.80645539,
       0.99991682, 0.83580637, 0.99676818, 0.17287671, 0.28478185,
       0.39578253, 0.27146257, 0.3514819 , 0.60680851, 0.67443741,
       0.46979454, 0.43645833, 0.99974868])

In [31]:
recall_score(y_val, y_pred_gb, average = None)

array([0.95997016, 0.60226276, 0.8329187 , 0.99747454, 0.85743416,
       0.99199505, 0.85906443, 0.98954188, 0.05258114, 0.30241769,
       0.36435816, 0.27325677, 0.6030067 , 0.53703239, 0.61721544,
       0.18922925, 0.05114746, 0.99987432])

In [32]:
f1_score(y_val, y_pred_gb, average = None)

array([0.97703549, 0.46686084, 0.86915888, 0.9985701 , 0.83116382,
       0.99594018, 0.84727582, 0.99314188, 0.0806364 , 0.29333493,
       0.3794208 , 0.27235672, 0.44410367, 0.56979222, 0.64455892,
       0.26978956, 0.09156469, 0.9998115 ])

### Neural Network (Multi Layer Perceptron Classifier)

In [192]:
from sklearn.neural_network import MLPClassifier

In [193]:
mlp_clf = MLPClassifier()
mlp_clf.fit(X_train, y_train)

In [194]:
y_pred_mlp = mlp_clf.predict(X_val)

In [195]:
accuracy_score(y_val, y_pred_mlp)

0.5912806470359834

In [197]:
precision_score(y_val, y_pred_mlp, average = None)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


array([0.        , 0.34267986, 0.98059602, 0.99475721, 0.94366964,
       0.98328759, 0.75984345, 0.47612891, 0.44062187, 0.29528011,
       0.15136667, 0.22634104, 0.31359135, 0.47804244, 0.85670039,
       0.51708931, 0.50196891, 0.99899244])

In [198]:
recall_score(y_val, y_pred_mlp, average = None)

array([0.        , 0.55103745, 0.80714499, 0.98977395, 0.68987641,
       0.99050959, 0.88826662, 0.62486503, 0.18303404, 0.20158429,
       0.17978883, 0.21439194, 0.49060925, 0.69281948, 0.51165372,
       0.05792984, 0.88696289, 0.99685811])

In [199]:
f1_score(y_val, y_pred_mlp, average = None)

array([0.        , 0.42257083, 0.88545618, 0.99225932, 0.79705794,
       0.98688538, 0.81905157, 0.5404504 , 0.25863236, 0.23959789,
       0.16435805, 0.22020451, 0.38261802, 0.56573215, 0.64067326,
       0.10418749, 0.64110822, 0.99792414])