In [39]:
import pandas as pd

In [40]:
df = pd.read_csv("vegemite.csv")

In [41]:
df = df.sample(frac=1).reset_index(drop=True)

In [42]:
# Count class occurrences and determine minimum class size
class_counts = df['Class'].value_counts()
min_class_size = min(class_counts) 
samples_per_class = min(300, min_class_size)  # Ensure at least 300 samples per class

# Randomly sample from each class
df_sampled = df.groupby('Class').apply(lambda x: x.sample(n=samples_per_class, random_state=42)).reset_index(drop=True)

# Adjust the number of samples to 1000
if len(df_sampled) < 1000:
    additional_samples = df.sample(n=1000 - len(df_sampled), random_state=42)
    df_sampled = pd.concat([df_sampled, additional_samples]).reset_index(drop=True)
elif len(df_sampled) > 1000:
    df_sampled = df_sampled.sample(n=1000, random_state=42).reset_index(drop=True)

# Check the number of sample perclass in 1000 sampled data point
df_sampled["Class"].value_counts()

Class
2    347
1    339
0    314
Name: count, dtype: int64

In [43]:
# Separate the remaining data points
df_train = df[~df.index.isin(df_sampled.index)]

# 1. Remove constant value columns
constant_columns = df_train.columns[df_train.nunique() == 1]
df_train = df_train.drop(columns=constant_columns)
print(f"Removed {len(constant_columns)} constant columns")

Removed 2 constant columns


In [44]:
constant_columns

Index(['TFE Steam temperature SP', 'TFE Product out temperature'], dtype='object')

In [45]:
# 2. Convert columns with few integer values to categorical
int_columns = df_train.select_dtypes(include=['int64']).columns
for col in int_columns:
    if df_train[col].nunique() < 10:  # You can adjust this threshold
        df_train[col] = df_train[col].astype('category')
        print(f"Converted {col} to categorical")

Converted Class to categorical


In [46]:
# 3. Check class balance
class_distribution = df_train['Class'].value_counts(normalize=True)
print("Class distribution:")
print(class_distribution)

# use class weights to handle imbalance distribution
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

# Sort the classes before computing class weights
classes = np.array(sorted(df_train['Class'].unique()))
class_weights = compute_class_weight('balanced', classes=classes, y=df_train['Class'])
class_weight_dict = dict(zip(classes, class_weights))
print("Class weights:", class_weight_dict)

Class distribution:
Class
2    0.495540
1    0.332022
0    0.172438
Name: proportion, dtype: float64
Class weights: {0: 1.93306177868296, 1: 1.0039489457725126, 2: 0.6726671391448146}


In [47]:
# 4. Explore and add composite features
df_train['FFTE_ratio'] = df_train['FFTE Feed tank level SP'] / df_train['FFTE Production solids SP']
print("Added composite feature: FFTE_ratio")

Added composite feature: FFTE_ratio


In [48]:
# 5. Count final number of features
num_features = len(df_train.columns) - 1  # Subtract 1 to exclude the target variable
print(f"Final number of features: {num_features}")

Final number of features: 45


In [49]:
# Display the first few rows and info of the prepared dataset
print(df_train.columns)
print(df_train.info())

Index(['FFTE Feed tank level SP', 'FFTE Production solids SP',
       'FFTE Steam pressure SP', 'TFE Out flow SP', 'TFE Production solids SP',
       'TFE Vacuum pressure SP', 'TFE Steam pressure SP', 'FFTE Feed flow SP',
       'FFTE Out steam temp SP', 'Extract tank Level',
       'Extract tank Out flow PV', 'FFTE Discharge density',
       'FFTE Discharge solids', 'FFTE Feed flow rate PV',
       'FFTE Feed tank level PV', 'FFTE Heat temperature 1',
       'FFTE Heat temperature 2', 'FFTE Heat temperature 3',
       'FFTE Out steam temp PV', 'FFTE Production solids PV', 'FFTE Pump 1',
       'FFTE Pump 1 - 2', 'FFTE Pump 2', 'FFTE Steam pressure PV',
       'FFTE Temperature 1 - 1', 'FFTE Temperature 1 - 2',
       'FFTE Temperature 2 - 1', 'FFTE Temperature 2 - 2',
       'FFTE Temperature 3 - 1', 'FFTE Temperature 3 - 2',
       'FFTE Unk Temperature', 'TFE Feed pump', 'TFE Input flow PV',
       'TFE Level', 'TFE Motor current', 'TFE Motor speed', 'TFE Out flow PV',
       'TFE P

In [50]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score

# Split dataset into features and target
X = df_train.drop(columns=['Class'])
y = df_train['Class']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Random Forest Classifier
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Apply RFE to select top n features
n_features_to_select = 10  # Adjust based on your requirement
rfe = RFE(estimator=rf, n_features_to_select=n_features_to_select)
rfe.fit(X_train, y_train)

# Transform training and testing sets
X_train_rfe = rfe.transform(X_train)
X_test_rfe = rfe.transform(X_test)

# Fit the model using selected features
rf.fit(X_train_rfe, y_train)

# Predict and evaluate the model
y_pred = rf.predict(X_test_rfe)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy with selected features: {accuracy:.2f}")

# Print selected features
selected_features = X.columns[rfe.support_]
print("Selected features:", selected_features)

Model accuracy with selected features: 1.00
Selected features: Index(['FFTE Steam pressure SP', 'TFE Out flow SP', 'FFTE Feed flow SP',
       'Extract tank Out flow PV', 'FFTE Discharge density',
       'FFTE Feed flow rate PV', 'FFTE Heat temperature 1',
       'FFTE Temperature 3 - 1', 'FFTE Temperature 3 - 2',
       'TFE Production solids density'],
      dtype='object')


In [51]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, confusion_matrix
import warnings
warnings.filterwarnings("ignore")

# Initialize models
models = {
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Support Vector Machine": SVC(random_state=42),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB()
}



# Train and evaluate models
for name, model in models.items():
    print(f"\nModel: {name}")
    
    # Train the model
    model.fit(X_train_rfe, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test_rfe)
    
    # Classification report
    print("Classification Report:")
    report = classification_report(y_test, y_pred, output_dict=True)
    
    # Convert classification report to a pandas DataFrame
    report_df = pd.DataFrame(report).transpose()
    display(report_df)
    
    # Confusion matrix
    matrix = confusion_matrix(y_test, y_pred)
    
    # Get unique labels from y_test to dynamically create column and index names
    labels = sorted(list(set(y_test)))
    
    print("Confusion Matrix:")
    display(pd.DataFrame(matrix, columns=[f"Predicted {label}" for label in labels],
                       index=[f"Actual {label}" for label in labels]))



Model: Decision Tree
Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.977597,0.981595,0.979592,489.0
1,0.972603,0.980871,0.97672,941.0
2,0.990057,0.983075,0.986553,1418.0
accuracy,0.982093,0.982093,0.982093,0.982093
macro avg,0.980085,0.981847,0.980955,2848.0
weighted avg,0.98215,0.982093,0.982109,2848.0


Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,480,7,2
Actual 1,6,923,12
Actual 2,5,19,1394



Model: Random Forest
Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.99591,0.99591,0.99591,489.0
1,0.995763,0.998937,0.997347,941.0
2,0.998587,0.996474,0.997529,1418.0
accuracy,0.997191,0.997191,0.997191,0.997191
macro avg,0.996753,0.997107,0.996929,2848.0
weighted avg,0.997194,0.997191,0.997191,2848.0


Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,487,0,2
Actual 1,1,940,0
Actual 2,1,4,1413



Model: Logistic Regression
Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,489.0
1,0.372738,0.328374,0.349153,941.0
2,0.553739,0.788434,0.650567,1418.0
accuracy,0.501053,0.501053,0.501053,0.501053
macro avg,0.308826,0.372269,0.33324,2848.0
weighted avg,0.398859,0.501053,0.439276,2848.0


Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,0,220,269
Actual 1,0,309,632
Actual 2,0,300,1118



Model: Support Vector Machine
Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.0,0.0,0.0,489.0
1,1.0,0.001063,0.002123,941.0
2,0.498068,1.0,0.664947,1418.0
accuracy,0.498244,0.498244,0.498244,0.498244
macro avg,0.499356,0.333688,0.222357,2848.0
weighted avg,0.578392,0.498244,0.331774,2848.0


Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,0,0,489
Actual 1,0,1,940
Actual 2,0,0,1418



Model: K-Nearest Neighbors
Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.881764,0.899796,0.890688,489.0
1,0.897275,0.909671,0.90343,941.0
2,0.939068,0.923836,0.93139,1418.0
accuracy,0.915028,0.915028,0.915028,0.915028
macro avg,0.906035,0.911101,0.908503,2848.0
weighted avg,0.91542,0.915028,0.915163,2848.0


Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,440,21,28
Actual 1,28,856,57
Actual 2,31,77,1310



Model: Naive Bayes
Classification Report:


Unnamed: 0,precision,recall,f1-score,support
0,0.320823,0.541922,0.403042,489.0
1,0.424196,0.294368,0.347553,941.0
2,0.641344,0.619182,0.630068,1418.0
accuracy,0.498596,0.498596,0.498596,0.498596
macro avg,0.462121,0.485157,0.460221,2848.0
weighted avg,0.514564,0.498596,0.497743,2848.0


Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,265,96,128
Actual 1,301,277,363
Actual 2,260,280,878


In [52]:

# Train and evaluate models
results = []
for name, model in models.items():
    model.fit(X_train_rfe, y_train)
    y_pred = model.predict(X_test_rfe)
    report = classification_report(y_test, y_pred, output_dict=True)
    matrix = confusion_matrix(y_test, y_pred)
    results.append({
        "Model": name,
        "Accuracy": report["accuracy"],
        "Precision": report["weighted avg"]["precision"],
        "Recall": report["weighted avg"]["recall"],
        "F1-Score": report["weighted avg"]["f1-score"],
        "Confusion Matrix": matrix
    })

# Create comparison table
comparison_table = pd.DataFrame(results)
display(comparison_table)

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score,Confusion Matrix
0,Decision Tree,0.982093,0.98215,0.982093,0.982109,"[[480, 7, 2], [6, 923, 12], [5, 19, 1394]]"
1,Random Forest,0.997191,0.997194,0.997191,0.997191,"[[487, 0, 2], [1, 940, 0], [1, 4, 1413]]"
2,Logistic Regression,0.501053,0.398859,0.501053,0.439276,"[[0, 220, 269], [0, 309, 632], [0, 300, 1118]]"
3,Support Vector Machine,0.498244,0.578392,0.498244,0.331774,"[[0, 0, 489], [0, 1, 940], [0, 0, 1418]]"
4,K-Nearest Neighbors,0.915028,0.91542,0.915028,0.915163,"[[440, 21, 28], [28, 856, 57], [31, 77, 1310]]"
5,Naive Bayes,0.498596,0.514564,0.498596,0.497743,"[[265, 96, 128], [301, 277, 363], [260, 280, 8..."


In [53]:
import joblib

# Save the best model (Random Forest)
joblib.dump(rf, 'best_model_random_forest.pkl')
print("Best model saved as 'best_model_random_forest.pkl'")

Best model saved as 'best_model_random_forest.pkl'


In [54]:
df_unseen = df_sampled  # take the 1000 rows that we have not used

In [55]:
best_model = joblib.load('best_model_random_forest.pkl')

In [56]:
# Preprocess the data to convert columns in each row in the format of the training feature set
def preprocess(df):
    df['FFTE_ratio'] = df['FFTE Feed tank level SP'] / df['FFTE Production solids SP']
    int_columns = df.select_dtypes(include=['int64']).columns
    constant_columns = df.columns[df.nunique() == 1]
    df = df.drop(columns=constant_columns)
    for col in int_columns:
        if df[col].nunique() < 10:
            df[col] = df[col].astype('category')
    return df

df_unseen = preprocess(df_unseen)
X_unseen = df_unseen.drop(columns=['Class'])
y_unseen = df_unseen['Class']
X_unseen = rfe.transform(X_unseen)

In [57]:
y_pred_unseen = best_model.predict(X_unseen)

In [58]:
# Predict the classes for the unseen data
y_pred_unseen = best_model.predict(X_unseen)

# Compare the predictions with the original labels
comparison_df = pd.DataFrame({'Original': y_unseen, 'Predicted': y_pred_unseen})
display(comparison_df.sample(n=5))

# Calculate accuracy
accuracy_unseen = accuracy_score(y_unseen, y_pred_unseen)
print(f"Accuracy on unseen data: {accuracy_unseen:.2f}")

Unnamed: 0,Original,Predicted
256,0,0
656,2,2
747,2,2
925,1,1
669,2,2


Accuracy on unseen data: 1.00


In [59]:
from sklearn.metrics import classification_report, confusion_matrix

report = classification_report(y_unseen, y_pred_unseen)
matrix = confusion_matrix(y_unseen, y_pred_unseen)
print("Classification Report:\n", report)

# Confusion matrix
matrix = confusion_matrix(y_test, y_pred)
labels = sorted(list(set(y_unseen)))

print("Confusion Matrix:")
display(pd.DataFrame(matrix, columns=[f"Predicted {label}" for label in labels],
                    index=[f"Actual {label}" for label in labels]))

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.99      1.00       314
           1       1.00      1.00      1.00       339
           2       1.00      1.00      1.00       347

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000

Confusion Matrix:


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,265,96,128
Actual 1,301,277,363
Actual 2,260,280,878


In [60]:
from sklearn.metrics import classification_report, confusion_matrix
import pandas as pd

# Store results in a list
results = []

# Evaluate each model
for name, model in models.items():
    # Make predictions on unseen data
    y_pred_unseen = model.predict(X_unseen)
    
    # Get classification report
    report = classification_report(y_unseen, y_pred_unseen, output_dict=True)
    
    # Get confusion matrix
    matrix = confusion_matrix(y_unseen, y_pred_unseen)
    
    # Append the relevant metrics to the results list
    results.append({
        "Model": name,
        "Accuracy": report["accuracy"],
        "Precision": report["weighted avg"]["precision"],
        "Recall": report["weighted avg"]["recall"],
        "F1-Score": report["weighted avg"]["f1-score"],
        "Confusion Matrix": matrix  # Store matrix separately for later display
    })

# Create comparison table from the results
comparison_table = pd.DataFrame(results)

# Display the comparison table with metrics (without confusion matrices)
print("Comparison of other model on test data:")
display(comparison_table.drop(columns="Confusion Matrix"))

# Optionally, display confusion matrices separately
print("\nConfusion Matrices:")
for idx, row in comparison_table.iterrows():
    print(f"\nModel: {row['Model']}")
    matrix = row["Confusion Matrix"]
    
    # Get unique labels from y_unseen to dynamically create column and index names
    labels = sorted(list(set(y_unseen)))
    
    # Display the confusion matrix
    display(pd.DataFrame(matrix, columns=[f"Predicted {label}" for label in labels],
                         index=[f"Actual {label}" for label in labels]))


Comparison of other model on test data:


Unnamed: 0,Model,Accuracy,Precision,Recall,F1-Score
0,Decision Tree,0.994,0.994009,0.994,0.994
1,Random Forest,0.998,0.998006,0.998,0.997998
2,Logistic Regression,0.362,0.240466,0.362,0.280257
3,Support Vector Machine,0.347,0.120409,0.347,0.178781
4,K-Nearest Neighbors,0.933,0.933456,0.933,0.932988
5,Naive Bayes,0.479,0.469861,0.479,0.465367



Confusion Matrices:

Model: Decision Tree


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,313,1,0
Actual 1,1,337,1
Actual 2,1,2,344



Model: Random Forest


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,312,1,1
Actual 1,0,339,0
Actual 2,0,0,347



Model: Logistic Regression


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,0,131,183
Actual 1,0,101,238
Actual 2,0,86,261



Model: Support Vector Machine


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,0,0,314
Actual 1,0,0,339
Actual 2,0,0,347



Model: K-Nearest Neighbors


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,291,12,11
Actual 1,10,310,19
Actual 2,3,12,332



Model: Naive Bayes


Unnamed: 0,Predicted 0,Predicted 1,Predicted 2
Actual 0,172,61,81
Actual 1,106,90,143
Actual 2,67,63,217


In [61]:
# Filter columns ending with 'SP'
sp_columns = [col for col in df.columns if col.endswith('SP')]
X_sp = df[sp_columns]
y = df['Class']

# Split data into training and testing sets
X_train_sp, X_test_sp, y_train_sp, y_test_sp = train_test_split(X_sp, y, test_size=0.2, random_state=42)

In [62]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import export_text

# Initialize and train Decision Tree Classifier
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_sp, y_train_sp)

# Predict and evaluate the model
y_pred_sp = dt.predict(X_test_sp)
report_sp = classification_report(y_test_sp, y_pred_sp)
matrix_sp = confusion_matrix(y_test_sp, y_pred_sp)
print("Classification Report:\n", report_sp)
print("Confusion Matrix:\n", matrix_sp)

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.88      0.90       573
           1       0.88      0.90      0.89       984
           2       0.93      0.94      0.93      1491

    accuracy                           0.91      3048
   macro avg       0.91      0.90      0.91      3048
weighted avg       0.91      0.91      0.91      3048

Confusion Matrix:
 [[ 505   35   33]
 [  29  881   74]
 [  16   80 1395]]


In [63]:
# Print the decision tree
tree_rules = export_text(dt, feature_names=sp_columns)
print(tree_rules)

|--- TFE Out flow SP <= 2249.11
|   |--- FFTE Steam pressure SP <= 142.56
|   |   |--- FFTE Steam pressure SP <= 119.98
|   |   |   |--- TFE Out flow SP <= 2100.70
|   |   |   |   |--- TFE Vacuum pressure SP <= -67.99
|   |   |   |   |   |--- FFTE Feed flow SP <= 9395.00
|   |   |   |   |   |   |--- TFE Production solids SP <= 64.25
|   |   |   |   |   |   |   |--- TFE Vacuum pressure SP <= -79.94
|   |   |   |   |   |   |   |   |--- FFTE Steam pressure SP <= 102.00
|   |   |   |   |   |   |   |   |   |--- TFE Production solids SP <= 59.00
|   |   |   |   |   |   |   |   |   |   |--- class: 2
|   |   |   |   |   |   |   |   |   |--- TFE Production solids SP >  59.00
|   |   |   |   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |   |   |--- FFTE Steam pressure SP >  102.00
|   |   |   |   |   |   |   |   |   |--- TFE Steam pressure SP <= 122.50
|   |   |   |   |   |   |   |   |   |   |--- FFTE Steam pressure SP <= 112.50
|   |   |   |   |   |   |   |   |   |   |   |--- c