In [2]:
import pandas as pd
import numpy as np

# Data preparation

## Read and shuffle data

In [5]:
df = pd.read_csv('vegemite.csv')

df = df.sample(n=len(df)).reset_index(drop=True)

df.to_csv('vegemite_shuffled.csv', index=False)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15237 entries, 0 to 15236
Data columns (total 47 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   FFTE Feed tank level SP        15237 non-null  float64
 1   FFTE Production solids SP      15237 non-null  float64
 2   FFTE Steam pressure SP         15237 non-null  float64
 3   TFE Out flow SP                15237 non-null  float64
 4   TFE Production solids SP       15237 non-null  float64
 5   TFE Vacuum pressure SP         15237 non-null  float64
 6   TFE Steam pressure SP          15237 non-null  float64
 7   TFE Steam temperature SP       15237 non-null  float64
 8   FFTE Feed flow SP              15237 non-null  float64
 9   FFTE Out steam temp SP         15237 non-null  float64
 10  Extract tank Level             15237 non-null  float64
 11  Extract tank Out flow PV       15237 non-null  float64
 12  FFTE Discharge density         15237 non-null 

## Create training set with 1002 data points

In [24]:
# Define the number of samples per class for the 1000 data points
samples_per_class = 334  # Adjust this depending on your class distribution

# Group by class and sample 1000 data points (e.g., 300 per class)
sampled_df = df.groupby('Class').apply(lambda x: x.sample(samples_per_class)).reset_index(drop=True)

# Remove the sampled 1000 data points from the original dataset to create the training set
df = df.drop(sampled_df.index).reset_index(drop=True)

print(f"Number of rows in the test/validation set: {sampled_df.shape[0]}")
print(f"Number of rows in the training set: {df.shape[0]}")

df.info()

Number of rows in the test/validation set: 1002
Number of rows in the training set: 13335
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13335 entries, 0 to 13334
Data columns (total 45 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   FFTE Feed tank level SP        13335 non-null  float64
 1   FFTE Production solids SP      13335 non-null  float64
 2   FFTE Steam pressure SP         13335 non-null  float64
 3   TFE Out flow SP                13335 non-null  float64
 4   TFE Production solids SP       13335 non-null  float64
 5   TFE Vacuum pressure SP         13335 non-null  float64
 6   TFE Steam pressure SP          13335 non-null  float64
 7   FFTE Feed flow SP              13335 non-null  float64
 8   FFTE Out steam temp SP         13335 non-null  float64
 9   Extract tank Level             13335 non-null  float64
 10  Extract tank Out flow PV       13335 non-null  float64
 11  FFTE Discharge d

  sampled_df = df.groupby('Class').apply(lambda x: x.sample(samples_per_class)).reset_index(drop=True)


## Remove columns with constants

In [20]:
constant_columns = [col for col in df.columns if df[col].nunique() == 1]

# Remove those columns
df = df.drop(columns=constant_columns)

print("Columns removed:", constant_columns)

Columns removed: ['TFE Steam temperature SP', 'TFE Product out temperature']


In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14337 entries, 0 to 14336
Data columns (total 45 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   FFTE Feed tank level SP        14337 non-null  float64
 1   FFTE Production solids SP      14337 non-null  float64
 2   FFTE Steam pressure SP         14337 non-null  float64
 3   TFE Out flow SP                14337 non-null  float64
 4   TFE Production solids SP       14337 non-null  float64
 5   TFE Vacuum pressure SP         14337 non-null  float64
 6   TFE Steam pressure SP          14337 non-null  float64
 7   FFTE Feed flow SP              14337 non-null  float64
 8   FFTE Out steam temp SP         14337 non-null  float64
 9   Extract tank Level             14337 non-null  float64
 10  Extract tank Out flow PV       14337 non-null  float64
 11  FFTE Discharge density         14337 non-null  float64
 12  FFTE Discharge solids          14337 non-null 

In [14]:
# Define the threshold for "few" unique values (e.g., 10 or fewer)
threshold = 200

# Identify integer columns with few unique values
columns_to_convert = [col for col in sampled_df.columns if pd.api.types.is_integer_dtype(df[col]) and sampled_df[col].nunique() <= threshold]

columns_to_convert  

['Class']

## Check class distritbution

In [23]:
class_distribution = df['Class'].value_counts()

print("Class Distribution:")
print(class_distribution)

Class Distribution:
Class
2    7088
1    4763
0    2486
Name: count, dtype: int64
