In [None]:
### Import relevant packages

# Standard imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random

# Imports specific to random forest analysis
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint
from sklearn.tree import export_graphviz
from IPython.display import Image
import graphviz

In [None]:
# Define function to determine if an item is a member of a set
def inset(element, collection):
  """Report whether `element` equals at least one item of `collection`.

  The items are visited in iteration order and compared with `==`; the scan
  stops at the first match.

  Returns True when a matching item is found, otherwise False (which is
  also the result for an empty `collection`).
  """
  return any(element == candidate for candidate in collection)

In [None]:
### Assemble a data frame with all IR data

# Molecule to number key
# C2H2   0
# CH4    1
# NH3    2

# Load in and label IR data.
# For each molecule: load the stored array, take element-wise absolute values
# (the "*_normalized" arrays are exactly np.abs of the input, nothing else),
# and append one extra trailing column holding the molecule's numeric label.
c2h2_unprocessed = np.load('/content/C2H2_Data_6_27.npy')
c2h2_normalized = np.abs(c2h2_unprocessed)
c2h2_names = np.full(shape = (c2h2_normalized.shape[0], 1), fill_value = 0)
c2h2_verified = np.concatenate((c2h2_normalized, c2h2_names), axis = 1)

ch4_unprocessed = np.load('/content/CH4_Data_6_27.npy')
ch4_normalized = np.abs(ch4_unprocessed)
ch4_names = np.full(shape = (ch4_normalized.shape[0], 1), fill_value = 1)
ch4_verified = np.concatenate((ch4_normalized, ch4_names), axis = 1)

nh3_unprocessed = np.load('/content/NH3_Data_6_27.npy')
nh3_normalized = np.abs(nh3_unprocessed)
nh3_names = np.full(shape = (nh3_normalized.shape[0], 1), fill_value = 2)
nh3_verified = np.concatenate((nh3_normalized, nh3_names), axis = 1)

# Prepare Pandas dataframe
# One column per channel plus a trailing 'molecule' label column.  The channel
# count is read from the loaded data instead of being hard-coded, which keeps
# it in step with the channel sampling done later from c2h2_normalized.shape[1].
channel_count = c2h2_normalized.shape[1]
column_names_wo_name = [f'channel_{i}' for i in range(channel_count)]
column_names = column_names_wo_name + ['molecule']

# Number of leading rows taken from each molecule.  They land in full_data as
# rows 0-99 (C2H2), 100-200 (CH4) and 201-299 (NH3); the experiment cell
# samples from exactly these row ranges.
# NOTE(review): 100 / 101 / 99 presumably mirror the sizes of the data files - confirm.
row_blocks = (
  (c2h2_verified, 100),
  (ch4_verified, 101),
  (nh3_verified, 99),
)
for labelled, rows_used in row_blocks:
  # Slicing would silently shorten a block, so fail loudly when data is missing
  if labelled.shape[0] < rows_used:
    raise IndexError(f'expected at least {rows_used} rows, found {labelled.shape[0]}')

# Stack all blocks once; growing the frame one row at a time with .loc copies
# the whole (rows x channels) table on every insertion.
full_data = pd.DataFrame(
  np.concatenate([labelled[:rows_used] for labelled, rows_used in row_blocks], axis = 0),
  columns = column_names,
)

# Kept for compatibility: the row counter ends at the total number of rows
progress = len(full_data)

In [None]:
# Determine key parameters
channel_quantity = [3]
training_data_size_per_molecule = [3]   #, 5, 7, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90, 95]
channel_samples = 1     # Each sample contains channel_quantity channels, but different channels in different samples
row_samples = 1         # Each sample contains training_data_size_per_molecule rows, but different rows in different samples

# Row labels of full_data that hold each molecule's spectra.  Stated once here
# so the sampling and the train/test split cannot drift apart.
c2h2_rows = range(0, 100)
ch4_rows = range(100, 201)
nh3_rows = range(201, 300)

# One result row is recorded per (channel sample, row sample) combination
output = pd.DataFrame(columns = ['channel_quantity', 'channel_sample', 'channels_used', 'training_size', 'row_sample', 'rows_used', 'accuracy'])
progress = 0

for cq in channel_quantity:
  for t in range(channel_samples):
    # For each iteration of the loop, select the cq channels that will be examined
    sample_key = np.sort(random.sample(range(c2h2_normalized.shape[1]), cq))
    channel_names_wo_label = [f'channel_{k}' for k in sample_key]
    channel_names_w_label = channel_names_wo_label + ['molecule']
    selected_channels = full_data[channel_names_w_label]

    for td in training_data_size_per_molecule:
      for u in range(row_samples):
        # For each iteration of the loop, select the td rows that will be examined for each molecule
        c2h2_samples = np.sort(random.sample(c2h2_rows, td))
        ch4_samples = np.sort(random.sample(ch4_rows, td))
        nh3_samples = np.sort(random.sample(nh3_rows, td))

        # Sampled rows form the training set and every other row the test set.
        # Each sample is sorted and the molecule ranges are disjoint and
        # ascending, so both frames keep ascending row order; the index is
        # renumbered from 0.
        training_index = np.concatenate((c2h2_samples, ch4_samples, nh3_samples))
        selected_rows = selected_channels.loc[training_index].reset_index(drop = True)
        deselected_rows = selected_channels.drop(index = training_index).reset_index(drop = True)

        # Split into feature and target columns
        feature_train = selected_rows.drop('molecule', axis = 1)
        target_train = selected_rows['molecule']

        feature_test = deselected_rows.drop('molecule', axis = 1)
        target_test = deselected_rows['molecule']

        # Use the random forest to make predictions
        # (no seed is set, so repeated runs give different forests and samples)
        rf = RandomForestClassifier()
        rf.fit(feature_train, target_train)
        predictions = rf.predict(feature_test)
        accuracy = accuracy_score(target_test, predictions)

        # Record the results; they are written to an output file afterwards.
        # The row is appended immediately so that results gathered so far
        # survive an interrupted run.
        output.loc[progress] = [
          cq,
          t,
          sample_key,
          td,
          u,
          [c2h2_samples, ch4_samples, nh3_samples],
          accuracy,
        ]
        progress = progress + 1

In [None]:
# Save the collected results as a CSV file (without the row index) so they can be downloaded
csv_destination = '/content/output.csv'
output.to_csv(csv_destination, index = False)