In [175]:
# Data Processing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sys

# Modelling
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from scipy.stats import randint

In [176]:
# Sweep resolution: 200 evenly spaced values from 10 to 2000, each divided
# by 20000, i.e. fractions running from 0.0005 up to 0.1.
start_values = np.linspace(10, 2000, 200) / 20000

In [178]:
# Inputs and outputs of the experiment.

# File locations: training array, testing array, and the CSV that receives the results.
training_file_path = '/content/output.npy'
test_file_path = '/content/test_noisy_IR.npy'
output_path = '/content/out.csv'

# Fractions swept by the random-forest loop; currently the fine grid built above.
# Alternative coarse grid (disabled):
# [0.01, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]
threshold = start_values

Read in files and find top thousand frequencies

In [158]:
# Load both arrays, then rescale each row by its own maximum.
training_data_all = np.load(training_file_path)
testing_data_all = np.load(test_file_path)

# Row 0 is deliberately skipped (the range starts at 1); every other row is
# divided by its largest value and written back into the same array.
for dataset in (training_data_all, testing_data_all):
  for row_idx in range(1, dataset.shape[0]):
    row_peak = np.max(dataset[row_idx])
    dataset[row_idx] = dataset[row_idx] / row_peak

In [159]:
# Training and testing arrays must have the same number of columns;
# stop with "Invalid input" otherwise.
columns_match = training_data_all.shape[1] == testing_data_all.shape[1]
if not columns_match:
  sys.exit("Invalid input")

In [160]:
# Grand repositories
# Column labels: one 'frequency_<i>' per column of the input array, followed by
# the class-label column 'name'.
frequency_names = [f'frequency_{fn}' for fn in range(training_data_all.shape[1])]
frequency_names.append('name')


def _build_grand_frame(data_all, column_names, n_classes=10):
  """Build the per-sample feature table from a raw 2-D array.

  For every row after the first, the columns are ranked by that row's values
  (ascending argsort) and the entries of row 0 are laid out in that order.
  A trailing label column holds ``sample_index // (n_samples / n_classes)``,
  i.e. consecutive samples are grouped into ``n_classes`` blocks numbered
  from 0 (stored as floats).

  Parameters
  ----------
  data_all : 2-D numpy array; row 0 is the lookup row, rows 1.. are samples.
  column_names : list of column labels, one per input column plus the label.
  n_classes : number of consecutive label groups (default 10).

  Returns
  -------
  pandas.DataFrame with one row per sample, indexed 0..n_samples-1.
  """
  n_samples = data_all.shape[0] - 1
  if n_samples < 1:
    # Nothing but (at most) the lookup row: return an empty, correctly labelled frame.
    return pd.DataFrame(columns = column_names)

  # NOTE(review): argsort is ascending, so the leading columns correspond to the
  # SMALLEST values in each sample row. If the intent is to keep the strongest
  # responses first, the order should be reversed -- confirm before changing.
  order = np.argsort(data_all[1:], axis = 1)
  arranged = data_all[0][order]  # fancy indexing: shape (n_samples, n_columns)

  divisor = n_samples / n_classes
  labels = [sample // divisor for sample in range(n_samples)]

  # Assemble everything once; growing a DataFrame row by row with .loc
  # re-allocates the whole frame on each insertion.
  return pd.DataFrame(np.column_stack([arranged, labels]), columns = column_names)


# Build grand training
grand_train = _build_grand_frame(training_data_all, frequency_names)

# Build grand testing
grand_test = _build_grand_frame(testing_data_all, frequency_names)

In [184]:
### Begin random forest tests
# For each threshold, train a default random forest and a randomized-search-tuned
# random forest on the leading fraction of the feature columns, and record both
# test-set accuracies.

# Fixed seed so that the reported accuracies are reproducible between runs.
RANDOM_SEED = 0

target_train = grand_train['name']
target_test = grand_test['name']
output_frame = pd.DataFrame(columns = ['threshold', 'random_forest', 'adv_random_forest'])

# Search space for the tuned forest; it does not depend on the threshold,
# so it is defined once outside the loop.
param_dist = {'n_estimators': randint(50, 500),
              'max_depth': randint(1, 20)}
n_frequencies = training_data_all.shape[1]

for s, fraction in enumerate(threshold):
  # Number of leading feature columns admitted at this threshold.
  n_admissible = int(fraction * n_frequencies)
  if n_admissible < 1:
    # No column is admitted, so a forest cannot be fitted; record the
    # threshold with undefined accuracies instead of aborting the sweep.
    output_frame.loc[s] = [fraction, np.nan, np.nan]
    continue
  admissible_columns = frequency_names[:n_admissible]

  features_train = grand_train[admissible_columns]
  features_test = grand_test[admissible_columns]

  # Perform first random forest test (default hyper-parameters)
  rf0 = RandomForestClassifier(random_state = RANDOM_SEED)
  rf0.fit(features_train, target_train)
  target_pred0 = rf0.predict(features_test)
  accuracy0 = accuracy_score(target_test, target_pred0)

  # Perform enhanced random forest (5 sampled candidates, 5-fold CV)
  rf1 = RandomForestClassifier(random_state = RANDOM_SEED)
  rand_search = RandomizedSearchCV(rf1,
                                   param_distributions = param_dist,
                                   n_iter = 5,
                                   cv = 5,
                                   random_state = RANDOM_SEED)
  rand_search.fit(features_train, target_train)
  best_rf = rand_search.best_estimator_
  target_pred1 = best_rf.predict(features_test)
  accuracy1 = accuracy_score(target_test, target_pred1)

  # Rows are appended as each threshold finishes, so partial results remain
  # available in output_frame if the sweep is interrupted.
  output_frame.loc[s] = [fraction, accuracy0, accuracy1]

In [179]:
# Write the per-threshold accuracy table to output_path as CSV, without the index column.
output_frame.to_csv(output_path, index = False)