In [1]:
import numpy as np
import pandas as pd
from statistics import mean
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [2]:
# Set the path to the Excel sheets containing UMAP test set accuracies.
# There is one workbook per dataset, and each workbook is read sheet by sheet
# (one sheet per number-of-neighbours value) in the cells below.
# The paths are relative, so they resolve against the current working directory.
mushroom_path = 'assets/UMAP/Mushroom.xlsx'
digits_path = 'assets/UMAP/Digits.xlsx'
forest_path = 'assets/UMAP/Forest.xlsx'
car_path = 'assets/UMAP/Car.xlsx'

In [3]:
# Set the variable for the number of neighbours used for each experiment.
# The values are kept as strings because they are also passed as the sheet
# names when reading the workbooks; they are converted to integers later,
# when they are added to the dataframes as a regression variable.
n_neighbours = ['3', '5', '10', '25', '35', '45']

In [4]:
# Create a dictionary containing number of neighbours as the key and the test set
# accuracy for UMAP with that number of neighbours as the value.
# Passing the whole list of sheet names makes read_excel return that dictionary
# directly (keys are the sheet names, i.e. the strings in n_neighbours), so the
# workbook is opened and parsed once instead of once per sheet.
mushroom_dataframes = pd.read_excel(mushroom_path, sheet_name=n_neighbours)

In [5]:
# Test dictionary by viewing UMAP results with number of neighbours = 45
# (the key is a string because the dictionary is keyed by sheet name)
mushroom_dataframes['45']

Unnamed: 0,Dimensionality,"Accuracy [Z, ZZ]",Accuracy [‘ZZ’],Accuracy [‘Z’],"Accuracy [Y, YY]",Accuracy RBF
0,2,0.85,0.62,0.79,0.62,0.79
1,3,0.81,0.68,0.79,0.4,0.85
2,4,0.87,0.87,0.79,0.32,0.91
3,5,0.74,0.77,0.96,0.49,0.96
4,6,0.83,0.87,0.85,0.51,0.91
5,7,0.83,0.89,0.96,0.66,0.96


In [6]:
# Create a dictionary containing number of neighbours as the key and the test set
# accuracy for UMAP with that number of neighbours as the value.
# Passing the whole list of sheet names makes read_excel return that dictionary
# directly (keys are the sheet names, i.e. the strings in n_neighbours), so the
# workbook is opened and parsed once instead of once per sheet.
digits_dataframes = pd.read_excel(digits_path, sheet_name=n_neighbours)

In [7]:
# Create a dictionary containing number of neighbours as the key and the test set
# accuracy for UMAP with that number of neighbours as the value.
# Passing the whole list of sheet names makes read_excel return that dictionary
# directly (keys are the sheet names, i.e. the strings in n_neighbours), so the
# workbook is opened and parsed once instead of once per sheet.
forest_dataframes = pd.read_excel(forest_path, sheet_name=n_neighbours)

In [8]:
# Create a dictionary containing number of neighbours as the key and the test set
# accuracy for UMAP with that number of neighbours as the value.
# Passing the whole list of sheet names makes read_excel return that dictionary
# directly (keys are the sheet names, i.e. the strings in n_neighbours), so the
# workbook is opened and parsed once instead of once per sheet.
car_dataframes = pd.read_excel(car_path, sheet_name=n_neighbours)

## Regression Analysis

#### Car Evaluation Dataset

In [9]:
# Iterate over the dataframes containing the UMAP results for each different
# number of neighbours value
for neighbours, df in car_dataframes.items():
    # Remove nan row (the last row of each sheet) for concat.
    # The frames are modified in place, so the drop must only happen the first
    # time this cell runs; otherwise every re-run would delete one more real
    # row of results. The 'Neighbours' column is added just below, so its
    # presence marks a frame that has already been trimmed.
    if 'Neighbours' not in df.columns:
        df.drop(df.tail(1).index, inplace=True)
    # Set the neighbours value to be an integer instead of a string for regression
    df['Neighbours'] = int(neighbours)

# Concat the dataframes for different number of neighbours values into one
# dataframe (rows stacked, index renumbered from 0)
reg_df_car = pd.concat(car_dataframes.values(), ignore_index=True, axis=0)
reg_df_car

Unnamed: 0,Dimensionality,"Accuracy [Z, ZZ]",Accuracy [‘ZZ’],Accuracy [‘Z’],"Accuracy [Y, YY]",Accuracy RBF,Neighbours
0,2,0.71,0.57,0.69,0.64,0.71,3
1,3,0.76,0.71,0.55,0.6,0.79,3
2,4,0.71,0.71,0.67,0.57,0.71,3
3,5,0.71,0.74,0.64,0.71,0.64,3
4,6,0.74,0.79,0.74,0.76,0.73,3
5,2,0.64,0.55,0.74,0.62,0.79,5
6,3,0.76,0.76,0.74,0.62,0.83,5
7,4,0.67,0.76,0.69,0.64,0.71,5
8,5,0.81,0.71,0.81,0.73,0.81,5
9,6,0.81,0.81,0.76,0.67,0.76,5


In [10]:
# Keep only the accuracy columns: everything except the two regressors
fm_columns = [name for name in reg_df_car.columns
              if name not in ('Dimensionality', 'Neighbours')]

# The design matrix (constant, dimensionality, neighbours) is the same for
# every fit, so build it once
x = sm.add_constant(reg_df_car[['Dimensionality', 'Neighbours']].to_numpy())

# Fit one regression per accuracy column (the Pauli feature maps and RBF)
for column in fm_columns:

    # Dependent variable: the accuracy column scaled by 100, as a column vector
    y = reg_df_car[column].to_numpy().reshape(-1, 1) * 100

    # Run regression equation described in dissertation document
    smodel = sm.OLS(y, x)
    results = smodel.fit()
    print(column)
    print(results.summary())

Accuracy [Z, ZZ]
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.612
Model:                            OLS   Adj. R-squared:                  0.583
Method:                 Least Squares   F-statistic:                     21.25
Date:                Sun, 21 Apr 2024   Prob (F-statistic):           2.86e-06
Time:                        15:21:53   Log-Likelihood:                -102.92
No. Observations:                  30   AIC:                             211.8
Df Residuals:                      27   BIC:                             216.0
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         73.4451      4.706   

#### Mushroom Dataset

In [11]:
# Tag each per-neighbours dataframe with its neighbours value, converting the
# string dictionary key to an integer so it can be used in the regression
for n_key in mushroom_dataframes:
    mushroom_dataframes[n_key]['Neighbours'] = int(n_key)

# Stack the dataframes for the different neighbours values into one dataframe
# with a fresh 0..N-1 index
reg_df_mushroom = pd.concat(list(mushroom_dataframes.values()), axis=0, ignore_index=True)
reg_df_mushroom

Unnamed: 0,Dimensionality,"Accuracy [Z, ZZ]",Accuracy [‘ZZ’],Accuracy [‘Z’],"Accuracy [Y, YY]",Accuracy RBF,Neighbours
0,2,0.98,0.85,0.89,0.68,0.94,3
1,3,0.98,0.98,0.94,0.94,0.98,3
2,4,0.94,0.94,0.89,0.94,0.94,3
3,5,0.98,0.98,0.94,0.98,0.94,3
4,6,0.98,0.98,0.94,0.98,0.94,3
5,7,0.98,0.98,0.94,0.98,0.94,3
6,2,0.89,0.68,0.79,0.77,0.85,5
7,3,0.98,0.87,0.96,0.83,0.98,5
8,4,0.98,0.98,0.98,0.98,0.98,5
9,5,0.98,0.98,0.98,0.98,0.98,5


In [12]:
# Keep only the accuracy columns: everything except the two regressors
fm_columns = [name for name in reg_df_mushroom.columns
              if name not in ('Dimensionality', 'Neighbours')]

# The design matrix (constant, dimensionality, neighbours) is the same for
# every fit, so build it once
x = sm.add_constant(reg_df_mushroom[['Dimensionality', 'Neighbours']].to_numpy())

# Fit one regression per accuracy column (the Pauli feature maps and RBF)
for column in fm_columns:

    # Dependent variable: the accuracy column scaled by 100, as a column vector
    y = reg_df_mushroom[column].to_numpy().reshape(-1, 1) * 100

    # Run regression equation described in dissertation document
    smodel = sm.OLS(y, x)
    results = smodel.fit()
    print('\n', column)
    print(results.summary())


 Accuracy [Z, ZZ]
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.710
Model:                            OLS   Adj. R-squared:                  0.693
Method:                 Least Squares   F-statistic:                     40.42
Date:                Sun, 21 Apr 2024   Prob (F-statistic):           1.34e-09
Time:                        15:21:53   Log-Likelihood:                -100.25
No. Observations:                  36   AIC:                             206.5
Df Residuals:                      33   BIC:                             211.2
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         91.9477      2.118 

#### Digits Dataset

In [13]:
# Tag each per-neighbours dataframe with its neighbours value, converting the
# string dictionary key to an integer so it can be used in the regression
for n_key in digits_dataframes:
    digits_dataframes[n_key]['Neighbours'] = int(n_key)

# Stack the dataframes for the different neighbours values into one dataframe
# with a fresh 0..N-1 index
reg_df_digits = pd.concat(list(digits_dataframes.values()), axis=0, ignore_index=True)
reg_df_digits

Unnamed: 0,Dimensionality,"Accuracy [Z, ZZ]",Accuracy [‘ZZ’],Accuracy [‘Z’],"Accuracy [Y, YY]",Accuracy RBF,Neighbours
0,2,0.91,0.67,0.89,0.59,0.91,3
1,3,0.97,0.81,0.91,0.78,0.93,3
2,4,0.97,0.97,0.9,0.86,0.93,3
3,5,0.97,0.97,0.93,0.93,0.97,3
4,6,0.97,0.97,0.93,0.97,0.97,3
5,7,0.97,0.97,0.97,0.97,0.97,3
6,2,0.9,0.76,0.97,0.43,0.98,5
7,3,0.95,0.76,0.98,0.6,0.98,5
8,4,0.98,0.88,0.98,0.72,0.98,5
9,5,0.98,0.93,0.98,0.78,0.98,5


In [14]:
# Keep only the accuracy columns: everything except the two regressors
fm_columns = [name for name in reg_df_digits.columns
              if name not in ('Dimensionality', 'Neighbours')]

# The design matrix (constant, dimensionality, neighbours) is the same for
# every fit, so build it once
x = sm.add_constant(reg_df_digits[['Dimensionality', 'Neighbours']].to_numpy())

# Fit one regression per accuracy column (the Pauli feature maps and RBF)
for column in fm_columns:

    # Dependent variable: the accuracy column scaled by 100, as a column vector
    y = reg_df_digits[column].to_numpy().reshape(-1, 1) * 100

    # Run regression equation described in dissertation document
    smodel = sm.OLS(y, x)
    results = smodel.fit()
    print('\n', column)
    print(results.summary())


 Accuracy [Z, ZZ]
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.736
Model:                            OLS   Adj. R-squared:                  0.720
Method:                 Least Squares   F-statistic:                     46.00
Date:                Sun, 21 Apr 2024   Prob (F-statistic):           2.86e-10
Time:                        15:21:53   Log-Likelihood:                -109.22
No. Observations:                  36   AIC:                             224.4
Df Residuals:                      33   BIC:                             229.2
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         96.5674      2.717 

#### Forest Covertype Dataset

In [15]:
# Tag each per-neighbours dataframe with its neighbours value, converting the
# string dictionary key to an integer so it can be used in the regression
for n_key in forest_dataframes:
    forest_dataframes[n_key]['Neighbours'] = int(n_key)

# Stack the dataframes for the different neighbours values into one dataframe
# with a fresh 0..N-1 index
reg_df_forest = pd.concat(list(forest_dataframes.values()), axis=0, ignore_index=True)
reg_df_forest

Unnamed: 0,Dimensionality,"Accuracy [Z, ZZ]",Accuracy [‘ZZ’],Accuracy [‘Z’],"Accuracy [Y, YY]",Accuracy RBF,Neighbours
0,2,0.66,0.42,0.71,0.54,0.66,3
1,3,0.65,0.68,0.66,0.48,0.66,3
2,4,0.65,0.6,0.6,0.69,0.69,3
3,5,0.68,0.69,0.66,0.71,0.68,3
4,6,0.68,0.68,0.68,0.65,0.66,3
5,7,0.65,0.66,0.63,0.68,0.63,3
6,2,0.69,0.58,0.68,0.52,0.63,5
7,3,0.65,0.66,0.6,0.62,0.65,5
8,4,0.68,0.66,0.68,0.68,0.69,5
9,5,0.66,0.68,0.65,0.66,0.69,5


In [16]:
# Keep only the accuracy columns: everything except the two regressors
fm_columns = [name for name in reg_df_forest.columns
              if name not in ('Dimensionality', 'Neighbours')]

# The design matrix (constant, dimensionality, neighbours) is the same for
# every fit, so build it once
x = sm.add_constant(reg_df_forest[['Dimensionality', 'Neighbours']].to_numpy())

# Fit one regression per accuracy column (the Pauli feature maps and RBF)
for column in fm_columns:

    # Dependent variable: the accuracy column scaled by 100, as a column vector
    y = reg_df_forest[column].to_numpy().reshape(-1, 1) * 100

    # Run regression equation described in dissertation document
    smodel = sm.OLS(y, x)
    results = smodel.fit()
    print('\n', column)
    print(results.summary())


 Accuracy [Z, ZZ]
                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.239
Model:                            OLS   Adj. R-squared:                  0.192
Method:                 Least Squares   F-statistic:                     5.171
Date:                Sun, 21 Apr 2024   Prob (F-statistic):             0.0111
Time:                        15:21:53   Log-Likelihood:                -102.49
No. Observations:                  36   AIC:                             211.0
Df Residuals:                      33   BIC:                             215.7
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         64.9596      2.254 