#  Creating and applying a random projection
In this file we will create and apply a random projection. As in the associated paper, we define the size of the random projection to be the same as that of an existing featurisation method, but random projections can be as big or small as you'd like.

## First some imports and setting some variables

In [1]:
import pandas as pd
import json
import os
from utilities import featurise_data
import numpy as np

In [2]:
# Root folder of the case-study data and the JSON file describing each task.
data_folder = 'data/case_studies'
task_info = 'task_info.json'

# Parse the task descriptions; `tasks` is looked up by task name further down.
with open(task_info) as task_file:
    tasks = json.load(task_file)

In [3]:
# Which case study to read, and which featurisation's CBFV file sets the projection size.
task, featurisation = 'GFA', 'oliynyk'

## Now let's read in the data we are projecting

In [4]:
# Build the path to the featurised training CSV one directory level at a time.
path_parts = (
    data_folder,                        # where the data is
    'featurised',                       # whether we are investigating CBFVs or random projections
    tasks[task]['study_folder'],        # which study?
    '80_20_split',                      # 80_20_split or LOCO-CV?
    tasks[task]['type'],                # regression or classification?
    tasks[task]['task_folder'],         # which task?
    f'{featurisation}_train_CBFV.csv',  # which file?
)
data_file = os.path.join(*path_parts)

# Load the table; later cells use its 'formula' and 'target' columns.
df = pd.read_csv(data_file)

In [5]:
# Count the columns other than 'target' and 'formula'; that count is used as
# the width of the random projection.
n_dims = df.drop(columns=['target', 'formula']).shape[1]

# Double brackets keep `formulae` as a one-column DataFrame rather than a Series.
formulae = df.loc[:, ['formula']]

print(f'We will be making a projectioning {len(formulae)} compositions to a representation of size {n_dims}')

We will be making a projectioning 5051 compositions to a representation of size 176


## Now we get the CompVec representation, and create and apply the random projection

In [6]:
# Featurise the formulae in the 'compVec' style, then discard the 'formula'
# column so that it is not fed into the projection.
compVec = featurise_data(formulae, style='compVec').drop(columns='formula')

In [7]:
# Fix the seed so that Restart & Run All reproduces the same projection matrix
# (the original drew from NumPy's unseeded global state, so every run differed).
SEED = 0
rng = np.random.default_rng(SEED)

# Gaussian random projection matrix: one row per compVec column, one column per
# output dimension, entries drawn independently from a zero-mean normal.
# NOTE(review): `scale` is the standard deviation, so entries have standard
# deviation 1/n_dims (variance 1/n_dims**2) — confirm this is the scaling the
# paper intends rather than 1/sqrt(n_dims).
projection = rng.normal(loc=0, scale=1/n_dims, size=(compVec.shape[1], n_dims))

In [8]:
# Matrix-multiply the compVec features by the random matrix, mapping each row
# of compVec to n_dims projected coordinates.
projected = compVec.dot(projection)

## Now we have our projected data

In [9]:
# Bare last expression: the notebook renders the projected DataFrame as the cell output.
projected

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,166,167,168,169,170,171,172,173,174,175
0,0.009293,-0.002455,0.003958,-0.000486,0.010050,-0.009158,-0.003689,-0.001361,0.003349,-0.005590,...,-0.000150,0.005050,-0.005978,-0.004209,0.002962,-0.005419,0.006495,-0.013982,0.001611,-0.006171
1,0.005768,-0.001472,0.002845,0.000688,0.006425,-0.006328,-0.002470,-0.000527,0.001553,-0.001043,...,-0.002557,0.004978,-0.005952,-0.002137,0.002565,-0.004802,0.007849,-0.010283,-0.001220,-0.005590
2,-0.011044,0.001162,0.001711,-0.004702,0.009076,0.004452,0.003450,-0.002403,-0.001033,-0.001785,...,-0.002991,-0.004748,0.002972,0.000753,-0.000123,-0.003288,0.003918,-0.002182,-0.001074,0.002789
3,0.001301,0.005296,0.006077,-0.003180,-0.002557,-0.002640,-0.000643,-0.006853,-0.002225,0.003481,...,0.004980,0.005119,0.002560,-0.000811,-0.006599,-0.003856,-0.002903,-0.008734,0.006924,0.005121
4,-0.000291,0.001302,-0.003208,0.011607,-0.005300,-0.001072,0.007124,0.006697,0.009874,0.002954,...,0.005691,-0.006328,0.001832,0.003398,0.000425,0.000921,0.005170,-0.000845,0.003462,0.004108
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5046,0.000464,0.002843,0.007720,0.003297,0.000221,-0.002629,-0.001977,0.006127,0.001684,0.000783,...,-0.000677,0.005887,-0.004868,-0.006661,-0.006516,-0.006869,-0.003108,0.000080,-0.006514,-0.009923
5047,0.007809,-0.001259,0.002945,-0.000962,0.009295,-0.007597,-0.003798,-0.001090,0.002305,-0.006244,...,-0.000146,0.005227,-0.003762,-0.003036,0.002084,-0.004563,0.005005,-0.013178,0.001578,-0.006830
5048,-0.000524,0.000289,-0.002954,-0.001139,0.002231,0.007949,0.007741,0.003537,0.003825,0.003389,...,0.006323,0.009826,-0.009492,-0.001987,-0.000400,-0.008793,-0.010468,0.001384,-0.001454,0.014595
5049,0.006662,-0.001430,0.003740,-0.003137,0.005656,-0.001831,-0.000160,0.002813,0.000709,-0.002954,...,0.001014,0.002376,-0.006473,-0.004223,0.001323,-0.003772,0.003044,-0.005218,-0.001462,0.001920
