# Сравнение FisherS и cPCA

In [None]:
!pip3 install scikit-dimension

In [1]:
import numpy as np
import skdim 
from matplotlib import pyplot as plt
import estimators

In [48]:
def FisherS_n(data):
    n = np.zeros(20)
    al = np.zeros(20)
    count = 0
    for alpha in np.linspace(0.1, 1, 20):
        fisherS = estimators.FisherS()
        fisherS.fit(data, alpha=alpha)
        n[count] = fisherS.dimension_
        al[count] = alpha
        count += 1
    n[n == np.inf] = float('nan')
    inds = np.where(~np.isnan(n))[0]
    alpha_max = max(al[inds])
    alpha_ref = alpha_max * 0.9
    k = np.where(abs(al[inds]-alpha_ref)== min(abs(al-alpha_ref)))[0]
    alfa_single_estimate = al[inds[k]]
    n_single_estimate = n[inds[k]]
    
    return n_single_estimate[0]

def cPCA_n(data):
    mx = 0
    for k in np.linspace(20, 200, 10):
        cpca = estimators.cPCA()
        cpca.fit(data, noise=True, n_neighbors=int(k))
        mx = max(mx, cpca.dimension_)
    return mx

In [47]:
benchmark = skdim.datasets.BenchmarkManifolds(random_state=0)
dict_data = benchmark.generate()
truth = benchmark.truth
datasets = ['M1_Sphere','M2_Affine_3to5','M3_Nonlinear_4to6','M4_Nonlinear','M5a_Helix1d','M5b_Helix2d','M6_Nonlinear','M7_Roll','M8_Nonlinear','M9_Affine','M10a_Cubic','M10b_Cubic','M10c_Cubic','M10d_Cubic','M11_Moebius','M12_Norm','M13a_Scurve','M13b_Spiral','Mbeta','Mn1_Nonlinear','Mn2_Nonlinear','Mp1_Paraboloid','Mp2_Paraboloid','Mp3_Paraboloid']
truth

Unnamed: 0,Intrinsic Dimension,Number of variables,Description
M1_Sphere,10,11,10D sphere linearly embedded
M2_Affine_3to5,3,5,Affine space
M3_Nonlinear_4to6,4,6,"Concentrated figure, mistakable with a 3D one"
M4_Nonlinear,4,8,Nonlinear manifold
M5a_Helix1d,1,3,1D helix
M5b_Helix2d,2,3,2D helix
M6_Nonlinear,6,36,Nonlinear manifold
M7_Roll,2,3,Swiss Roll
M8_Nonlinear,12,72,Nonlinear (highly curved) manifold
M9_Affine,20,20,Affine space


### Тестирование на данных без шума

In [None]:
ans = []
for name in datasets[-1:]:
    cur = []
    cur.append(name)
    cur.append(truth['Intrinsic Dimension'][name])
    cur.append(truth['Number of variables'][name])
    cur.append(truth['Description'][name])
    n1 = FisherS_n(dict_data[name])
    n2 = cPCA_n(dict_data[name])
    cur.append(n1)
    cur.append(n2)
    print(' ', end='')
    print(truth['Intrinsic Dimension'][name], n1, n2)
    ans.append(cur)

In [59]:
for it in ans:
    full_ans.append(it)

In [64]:
full_ans

[['M1_Sphere',
  10,
  11,
  '10D sphere linearly embedded',
  11.012724866466037,
  8.951219512195122],
 ['M2_Affine_3to5', 3, 5, 'Affine space', 2.6684403344421574, 2.0],
 ['M3_Nonlinear_4to6',
  4,
  6,
  'Concentrated figure, mistakable with a 3D one',
  2.827756360419468,
  3.5476190476190474],
 ['M4_Nonlinear',
  4,
  8,
  'Nonlinear manifold',
  5.917228355465528,
  5.0476190476190474],
 ['M5b_Helix2d', 2, 3, '2D helix', 2.685367544867946, 2.0],
 ['M6_Nonlinear',
  6,
  36,
  'Nonlinear manifold',
  8.563358568964365,
  10.31578947368421],
 ['M7_Roll', 2, 3, 'Swiss Roll', 2.8918529551551084, 1.6666666666666667],
 ['M8_Nonlinear',
  12,
  72,
  'Nonlinear (highly curved) manifold',
  17.86258345351405,
  20.988235294117647],
 ['M9_Affine', 20, 20, 'Affine space', 18.652370706426296, 14.919540229885058],
 ['M10a_Cubic', 10, 11, '10D hypercube', 10.374142520010693, 8.45],
 ['M10b_Cubic', 17, 18, '17D hypercube', 16.97885664535799, 13.63013698630137],
 ['M10c_Cubic',
  24,
  25,
  '

In [73]:
for it in full_ans:
    print(it[0], len(dict_data[it[0]]), it[1], it[2], round(it[4], 2), round(it[5], 2), sep=" & ", end="\\\\\n")

M1_Sphere & 2500 & 10 & 11 & 11.01 & 8.95\\
M2_Affine_3to5 & 2500 & 3 & 5 & 2.67 & 2.0\\
M3_Nonlinear_4to6 & 2500 & 4 & 6 & 2.83 & 3.55\\
M4_Nonlinear & 2500 & 4 & 8 & 5.92 & 5.05\\
M5b_Helix2d & 2500 & 2 & 3 & 2.69 & 2.0\\
M6_Nonlinear & 2500 & 6 & 36 & 8.56 & 10.32\\
M7_Roll & 2500 & 2 & 3 & 2.89 & 1.67\\
M8_Nonlinear & 2500 & 12 & 72 & 17.86 & 20.99\\
M9_Affine & 2500 & 20 & 20 & 18.65 & 14.92\\
M10a_Cubic & 2500 & 10 & 11 & 10.37 & 8.45\\
M10b_Cubic & 2500 & 17 & 18 & 16.98 & 13.63\\
M10c_Cubic & 2500 & 24 & 25 & 23.46 & 18.87\\
M10d_Cubic & 2500 & 70 & 71 & 69.42 & 48.66\\
M11_Moebius & 2500 & 2 & 3 & 1.98 & 2.0\\
M12_Norm & 2500 & 20 & 20 & 19.97 & 15.02\\
M13a_Scurve & 2500 & 2 & 3 & 2.42 & 1.0\\
Mbeta & 2500 & 10 & 40 & 5.34 & 9.32\\
Mn1_Nonlinear & 2500 & 18 & 72 & 16.65 & 18.0\\
Mn2_Nonlinear & 2500 & 24 & 96 & 22.18 & 24.0\\
Mp1_Paraboloid & 2500 & 3 & 12 & 0.92 & 2.0\\


### Считаем Mean%error для данных без шума 

In [74]:
sumCPCA = 0
sumFISHERS = 0
for it in full_ans:
    sumCPCA += abs(it[2] - it[5]) / it[2]
    sumFISHERS += abs(it[2] - it[4]) / it[2]
sumCPCA *= (100 / len(full_ans))
sumFISHERS *= (100 / len(full_ans))
print("sumCPCA = ", sumCPCA)
print("sumFISHERS = ", sumFISHERS)

sumCPCA =  47.00835783658234
sumFISHERS =  35.19803769417695


### Считаем для выборок со слабым шумом $\sigma = 0.05$

In [75]:
benchmark = skdim.datasets.BenchmarkManifolds(random_state=0)
dict_data = benchmark.generate(noise=0.05)
names = []
for it in full_ans:
    names.append(it[0])

In [76]:
ans = []
for name in names:
    cur = []
    cur.append(name)
    cur.append(truth['Intrinsic Dimension'][name])
    cur.append(truth['Number of variables'][name])
    cur.append(truth['Description'][name])
    n1 = FisherS_n(dict_data[name])
    n2 = cPCA_n(dict_data[name])
    cur.append(n1)
    cur.append(n2)
    print(' ', end='')
    print(truth['Intrinsic Dimension'][name], n1, n2)
    ans.append(cur)

1111111111 10 10.980291570559249 9.0
1111111111 3 2.6681642717173535 2.0
1111111111 4 2.829055638736338 3.5813953488372094
1111111111 4 5.92266258779184 5.048780487804878
1111111111 2 2.685660281042703 2.0
1111111111 6 8.570221655464495 10.345454545454546
1111111111 2 2.8916759522632285 1.6666666666666667
1111111111 12 17.94523123847294 21.0
1111111111 20 18.677908174152037 14.939759036144578
1111111111 10 10.353560669269397 8.394366197183098
1111111111 17 17.008050149643285 13.569444444444445
1111111111 24 23.507807211704545 18.88888888888889
1111111111 70 69.42138224513377 48.70967741935484
1111111111 2 1.9825919043923022 2.0
1111111111 20 19.939813290164143 14.991596638655462
1111111111 2 2.4161455675506183 1.0
1111111111 10 5.365715860642667 9.564516129032258
1111111111 18 16.574152766103175 18.0
1111111111 24 22.2828496060299 24.0
1111111111 3 0.9150987306409786 2.540084388185654


In [77]:
for it in ans:
    print(it[0], len(dict_data[it[0]]), it[1], it[2], round(it[4], 2), round(it[5], 2), sep=" & ", end="\\\\\n")

M1_Sphere & 2500 & 10 & 11 & 10.98 & 9.0\\
M2_Affine_3to5 & 2500 & 3 & 5 & 2.67 & 2.0\\
M3_Nonlinear_4to6 & 2500 & 4 & 6 & 2.83 & 3.58\\
M4_Nonlinear & 2500 & 4 & 8 & 5.92 & 5.05\\
M5b_Helix2d & 2500 & 2 & 3 & 2.69 & 2.0\\
M6_Nonlinear & 2500 & 6 & 36 & 8.57 & 10.35\\
M7_Roll & 2500 & 2 & 3 & 2.89 & 1.67\\
M8_Nonlinear & 2500 & 12 & 72 & 17.95 & 21.0\\
M9_Affine & 2500 & 20 & 20 & 18.68 & 14.94\\
M10a_Cubic & 2500 & 10 & 11 & 10.35 & 8.39\\
M10b_Cubic & 2500 & 17 & 18 & 17.01 & 13.57\\
M10c_Cubic & 2500 & 24 & 25 & 23.51 & 18.89\\
M10d_Cubic & 2500 & 70 & 71 & 69.42 & 48.71\\
M11_Moebius & 2500 & 2 & 3 & 1.98 & 2.0\\
M12_Norm & 2500 & 20 & 20 & 19.94 & 14.99\\
M13a_Scurve & 2500 & 2 & 3 & 2.42 & 1.0\\
Mbeta & 2500 & 10 & 40 & 5.37 & 9.56\\
Mn1_Nonlinear & 2500 & 18 & 72 & 16.57 & 18.0\\
Mn2_Nonlinear & 2500 & 24 & 96 & 22.28 & 24.0\\
Mp1_Paraboloid & 2500 & 3 & 12 & 0.92 & 2.54\\


### Считаем Mean%error для данных со слабым шумом

In [78]:
sumCPCA = 0
sumFISHERS = 0
for it in ans:
    sumCPCA += abs(it[2] - it[5]) / it[2]
    sumFISHERS += abs(it[2] - it[4]) / it[2]
sumCPCA *= (100 / len(full_ans))
sumFISHERS *= (100 / len(full_ans))
print("sumCPCA = ", sumCPCA)
print("sumFISHERS = ", sumFISHERS)

sumCPCA =  46.73307404581416
sumFISHERS =  35.17929154208292


### Считаем для выборок с сильным шумом $\sigma = 0.5$

In [80]:
benchmark = skdim.datasets.BenchmarkManifolds(random_state=0)
dict_data = benchmark.generate(noise=0.5)

In [81]:
ans3 = []
for name in names:
    cur = []
    cur.append(name)
    cur.append(truth['Intrinsic Dimension'][name])
    cur.append(truth['Number of variables'][name])
    cur.append(truth['Description'][name])
    n1 = FisherS_n(dict_data[name])
    n2 = cPCA_n(dict_data[name])
    cur.append(n1)
    cur.append(n2)
    print(' ', end='')
    print(truth['Intrinsic Dimension'][name], n1, n2)
    ans3.append(cur)

1111111111 10 10.992676799155806 8.676923076923076
1111111111 3 2.669304992020083 3.0040983606557377
1111111111 4 2.970869208083913 4.189542483660131
1111111111 4 6.698124754129309 5.32
1111111111 2 2.6853025032460622 2.0
1111111111 6 9.065491544199551 15.292817679558011
1111111111 2 2.8919267147759067 1.6296296296296295
1111111111 12 18.373980026116318 28.912408759124087
1111111111 20 18.79674760702822 14.964285714285714
1111111111 10 10.644455634329049 8.212121212121213
1111111111 17 17.494665058932938 14.0
1111111111 24 25.303074474605033 18.694736842105264
1111111111 70 70.34360328922374 48.625
1111111111 2 2.3038790965504266 2.0
1111111111 20 20.119889163234813 14.99056603773585
1111111111 2 2.4366884856694107 2.0
1111111111 10 15.45038913046026 24.785714285714285
1111111111 18 18.72991429014559 37.316326530612244
1111111111 24 25.45564416684319 49.62637362637363
1111111111 3 0.9157564676556983 7.901408450704225


In [82]:
for it in ans3:
    print(it[0], len(dict_data[it[0]]), it[1], it[2], round(it[4], 2), round(it[5], 2), sep=" & ", end="\\\\\n")

M1_Sphere & 2500 & 10 & 11 & 10.99 & 8.68\\
M2_Affine_3to5 & 2500 & 3 & 5 & 2.67 & 3.0\\
M3_Nonlinear_4to6 & 2500 & 4 & 6 & 2.97 & 4.19\\
M4_Nonlinear & 2500 & 4 & 8 & 6.7 & 5.32\\
M5b_Helix2d & 2500 & 2 & 3 & 2.69 & 2.0\\
M6_Nonlinear & 2500 & 6 & 36 & 9.07 & 15.29\\
M7_Roll & 2500 & 2 & 3 & 2.89 & 1.63\\
M8_Nonlinear & 2500 & 12 & 72 & 18.37 & 28.91\\
M9_Affine & 2500 & 20 & 20 & 18.8 & 14.96\\
M10a_Cubic & 2500 & 10 & 11 & 10.64 & 8.21\\
M10b_Cubic & 2500 & 17 & 18 & 17.49 & 14.0\\
M10c_Cubic & 2500 & 24 & 25 & 25.3 & 18.69\\
M10d_Cubic & 2500 & 70 & 71 & 70.34 & 48.62\\
M11_Moebius & 2500 & 2 & 3 & 2.3 & 2.0\\
M12_Norm & 2500 & 20 & 20 & 20.12 & 14.99\\
M13a_Scurve & 2500 & 2 & 3 & 2.44 & 2.0\\
Mbeta & 2500 & 10 & 40 & 15.45 & 24.79\\
Mn1_Nonlinear & 2500 & 18 & 72 & 18.73 & 37.32\\
Mn2_Nonlinear & 2500 & 24 & 96 & 25.46 & 49.63\\
Mp1_Paraboloid & 2500 & 3 & 12 & 0.92 & 7.9\\


### Считаем Mean%error для данных с сильным шумом

In [84]:
sumCPCA = 0
sumFISHERS = 0
for it in ans3:
    sumCPCA += abs(it[2] - it[5]) / it[2]
    sumFISHERS += abs(it[2] - it[4]) / it[2]
sumCPCA *= (100 / len(full_ans))
sumFISHERS *= (100 / len(full_ans))
print("sumCPCA = ", sumCPCA)
print("sumFISHERS = ", sumFISHERS)

sumCPCA =  35.54748861991684
sumFISHERS =  31.740816708706348
