In [1]:
import numpy as np
import numpy.linalg as la
import csv
import pandas as pd

def read_file(filename):
    """Load a numeric CSV file into a NumPy matrix.

    The first record of the file is treated as a header and discarded.
    Every remaining non-empty record is converted field by field with
    ``float``.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file.  Fields are comma separated and ``|`` is the
        quote character.

    Returns
    -------
    numpy.matrix
        One row per data record, one column per field.

    Raises
    ------
    StopIteration
        If the file is completely empty (there is no header to skip).
    ValueError
        If a field cannot be parsed by ``float``.
    """
    with open(filename, newline = '') as file:
        csv_reader = csv.reader(file, delimiter=',', quotechar='|')
        next(csv_reader)  # discard the header record
        # csv.reader yields an empty list for a blank line.  Skipping those
        # keeps a stray (typically trailing) blank line from producing a
        # ragged row list that cannot be turned into a numeric matrix.
        mat = [[float(entry) for entry in line] for line in csv_reader if line]
    return np.matrix(mat)

# iris.csv is kept on its own as iris_0; the five data*.csv files are read in
# the order I..V, collected in `irises`, and also bound to iris_1..iris_5.
iris_0 = read_file('iris.csv')
irises = [
    read_file(csv_name)
    for csv_name in ('dataI.csv', 'dataII.csv', 'dataIII.csv', 'dataIV.csv', 'dataV.csv')
]
iris_1, iris_2, iris_3, iris_4, iris_5 = irises

In [2]:
def reconstruct_self(data, n_comp = 4):
    """Reconstruct `data` from its own leading principal components.

    The data is centred on its column means, projected onto the `n_comp`
    eigenvectors of its covariance matrix that have the largest eigenvalues,
    mapped back to the original coordinates, and the mean is added back.

    Parameters
    ----------
    data : array-like, one row per item and one column per feature
        Works with both ``numpy.ndarray`` and ``numpy.matrix`` inputs.
    n_comp : int, optional
        Number of leading principal components to keep (default 4).  With
        ``n_comp = 0`` every row of the result equals the column means.

    Returns
    -------
    result : same shape as `data`
        The reconstructed data.
    eig_val : numpy.ndarray
        Eigenvalues of the covariance matrix, in decreasing order.
    eig_vec : numpy.ndarray
        Matching eigenvectors, one per column, in the same order.
    """
    mean = np.mean(data, axis = 0)
    data_cntr = data - mean
    # np.cov returns a 0-d array when there is a single feature; promote it
    # so the eigen-decomposition always receives a square 2-D matrix.
    cov = np.atleast_2d(np.cov(data_cntr.T))
    # A covariance matrix is symmetric, so eigh is the right solver: it
    # always returns real eigenvalues and orthonormal eigenvectors.  The
    # general-purpose eig gives no such guarantee, which matters when the
    # matrix is singular or has repeated eigenvalues.
    eig_val, eig_vec = la.eigh(cov)
    # eigh reports eigenvalues in ascending order; flip to descending so the
    # leading columns are the principal components.
    idx = eig_val.argsort()[::-1]
    eig_val = eig_val[idx]
    eig_vec = eig_vec[:,idx]
    pc = eig_vec[:, 0:n_comp]
    # Project onto the kept components, map back, then undo the centring.
    result = (pc@(pc.T@data_cntr.T)).T + mean
    return result, eig_val, eig_vec

In [3]:
def reconstruct_orig(data_0, data_1, n_comp = 4):
    """Reconstruct `data_1` using the principal components of `data_0`.

    The principal directions are the eigenvectors of the covariance matrix
    of `data_0`, ordered by decreasing eigenvalue.  `data_1` is centred on
    its own column means, projected onto the first `n_comp` of those
    directions, mapped back, and its own mean is added back.

    NOTE(review): only the covariance of `data_0` is used; the centring and
    the final offset use the mean of `data_1`, not of `data_0`.  Confirm
    that this is the intended definition.

    Parameters
    ----------
    data_0 : array-like, one row per item and one column per feature
        Dataset that supplies the principal components.
    data_1 : array-like with the same number of columns as `data_0`
        Dataset to be reconstructed.
    n_comp : int, optional
        Number of leading principal components to keep (default 4).

    Returns
    -------
    Same shape as `data_1`
        The reconstruction of `data_1`.
    """
    mean_0 = np.mean(data_0, axis = 0)
    data_0_cntr = data_0 - mean_0
    mean_1 = np.mean(data_1, axis = 0)
    data_1_cntr = data_1 - mean_1

    # np.cov returns a 0-d array when there is a single feature; promote it
    # so the eigen-decomposition always receives a square 2-D matrix.
    cov_0 = np.atleast_2d(np.cov(data_0_cntr.T))
    # A covariance matrix is symmetric, so eigh is the right solver: it
    # always returns real eigenvalues and orthonormal eigenvectors, which
    # the general-purpose eig does not guarantee.
    eig_val_0, eig_vec_0 = la.eigh(cov_0)
    # Order the eigenvectors by decreasing eigenvalue so the leading columns
    # are the principal components.
    idx = eig_val_0.argsort()[::-1]
    eig_vec_0 = eig_vec_0[:,idx]
    pc = eig_vec_0[:, 0:n_comp]
    # Project onto the kept components, map back, then undo the centring.
    result = (pc@(pc.T@data_1_cntr.T)).T + mean_1
    return result

In [7]:
np.set_printoptions(precision=4)

# mse_table[n] holds the errors for irises[n]:
#   column 0      every row replaced by the column means of iris_0
#   columns 1-4   reconstruct_orig(iris_0, data, i) for i = 1..4
#   column 5      every row replaced by the dataset's own column means
#   columns 6-9   reconstruct_self(data, i) for i = 1..4
mse_table = np.zeros(shape = (len(irises), 10))

for n, data in enumerate(irises):
    # Each error is the mean squared entry multiplied by the number of
    # features, i.e. the squared distance per row averaged over rows.  The
    # factor is taken from the data rather than hard-coded.
    # NOTE(review): every error is measured against `data` itself; if the
    # intended reference is iris_0, the second argument of np.subtract needs
    # changing -- confirm against the task definition.
    n_features = data.shape[1]

    mse = np.square(np.subtract(np.mean(iris_0, axis = 0), data)).mean()*n_features
    mse_table[n][0] = mse

    for i in range(1, 5):
        rec = reconstruct_orig(iris_0, data, i)
        mse = np.square(np.subtract(rec, data)).mean()*n_features
        mse_table[n][i] = mse
    print()

    mse = np.square(np.subtract(np.mean(data, axis = 0), data)).mean()*n_features
    mse_table[n][5] = mse
    # Show the first row of each representation: the mean first, then the
    # reconstructions with 1..4 components.
    print(np.mean(data, axis = 0)[0])

    for i in range(1, 5):
        rec, eig_val, eig_vec = reconstruct_self(data, i)
        print(rec[0])
        mse = np.square(np.subtract(rec, data)).mean()*n_features
        mse_table[n][i + 5] = mse

# Two-component reconstruction of iris_2 as a plain ndarray.
iris_2_recon = np.array(reconstruct_self(iris_2, 2)[0])


[[ 5.8651  3.055   3.7562  1.2122]]
[[ 4.8403  3.2858  1.4102  0.2176]]
[[ 5.0565  3.5445  1.3608  0.1714]]
[[ 4.9974  3.606   1.3664  0.2333]]
[[ 4.9524  3.64    1.4242  0.1512]]

[[ 5.8589  3.1125  3.6969  1.1921]]
[[ 4.6026  3.3859  0.9363  0.0704]]
[[ 4.6246  3.4261  0.9287  0.0745]]
[[ 5.2522  3.0388  0.6457 -0.0264]]
[[ 5.2074  2.9746  0.4518  0.4853]]

[[ 5.8808  2.9499  3.7542  1.1542]]
[[ 4.3952  3.2045  0.691   0.1007]]
[[ 4.0023  2.7449  0.9164 -0.112 ]]
[[ 4.0065  2.0034  0.4677  1.0076]]
[[ 4.9114  1.3423  0.1671  0.4458]]

[[ 5.7353  3.0573  3.666   1.1393]]
[[ 3.7758  3.4387 -0.2815 -0.333 ]]
[[ 0.1224  2.742   1.3912 -0.1357]]
[[ 0.0541  2.9131  1.2541  0.3672]]
[[  2.6645e-15   3.5000e+00   1.4000e+00   2.0000e-01]]

[[ 5.3107  2.8933  3.5253  1.0867]]
[[ 4.3171  3.0391  2.1387  0.6356]]
[[ 5.1672  3.0688  1.5873  0.468 ]]
[[  5.0994e+00  -2.7344e-03   1.3808e+00   2.5952e-01]]
[[  5.1000e+00   4.4409e-16   1.4000e+00   2.0000e-01]]


In [8]:
# import csv
# res = [['0N', '1N', '2N', '3N', '4N', '0c', '1c', '2c', '3c', '4c']]
# for row in mse_table:
#     res.append(row)
    
# with open("jpan22-numbers.csv",'w', newline='') as resultFile:
#     wr = csv.writer(resultFile)
#     wr.writerows(res)

# res = [['Sepal.Length','Sepal.Width','Petal.Length','Petal.Width']]
# for row in iris_2_recon:
#     res.append(row)

# with open("jpan22-recon.csv",'w', newline='') as resultFile:
#     wr = csv.writer(resultFile)
#     wr.writerows(res)

In [11]:
# Write the error table, one column per entry of mse_table's second axis,
# with three decimals and no index column.
numbers = pd.DataFrame(
    data = mse_table,
    columns = ['0N', '1N', '2N', '3N', '4N', '0c', '1c', '2c', '3c', '4c'],
)
numbers.to_csv('jpan22-numbers.csv', index = False, float_format = '%.3f')

# Write the reconstruction held in iris_2_recon in the same format.
numbers = pd.DataFrame(
    data = iris_2_recon,
    columns = ['X1', 'X2', 'X3', 'X4'],
)
numbers.to_csv('jpan22-recon.csv', index = False, float_format = '%.3f')

In [7]:
# Column means of iris_0 and iris_1, then the mean-centred copies of both.
mean_0, mean_1 = (np.mean(dataset, axis = 0) for dataset in (iris_0, iris_1))
iris_0_m, iris_1_m = iris_0 - mean_0, iris_1 - mean_1
print(mean_1)

[[ 5.8651  3.055   3.7562  1.2122]]


In [8]:
# Covariance matrices of the centred datasets.  rowvar = False tells np.cov
# that each column is a variable, which is equivalent to passing the
# transpose.
cov_0, cov_1 = (np.cov(centred, rowvar = False) for centred in (iris_0_m, iris_1_m))
print(cov_1)

[[ 0.7536 -0.0385  1.2898  0.5155]
 [-0.0385  0.2512 -0.3193 -0.1271]
 [ 1.2898 -0.3193  3.0775  1.2811]
 [ 0.5155 -0.1271  1.2811  0.6331]]


In [68]:
Dataset 1
5.865102 3.055011 3.756222 1.212214
4.8403366 3.2857853 1.4101974 0.2175673 
5.056522 3.544493 1.360780 0.171419 
4.9974172 3.6059949 1.3664187 0.2332827 
4.9524314 3.6399886 1.4241958 0.1512418 
Dataset 2
5.858884 3.112452 3.696876 1.192108
4.60259078 3.38588838 0.93633898 0.07039917 
4.62456632 3.42605349 0.92866838 0.07445531 
5.25215186  3.03882313  0.64568299 -0.02639324 
5.2073501 2.9746407 0.4518116 0.4852577
Dataset 3
5.880815 2.949890 3.754187 1.154181
4.3952141 3.2045005 0.6909603 0.1006563 
4.0023493  2.7448856  0.9164127 -0.1119578 
4.0064782 2.0033990 0.4677494 1.0075537 
4.9114087 1.3422653 0.1671120 0.4458417 
Dataset 4
5.735333 3.057333 3.666000 1.139333
3.7758046  3.4386976 -0.2814564 -0.3329735 
0.1223950  2.7420260  1.3912258 -0.1357079 
0.05412632 2.91312568 1.25406793 0.36721076 
1.179612e-16 3.500000e+00 1.400000e+00 2.000000e-01 
Dataset 5
5.310667 2.893333 3.525333 1.086667
4.3171478 3.0391033 2.1386955 0.6356332 
5.167191 3.068787 1.587304 0.467959 
5.099379645 -0.002734424  1.380795998  0.259522708 
5.100000e+00 -1.301043e-16  1.400000e+00  2.000000e-01 

SyntaxError: invalid syntax (<ipython-input-68-2e45c163b84a>, line 1)

In [101]:
# cov_1 is a covariance matrix and therefore symmetric, so eigh is the
# appropriate solver (real eigenvalues, orthonormal eigenvectors).
eig_val, eig_vec = la.eigh(cov_1)
# Neither eig nor eigh returns eigenvalues in decreasing order, so sort
# explicitly before treating the first columns as the leading components.
order = eig_val.argsort()[::-1]
eig_val = eig_val[order]
eig_vec = eig_vec[:, order]
print(eig_val)
print(eig_vec)
# Reconstruct the centred iris_1 from its two leading components and add the
# mean back.
top_2 = eig_vec[:, 0:2]
data_re = (top_2@(top_2.T@iris_1_m.T)).T + mean_1
print(data_re[0])
print(data_re.shape)

[ 4.215455  0.302644  0.131081  0.066233]
[[ 0.371806 -0.628707 -0.560118  0.390839]
 [-0.08373  -0.75237   0.582841 -0.295339]
 [ 0.851185  0.143716  0.053438 -0.50197 ]
 [ 0.360878  0.134208  0.586266  0.712774]]
[[ 5.056522  3.544493  1.36078   0.171419]]
(150, 4)


[ 96.20228289  17.73518629   4.45390782   3.16566773]
[[-0.75261193 -0.37922179 -0.51109429 -0.16896375]
 [ 0.27637467  0.55419694 -0.70175345 -0.35217168]
 [ 0.49496094 -0.66533082 -0.05120789 -0.55653062]
 [ 0.33497164 -0.32617709 -0.49366231  0.73328032]]


[[ 5.84333333  3.05733333  3.758       1.19933333]]
[[ 4.87332632  3.28420238  1.45858847  0.23764012]]
[[ 5.08303897  3.51741393  1.40321372  0.21353169]]
[[ 5.09928623  3.50072335  1.40108561  0.1982949 ]]
[[ 5.1  3.5  1.4  0.2]]


In [None]:
Dataset 1
5.865102 3.055011 3.756222 1.212214
4.8403366 3.2857853 1.4101974 0.2175673 
5.056522 3.544493 1.360780 0.171419 
4.9974172 3.6059949 1.3664187 0.2332827 
4.9524314 3.6399886 1.4241958 0.1512418 
Dataset 2
5.858884 3.112452 3.696876 1.192108
4.60259078 3.38588838 0.93633898 0.07039917 
4.62456632 3.42605349 0.92866838 0.07445531 
5.25215186  3.03882313  0.64568299 -0.02639324 
5.2073501 2.9746407 0.4518116 0.4852577
Dataset 3
5.880815 2.949890 3.754187 1.154181
4.3952141 3.2045005 0.6909603 0.1006563 
4.0023493  2.7448856  0.9164127 -0.1119578 
4.0064782 2.0033990 0.4677494 1.0075537 
4.9114087 1.3422653 0.1671120 0.4458417 
Dataset 4
5.735333 3.057333 3.666000 1.139333
3.7758046  3.4386976 -0.2814564 -0.3329735 
0.1223950  2.7420260  1.3912258 -0.1357079 
0.05412632 2.91312568 1.25406793 0.36721076 
1.179612e-16 3.500000e+00 1.400000e+00 2.000000e-01 
Dataset 5
5.310667 2.893333 3.525333 1.086667
4.3171478 3.0391033 2.1386955 0.6356332 
5.167191 3.068787 1.587304 0.467959 
5.099379645 -0.002734424  1.380795998  0.259522708 
5.100000e+00 -1.301043e-16  1.400000e+00  2.000000e-01 