In [27]:
import tensorflow as tf
import numpy as np
import pandas as pd
from keras.models import Model, load_model
from keras.layers import Input, Dense
import warnings
warnings.filterwarnings('ignore')

In [28]:
df = pd.read_csv('Data/PCA_Normalized.csv', index_col = 0)

In [29]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,BBLE
0,0.535946,-0.340776,-0.37895,-0.409478,0.385434,0.752969,-0.966496,-0.007804,1000010101
1,4.41,15.830565,7.294441,-3.931946,1.580675,15.990311,4.17777,1.256324,1000010201
2,-0.002913,0.070488,-0.009502,0.081313,-0.012082,-0.091095,-0.04743,0.064557,1000020001
3,0.035085,-0.062495,-0.065342,0.002053,-0.036879,-0.070413,-0.107799,-0.015961,1000020023
4,7.99561,-3.547684,-2.722607,-4.061353,-10.11904,9.130893,-10.615977,-13.862914,1000030001


In [30]:
sum(sorted(df['BBLE'].value_counts()))

1070994

In [31]:
mat = df.iloc[:,:-1].values.astype(np.float32)

In [32]:
mat.shape

(1070994, 8)

In [33]:
input_dim = mat.shape[1]  # 设定输入层
encoding_dim = 4  # 设定编码层

input_layer = Input(shape=(input_dim, ))  # 添加输入层

encoder = Dense(encoding_dim, activation="sigmoid")(input_layer)  # 编码器输入层使用tanh激活函数

decoder = Dense(input_dim, activation='sigmoid')(encoder)  # 解码器输入层使用tanh激活函数

autoencoder = Model(inputs=input_layer, outputs=decoder)  # 创建自动编码模型

In [34]:
nb_epoch = 3  # 设定循环次数

autoencoder.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])  # 编译自动编码器

y_predictor = autoencoder.fit(mat, mat, epochs=nb_epoch, shuffle=True)  # 训练模型

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [35]:
predictions = autoencoder.predict(mat)

In [36]:
pd.DataFrame(predictions).head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.822664,0.001279,7.8e-05,1.4e-05,0.005068,0.954477,0.00086,0.119838
1,0.940628,0.975064,0.893712,0.004069,0.606788,0.997374,0.75302,0.864253
2,0.002519,0.01147,0.000932,0.023535,0.00215,0.00431,0.002276,0.029625
3,0.003593,0.002754,0.000172,0.009971,0.001179,0.007643,0.001175,0.020451
4,0.997736,0.001658,5.2e-05,2e-06,0.007299,0.998804,0.00099,0.726779


In [37]:
diff = pd.DataFrame(predictions - mat)

In [38]:
diff.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.286718,0.342055,0.379029,0.409492,-0.380366,0.201508,0.967356,0.127641
1,-3.469372,-14.8555,-6.400728,3.936016,-0.973887,-14.992937,-3.42475,-0.392071
2,0.005431,-0.059018,0.010434,-0.057778,0.014232,0.095405,0.049705,-0.034932
3,-0.031492,0.065248,0.065514,0.007918,0.038057,0.078056,0.108974,0.036412
4,-6.997873,3.549342,2.722659,4.061355,10.12634,-8.132089,10.616967,14.589692


In [39]:
res = diff.copy()

In [40]:
res['Euclidean_Distance'] = diff.apply(lambda series: series ** 2).apply(sum, axis = 1).apply(lambda series: series ** (1/2))

In [41]:
res['BBLE'] = df['BBLE']

In [42]:
res.head()

Unnamed: 0,0,1,2,3,4,5,6,7,Euclidean_Distance,BBLE
0,0.286718,0.342055,0.379029,0.409492,-0.380366,0.201508,0.967356,0.127641,1.283709,1000010101
1,-3.469372,-14.8555,-6.400728,3.936016,-0.973887,-14.992937,-3.42475,-0.392071,22.952194,1000010201
2,0.005431,-0.059018,0.010434,-0.057778,0.014232,0.095405,0.049705,-0.034932,0.141263,1000020001
3,-0.031492,0.065248,0.065514,0.007918,0.038057,0.078056,0.108974,0.036412,0.174201,1000020023
4,-6.997873,3.549342,2.722659,4.061355,10.12634,-8.132089,10.616967,14.589692,24.077547,1000030001


In [43]:
res.sort_values('Euclidean_Distance', ascending = False).head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,Euclidean_Distance,BBLE
632815,-521.107727,354.247559,-749.752441,-273.314819,14.842957,99.41021,-51.93486,-38.979977,1023.813313,4018420001
776305,-136.497467,-620.758545,-22.341806,-488.882416,30.841997,39.819042,372.821472,-214.634766,911.635123,4080100001
565391,-638.984558,-259.374603,373.454315,-62.073997,-37.298744,298.480469,-139.448349,231.629745,884.577705,3085900700
1067359,-75.005608,-414.056091,-223.563736,137.410797,-8.616806,-456.605652,-421.321228,116.663086,803.502075,5078530085
585117,-138.96199,54.695034,135.755951,103.024406,-441.478973,-14.757317,-89.461861,-240.268326,558.748106,4004200001
585438,-120.426598,53.299198,96.776764,96.715477,-401.616211,-127.614517,92.986214,-168.020477,500.521084,4004590005
565397,-285.718597,104.716133,205.314835,133.19165,-33.421089,92.837639,-221.37114,180.560471,493.799394,3085910100
85885,-165.665695,81.685783,62.627495,93.601219,217.058807,-193.003357,212.425369,239.293884,483.272188,1012540010
917941,-100.678413,28.79159,97.204338,51.144653,90.031181,92.794067,-229.52002,-292.68869,422.009502,4142600001
750815,-17.790442,-108.71701,-52.175602,-39.44561,-8.171164,-301.920044,-235.546127,37.95293,405.658746,4066610005E


In [44]:
res.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1070994 entries, 0 to 1070993
Data columns (total 10 columns):
0                     1070994 non-null float32
1                     1070994 non-null float32
2                     1070994 non-null float32
3                     1070994 non-null float32
4                     1070994 non-null float32
5                     1070994 non-null float32
6                     1070994 non-null float32
7                     1070994 non-null float32
Euclidean_Distance    1070994 non-null float64
BBLE                  1070994 non-null object
dtypes: float32(8), float64(1), object(1)
memory usage: 49.0+ MB


In [45]:
res[['Euclidean_Distance', 'BBLE']].to_csv('Data/Auto Encoder.csv')