In [1]:
import h5py
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import animation
import peakutils
import scipy
from scipy import fft
from scipy import signal
from scipy import integrate
from scipy.fftpack import fft
from scipy.fftpack import fftfreq
from scipy import stats
from scipy.stats import kurtosis, skew
from scipy.signal import find_peaks
from sklearn import preprocessing
from sklearn.svm import OneClassSVM
import warnings
import random
import math
from math import pi
import seaborn as sns
import openpyxl
from openpyxl import Workbook
from openpyxl import load_workbook
import warnings
from time import process_time
from matplotlib import cm
warnings.filterwarnings("ignore")
%matplotlib inline
# Global matplotlib settings shared by every figure in this notebook.
plt.rcParams.update({
    'agg.path.chunksize': 10000,
    'font.size': 18,
    'font.family': 'Arial',
})

# Sampling frequencies of the sensors used for data acquisition.
sampling_vibration, sampling_acoustic = 50000, 1000

import functions

# START of the data processing

### Data processing for aircut / milling data classification

In [None]:
# Labeling the vibration data as aircut (0) and milling (1) for trials 2,9,12,18

# Reading the vibration data. The file is opened in a `with` block so the HDF5
# handle is released as soon as the three channels are loaded; `[:]` reads each
# dataset into an in-memory array, so nothing below needs the open file.
with h5py.File(r'M:\THESIS_IPT\MRIDUL\Versuche\vibration_data1\18.h5', 'r') as data:
    # The divisors are sensor-dependent coefficients that convert the digital
    # signal values to a physical unit.
    Vibration_X = data['acq_0_MCI42__AnalogInputs_AI1'][:] / 0.0967
    Vibration_Y = data['acq_0_MCI42__AnalogInputs_AI2'][:] / 0.0968
    Vibration_Z = data['acq_0_MCI42__AnalogInputs_AI3'][:] / 0.0947

# combining the vibrations in all directions into one list (order: X, Y, Z)
combined_vibration = [Vibration_X, Vibration_Y, Vibration_Z]

# determining start and end of the milling operations for a milling trial
# (only the X channel is passed to the detection helper)
ind_start, ind_end = functions.milling_ind(combined_vibration[0], int_size = 1000, cutoff = 1)
# generating labels
air_mill_data = functions.air_mill_classify(combined_vibration, ind_start, ind_end)

plt.plot(air_mill_data['aircut0'])

#air_mill_data.to_csv(r'M:\THESIS_IPT\MRIDUL\air_mill\18airmill.csv',index=False)

In [None]:
# Read the per-sample aircut/milling labels of trial 18 from disk (same path as
# the commented-out `to_csv` of the labelling cell) and display the frame.
labelled_csv = r'M:\THESIS_IPT\MRIDUL\air_mill\18airmill.csv'
data = pd.read_csv(labelled_csv)
data

In [None]:
# Taking a rolling window of size 20 ms (1000 data points)

# window size
window_length = 1000
window_half = window_length//2
lim = data.shape[0]//window_length

# One [max_x, max_y, max_z, aircut] row per window. The rows are collected in a
# plain list and turned into a DataFrame once: growing a DataFrame with
# `DataFrame.append` inside the loop is quadratic and the method no longer
# exists in pandas >= 2.0.
window_rows = []

for i in range(2*lim):
    # overlapping rolling windows: consecutive windows start half a window apart.
    # NOTE(review): the final window (i = 2*lim-1) starts window_half rows before
    # lim*window_length, so iloc returns fewer than window_length rows unless at
    # least window_half rows remain past lim*window_length. Kept as-is so the
    # number of windows stays 2*lim.
    temp = data.iloc[i*window_half:i*window_half+window_length]

    # generating labels
    # label = 1 if more than window_half samples of the window belong to the milling class
    # label = 0 otherwise (aircut)
    num_one = (temp['aircut0']==1).sum()
    aircut = 1 if num_one > window_half else 0

    # maximum acceleration value of each axis in the window, followed by the label
    window_rows.append([temp['vibration_x'].max(),
                        temp['vibration_y'].max(),
                        temp['vibration_z'].max(),
                        aircut])

# Dataframe storing maximum acceleration values of the windows along with label (aircut/milling).
# A flat list of names gives ordinary columns; the previous nested list built a
# one-level MultiIndex instead.
air_mill_data = pd.DataFrame(window_rows, columns=['max_x', 'max_y', 'max_z', 'aircut'])

plt.plot(air_mill_data['aircut'])

#air_mill_data.to_csv(r'M:\THESIS_IPT\MRIDUL\air_mill\taking window\20ms\18airmill.csv',index=False)

In [None]:
# merging datasets to obtain training data

# windowed aircut/milling data of trials 02, 09, 12 and 18
data1=pd.read_csv(r'M:\THESIS_IPT\MRIDUL\air_mill\taking window\20ms\02airmill.csv')
data2=pd.read_csv(r'M:\THESIS_IPT\MRIDUL\air_mill\taking window\20ms\09airmill.csv')
data3=pd.read_csv(r'M:\THESIS_IPT\MRIDUL\air_mill\taking window\20ms\12airmill.csv')
data4=pd.read_csv(r'M:\THESIS_IPT\MRIDUL\air_mill\taking window\20ms\18airmill.csv')

# Stack all trials in one call (`DataFrame.append` was removed in pandas 2.0).
# The row order is the one the chained appends produced: 18, 12, 09, 02.
data4 = pd.concat([data4, data3, data2, data1], ignore_index=True)

#data4.to_csv(r'M:\THESIS_IPT\MRIDUL\data_overlapping_windows\points1000_20ms\model data\airmill.csv',index=False)
data4

### Data processing for surface profile

In [None]:
# manually merging the vibration data with the surface data for all trials

# Surface-profile text files of the five measured lines (Part06, layer1 folder).
filename_surface1 = r'M:\THESIS_IPT\MRIDUL\surface_data_WZL\layer1\Part06_Line1.txt'
filename_surface2 = r'M:\THESIS_IPT\MRIDUL\surface_data_WZL\layer1\Part06_Line2.txt'
filename_surface3 = r'M:\THESIS_IPT\MRIDUL\surface_data_WZL\layer1\Part06_Line3.txt'
filename_surface4 = r'M:\THESIS_IPT\MRIDUL\surface_data_WZL\layer1\Part06_Line4.txt'
filename_surface5 = r'M:\THESIS_IPT\MRIDUL\surface_data_WZL\layer1\Part06_Line5.txt'

# Vibration recording 06.h5, opened read-only. The handle is left open for the
# cell that reads the three acceleration channels from it.
vibration_h5 = r'M:\THESIS_IPT\MRIDUL\Versuche\vibration_data1\06.h5'
VibationData = h5py.File(vibration_h5, 'r')

In [None]:
# Extract the surface profile of each measured line. extract_surface returns a
# pair; only its second element (the profile) is used, the first is discarded.
((_, surf_profile1),
 (_, surf_profile2),
 (_, surf_profile3),
 (_, surf_profile4),
 (_, surf_profile5)) = map(functions.extract_surface,
                           (filename_surface1, filename_surface2, filename_surface3,
                            filename_surface4, filename_surface5))

surf_combined = [surf_profile1, surf_profile2, surf_profile3, surf_profile4, surf_profile5]

# Acceleration channels, each divided by its own channel coefficient.
# NOTE(review): VibationData is still open after this cell; close it once no
# cell needs the file any more.
vibration_combined = [
    VibationData['acq_0_MCI42__AnalogInputs_AI1'][:] / 0.0967,
    VibationData['acq_0_MCI42__AnalogInputs_AI2'][:] / 0.0968,
    VibationData['acq_0_MCI42__AnalogInputs_AI3'][:] / 0.0947,
]
Vibration_X, Vibration_Y, Vibration_Z = vibration_combined

# Start/end indices of the milling operations, detected on the X channel.
ind_start, ind_end = functions.milling_ind(vibration_combined[0], int_size = 1000, cutoff = 1)

# Combine vibration channels and surface profiles using those indices.
merged_data = functions.merge_vibration_surface(vibration_combined, surf_combined, ind_start, ind_end)

#merged_data.to_csv(r'M:\THESIS_IPT\MRIDUL\merged_surface\06merged1.csv',index=False)
merged_data

In [None]:
# Creates 11 equidistant ticks in range of Vibration signal (beginning to end)
xticks = list(np.linspace(0, Vibration_X.shape[0], 11, endpoint = True))
# Tick labels in whole seconds: sample index divided by the vibration sampling
# rate. The rate comes from `sampling_vibration` (defined with the imports)
# instead of a second hard-coded 50000, so the axis cannot drift from the
# constant used for feature extraction.
xlabels = tuple(int(tick / sampling_vibration) for tick in xticks)

# Left axis: X-axis vibration of the merged data
fig, ax1 = plt.subplots(figsize=(16,4))
ax1.set_xlabel('Time [s]')
ax1.set_ylabel('Acceleration [$m/s^2$]')
ax1.plot(merged_data['vibration_x'], label='Vibration X')

# Right axis (shares the x-axis): surface profile column of the merged data
ax2 = ax1.twinx()
ax2.set_ylabel('Surface Profile [mm]')
ax2.plot(merged_data['surface_roughness'], color='orange',label='Surface Profile')

# one legend for the lines of both axes
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax2.legend(lines + lines2, labels + labels2, loc=0, fontsize=16)

plt.xticks(xticks,xlabels)
plt.title('Vibration X and Surface Profiles')
#plt.savefig(r'M:\THESIS_IPT\REPORT\images\1_6vibx_surface.png',bbox_inches='tight', dpi=1000)
plt.show()

### Adding features for vibration data (for all milling trials at once)

In [None]:
# Adding vibration features (time domain + frequency domain)

# location of files containing vibration data merged with surface data
file_loc = r'M:\THESIS_IPT\MRIDUL\merged_surface'
# location for storing files containing the vibration features and the target variable (mean peak value)
location = r'M:\THESIS_IPT\MRIDUL\data_overlapping_windows\points1000_20ms\merged_vibration_only'

# window size (same for every file)
window_length = 1000
window_half = window_length//2

# column names for added features (flat list -> ordinary column index; the
# previous nested list produced a one-level MultiIndex)
feature_columns = ['rms_x', 'rms_y', 'rms_z',
                   'kurt_x', 'kurt_y', 'kurt_z',
                   'skew_x', 'skew_y', 'skew_z',
                   'mean_x', 'mean_y', 'mean_z',
                   'std_x', 'std_y', 'std_z',
                   'peak_x', 'peak_y', 'peak_z',
                   'crest_x', 'crest_y', 'crest_z',
                   'clear_x', 'clear_y', 'clear_z',
                   'shape_x', 'shape_y', 'shape_z',
                   'impulse_x', 'impulse_y', 'impulse_z',
                   'msf_x', 'msf_y', 'msf_z',
                   'osac_x', 'osac_y', 'osac_z',
                   'fc_x', 'fc_y', 'fc_z',
                   'sf_x', 'sf_y', 'sf_z',
                   'avg_peak',
                   'frac_dim_x', 'frac_dim_y', 'frac_dim_z',
                   'fifth_x', 'fifth_y', 'fifth_z',
                   'sixth_x', 'sixth_y', 'sixth_z',
                   'freq_x_0_5000', 'freq_x_5000_10000', 'freq_x_10000_15000', 'freq_x_15000_20000', 'freq_x_20000_25000',
                   'freq_y_0_5000', 'freq_y_5000_10000', 'freq_y_10000_15000', 'freq_y_15000_20000', 'freq_y_20000_25000',
                   'freq_z_0_5000', 'freq_z_5000_10000', 'freq_z_10000_15000', 'freq_z_15000_20000', 'freq_z_20000_25000']


def _vibration_rows(features):
    """Turn one `functions.add_features` result into a DataFrame of rows.

    The return type of `add_features` is not visible in this notebook, so the
    value is normalised the way the removed `DataFrame.append` treated its
    argument: a DataFrame is used as is, a dict/Series becomes a single row,
    anything else (e.g. a list of row lists) goes through `pd.DataFrame`.
    """
    if isinstance(features, pd.DataFrame):
        return features
    if isinstance(features, dict):
        features = pd.Series(features)
    if isinstance(features, pd.Series):
        return features.to_frame().T.infer_objects()
    return pd.DataFrame(features)


for file in os.listdir(file_loc):
    filename = os.fsdecode(file)

    # reading all csv files one by one
    if not filename.endswith('.csv'):
        continue

    path_ = os.path.join(file_loc, filename)
    print(path_)
    merge_data=pd.read_csv(path_)
    lim = merge_data.shape[0]//window_length

    # Feature rows are collected per window and concatenated once per file;
    # `DataFrame.append` in the loop was quadratic and is gone in pandas >= 2.0.
    window_features = []

    # overlapping rolling windows (consecutive windows start half a window apart)
    # NOTE(review): the final window (i = 2*lim-1) is cut short by iloc unless at
    # least window_half rows remain past lim*window_length; kept so the number
    # of windows per file stays 2*lim.
    for i in range(2*lim):
        temp = merge_data.iloc[i*window_half:i*window_half+window_length].copy()
        temp = temp.reset_index(drop=True)

        # adding the computed features
        features = functions.add_features(temp, sampling_vibration, win=window_length-1, segment_length=window_length) # keep window size high and 1000//win should not equal to 0
        window_features.append(_vibration_rows(features))

    if not window_features:
        # file shorter than one window: nothing to compute, go on with the next file
        print('skipped, fewer rows than one window:', path_)
        continue

    features_data = pd.concat(window_features, ignore_index=True)
    # missing feature values are replaced by the placeholder 0.0001
    features_data = features_data.fillna(0.0001)
    features_data.columns = feature_columns

    # extracting name of the file for storing
    name = os.path.splitext(file)[0]

    #features_data.to_csv(location + '//' + name + 'vibration' + '.csv',index=False)

### Adding features for acoustic data (for all milling trials at once)

In [None]:
# Adding acoustic features (frequency domain)

# directories
# The two file lists below are paired by position (entry i of one belongs to
# entry i of the other). os.listdir returns names in arbitrary order, so both
# listings are sorted to make that pairing deterministic. Sorting does not
# prove the pairing is right - check the printed pairs below.

# location of files containing features of vibration data
file_loc = r'M:\THESIS_IPT\MRIDUL\data_overlapping_windows\points1000_20ms\merged_vibration_only'
mer_vib_data = [os.path.join(file_loc, os.fsdecode(file))
                for file in sorted(os.listdir(file_loc))
                if os.fsdecode(file).endswith('.csv')]

# location of files containing the raw sensor data (.h5)
file_loc = r'M:\THESIS_IPT\MRIDUL\Versuche\vibration_data_mixed'
sens_data = [os.path.join(file_loc, os.fsdecode(file))
             for file in sorted(os.listdir(file_loc))
             if os.fsdecode(file).endswith('.h5')]

# printing name of the directories for verifying the order of files being read
for i in range(len(mer_vib_data)):
    print(mer_vib_data[i])
    print(sens_data[i])
    name = os.path.splitext(mer_vib_data[i])[0]

    # extracting name of the file for storing: the file name without folder and
    # extension. os.path.basename replaces the hard-coded character offset 84,
    # which only worked for one exact folder path.
    print(len(name))
    print(os.path.basename(name))
    print('\n')

In [None]:
# Adding acoustic features (frequency domain)

# location of files for storing the dataframe containing features of vibration data and acoustic data
location = r'M:\THESIS_IPT\MRIDUL\data_overlapping_windows\points1000_20ms\merged_vibration_ae'

# frequency is recorded from 0 kHz to 500 kHz with a uniform interval of 5 kHz
# (identical for every file, so it is built once outside the loops)
freq = np.arange(0,500,5)

# window length
# Per the original note: the AE sensor has a sampling rate of 1000 Hz, so in
# 20 ms it will record 20 STFT windows (i.e. 20*100 = 2000 datapoints).
# NOTE(review): this layout cannot be verified from this notebook - confirm.
window_length = 2*1000
window_half = window_length//2

# column names for the added features
ae_columns = ['AE_msf','AE_osac', 'AE_fc', 'AE_sf',
              'freqAE_0_100', 'freqAE_100_200', 'freqAE_200_300', 'freqAE_300_400', 'freqAE_400_500']


def _ae_rows(features):
    """Turn one `functions.add_AEfeatures` result into a DataFrame of rows.

    The return type of `add_AEfeatures` is not visible in this notebook, so the
    value is normalised the way the removed `DataFrame.append` treated its
    argument: a DataFrame is used as is, a dict/Series becomes a single row,
    anything else (e.g. a list of row lists) goes through `pd.DataFrame`.
    """
    if isinstance(features, pd.DataFrame):
        return features
    if isinstance(features, dict):
        features = pd.Series(features)
    if isinstance(features, pd.Series):
        return features.to_frame().T.infer_objects()
    return pd.DataFrame(features)


for j in range(len(mer_vib_data)):

    # name of the file containing the vibration features
    print(mer_vib_data[j])
    # name of the file containing sensor data
    print(sens_data[j])
    print('\n')

    merge_features_data=pd.read_csv(mer_vib_data[j])

    # reading the acoustic data; `with` closes the HDF5 file right after the
    # dataset is read into memory, so handles no longer pile up per iteration
    with h5py.File(sens_data[j], 'r') as data:
        AE_temp=data['acq_1_MCI42__AnalogInputsHS_RawHS1[0]'][:]

    lim = AE_temp.shape[0]//window_length

    # rows are collected per window and concatenated once per file
    # (`DataFrame.append` was removed in pandas 2.0 and was quadratic here)
    window_features = []

    # overlapping rolling window (consecutive windows start half a window apart)
    for i in range(2*lim):
        temp = AE_temp[i*window_half:i*window_half+window_length]

        # computing STFT of the acoustic data
        fft_ae = functions.fftAE(signal = temp, sample_rate = sampling_acoustic)
        # mean over the first axis of the result (per the original note: the
        # mean of the amplitudes of 20 STFT windows)
        mag = np.mean(fft_ae, axis=0)

        # adding generated features
        features = functions.add_AEfeatures(freq,mag)
        window_features.append(_ae_rows(features))

    if not window_features:
        # recording shorter than one window: nothing to merge for this trial
        print('skipped, fewer samples than one window:', sens_data[j])
        continue

    features_data = pd.concat(window_features, ignore_index=True)
    features_data.columns = ae_columns

    # merging the dataframe of vibration features with the dataframe with acoustic features
    # (column-wise; rows are aligned on the index)
    new = pd.concat([merge_features_data, features_data], axis=1)

    # extracting name for storing the file: file name without folder and
    # extension (replaces the hard-coded character offset 84)
    name = os.path.basename(os.path.splitext(mer_vib_data[j])[0])

    #new.to_csv(location + '//' + name + 'AE.csv',index=False)

### Combining all files containing the features from the milling trials into one file

In [None]:
# location of files for storing the dataframe containing features of vibration data and acoustic data
# (raw string: the backslashes of the Windows path must not be read as escape
# sequences; the resulting value is the same as before)
file_loc = r'M:\THESIS_IPT\MRIDUL\data_overlapping_windows\points1000_20ms\merged_vibration_ae'

# reading all csv files; sorted() makes the row order of the combined frame
# independent of the arbitrary order os.listdir returns
trial_frames = [pd.read_csv(os.path.join(file_loc, os.fsdecode(file)))
                for file in sorted(os.listdir(file_loc))
                if os.fsdecode(file).endswith('.csv')]

# merging all files into one (adding dataframes one below another) with a
# single concat - `DataFrame.append` was removed in pandas 2.0. An empty folder
# still yields an empty DataFrame, as before.
data_all = pd.concat(trial_frames, ignore_index=True) if trial_frames else pd.DataFrame()

#data_all.to_csv(r'M:\THESIS_IPT\MRIDUL\data_overlapping_windows\points1000_20ms\combined_merged_featuresALL.csv',index=False)
data_all

### Removing undesired data

In [None]:
combined_data = pd.read_csv(r'M:\THESIS_IPT\MRIDUL\data_overlapping_windows\points1000_20ms\combined_merged_featuresALL.csv')

# target variable before trimming
fig, ax = plt.subplots(figsize=(16,4))
ax.plot(combined_data['avg_peak'])
plt.show()

# Rows at the start and end of the milling data show large average peak values;
# every row with avg_peak above 0.1 is removed (index labels are kept).
rows_to_drop = combined_data.loc[combined_data['avg_peak'] > 0.1].index
combined_data = combined_data.drop(index=rows_to_drop)

# target variable after trimming
fig, ax = plt.subplots(figsize=(16,4))
ax.plot(combined_data['avg_peak'])
plt.show()

In [None]:
# dropping random air data to have balanced amount of aircut and milling data for training

# selecting aircut data based on the value of target variable: rows whose
# avg_peak equals 0.0001 count as aircut. The positions (not index labels) of
# those rows are found with one vectorised comparison instead of a Python loop
# over iloc; the result is a plain list because random sampling needs a sequence.
air_index = np.flatnonzero(combined_data['avg_peak'].to_numpy() == 0.0001).tolist()

print('number of aircut datapoints:',len(air_index))

mill_points = combined_data.shape[0] - len(air_index)
print('number of milling datapoints:',mill_points)

# NOTE(review): this count equals mill_points plus half of the aircut surplus,
# i.e. about half of all rows, which leaves roughly diff_points/2 aircut rows
# rather than an equal number of aircut and milling rows. Sampling raises
# ValueError once points_to_drop exceeds the number of aircut rows. Formula
# kept unchanged - confirm the intended balance.
diff_points = len(air_index) - mill_points
points_to_drop = mill_points + diff_points//2
print('number of data points to be dropped from the aircut data:',points_to_drop)

# randomly selecting the indices of aircut data for deletion. A dedicated,
# seeded generator makes the selection reproducible across kernel restarts
# without touching the global `random` state.
rand_air_index = random.Random(42).sample(air_index,points_to_drop)

# dropping the aircut data to reduce its influence while training the ML model
# (positions are translated to index labels before dropping)
combined_data = combined_data.drop(index=combined_data.index[rand_air_index])

# storing the modified dataset
#combined_data.to_csv(r'M:\THESIS_IPT\MRIDUL\data_overlapping_windows\points1000_20ms\Combined_final_ALL.csv',index=False)

### Outliers removal

In [None]:
combined_data = pd.read_csv(r'M:\THESIS_IPT\MRIDUL\data_overlapping_windows\points1000_20ms\Combined_final_ALL.csv')

# --- rms_x trace of the unfiltered data
fig, ax = plt.subplots(figsize = (16, 4))
ax.plot(combined_data['rms_x'])
ax.set_xlabel('Data Points')
ax.set_ylabel('Acceleration [$m/s^2$]')
ax.set_title('Root Mean Square values BEFORE Outliers Removal')
#plt.savefig(r'M:\THESIS_IPT\REPORT\images\outlierBEFORE_1_6_RMS.png',bbox_inches='tight',dpi=1000)
plt.show()

# --- box plots of rms_x, left: before, right: after the IQR filter
p, (ax1, ax2) = plt.subplots(1,2,figsize=(16,9))
p.suptitle("Data of all Milling Trials Combined: Before and After Outliers Removal")

combined_data.boxplot('rms_x', ax=ax1)
ax1.set_xlabel('Before Outliers Removal')
ax1.set_ylim(-1,15)
ax1.set_ylabel('Acceleration [$m/s^2$]')

# IQR rule: rows with rms_x above Q3 + 1.5*IQR or below Q1 - 1.5*IQR are removed
Q1, Q3 = (combined_data['rms_x'].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
outlier_range1 = Q3 + 1.5*IQR
outlier_range2 = Q1 - 1.5*IQR
rms_values = combined_data['rms_x']
outside_fences = (rms_values > outlier_range1) | (rms_values < outlier_range2)
combined_data = combined_data.drop(index=combined_data.index[outside_fences.to_numpy()])

combined_data.boxplot('rms_x', ax=ax2)
ax2.set_xlabel('After Outliers Removal')
ax2.set_ylim(-1,15)
ax2.set_ylabel('Acceleration [$m/s^2$]')
#plt.savefig(r'M:\THESIS_IPT\REPORT\images\outliers_1_6_RMS.png',bbox_inches='tight',dpi=1000)
plt.show()



# --- rms_x trace after the IQR filter
fig, ax = plt.subplots(figsize = (16, 4))
ax.plot(combined_data['rms_x'], label='RMS values')
ax.set_xlabel('Data Points')
ax.set_ylabel('Acceleration [$m/s^2$]')
ax.set_title('Root Mean Square values AFTER Outliers Removal')
ax.legend()
#plt.savefig(r'M:\THESIS_IPT\REPORT\images\outlierAFTER_1_6_RMS.png',bbox_inches='tight',dpi=1000)
plt.show()



# --- second variant: reload the unfiltered file (this discards the IQR result)
# and remove only the rows with rms_x above the fixed limit 10
combined_data = pd.read_csv(r'M:\THESIS_IPT\MRIDUL\data_overlapping_windows\points1000_20ms\Combined_final_ALL.csv')
above_limit = combined_data.loc[combined_data['rms_x'] > 10].index
combined_data = combined_data.drop(index=above_limit)

fig, ax = plt.subplots(figsize = (16, 4))
ax.plot(combined_data['rms_x'])
ax.set_xlabel('Data Points')
ax.set_ylabel('Acceleration [$m/s^2$]')
ax.set_title('Root Mean Square values AFTER Outliers Removal')
#plt.savefig(r'M:\THESIS_IPT\REPORT\images\outlierAFTER2.0_1_6_RMS.png',bbox_inches='tight',dpi=1000)
plt.show()

In [None]:
# One line plot per column, preceded by the column name, to spot outliers by eye.
for column_name, column_values in combined_data.items():
    print(column_name)
    plt.plot(column_values)
    plt.show()

In [None]:
# removing outlier: dropping the data points based on their values

# The limits are kept in two tables instead of one copy-pasted in-place drop per
# limit. A row is removed when ANY listed feature is above its upper limit or
# below its lower limit, which is the same set of rows the sequential drops
# removed, because each test only looks at the row's own values.

# feature -> largest accepted value
upper_limits = {
    'rms_x': 10,
    'rms_y': 20,
    'kurt_x': 75000,
    'kurt_y': 40000,
    'kurt_z': 50000,
    'skew_x': 2500,
    'mean_x': 1,
    'mean_y': 1,
    'crest_x': 13,
    'crest_y': 12,
    'crest_z': 15,
    #'clear_x': 15,
    #'clear_y': 15,
    'clear_z': 20,
    #'impulse_x': 20,
    'impulse_y': 20,
    'impulse_z': 20,
    'fifth_x': 1000000,
    'sixth_x': 5000000,
    'sixth_y': 2000000,
    'sixth_z': 10000000,
}

# feature -> smallest accepted value
lower_limits = {
    'skew_x': -2500,
    'skew_y': -1100,
    'skew_z': -2000,
    'mean_x': -1,
    'mean_y': -1,
    'fifth_x': -1000000,
    'fifth_y': -400000,
    'fifth_z': -500000,
}

# positional boolean mask of the rows to remove (NaN never counts as an outlier,
# as comparisons with NaN are False)
outlier_rows = np.zeros(len(combined_data), dtype=bool)
for feature, limit in upper_limits.items():
    outlier_rows |= (combined_data[feature] > limit).to_numpy()
for feature, limit in lower_limits.items():
    outlier_rows |= (combined_data[feature] < limit).to_numpy()

combined_data = combined_data.drop(index=combined_data.index[outlier_rows])


# NOTE(review): this output path contains an extra `emre_stft_vib` folder, while
# the class-transform cell reads Combined_final_ALL_noOutliers.csv directly from
# `points1000_20ms` - confirm which location is intended.
#combined_data.to_csv(r'M:\THESIS_IPT\MRIDUL\data_overlapping_windows\points1000_20ms\emre_stft_vib\Combined_final_ALL_noOutliers.csv',index=False)

### Transforming the dataset to classification data (with multiple classes)

In [8]:
# number of classes
num_classes = 10

# Outlier-free feature table, handed to class_transform together with the
# number of classes.
no_outliers_csv = r'M:\THESIS_IPT\MRIDUL\data_overlapping_windows\points1000_20ms\Combined_final_ALL_noOutliers.csv'
combined_data = pd.read_csv(no_outliers_csv)
final_data = functions.class_transform(num_classes, combined_data)

#final_data.to_csv(r'M:\THESIS_IPT\MRIDUL\data_overlapping_windows\points1000_20ms\Combined_final_ALL_noOutliers0_10.csv',index=False)
