In [707]:
# Import the necessary python libraries
import numpy as np
import pandas as pd
from io import StringIO
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Import the dataset from the project directory.
# The path is a raw string so that the backslashes of the Windows path are taken
# literally instead of being parsed as (invalid) escape sequences such as '\P' or '\D'.
dataset = pd.read_csv(r'F:\Project\Final Year\Main Work\Data\Data.csv')
# Drop the 'No' column; it is not used by the rest of the notebook
dataset = dataset.drop(['No'], axis = 1)
# View the dataset
dataset

Unnamed: 0,A6,D1,D2,D3,D4,D5,D6,Fault
0,26408.713,0.000005,0.000090,0.000873,0.002784,0.019874,0.500730,LG
1,29662.064,0.000971,0.001747,0.029366,0.302926,1.828292,11.875256,NF
2,27620.752,0.000777,0.001352,0.022467,0.233044,1.433539,8.511706,NF
3,31733.711,0.000072,0.000704,0.011751,0.138808,0.482206,3.691056,L
4,30996.776,0.000554,0.000693,0.011970,0.125106,0.763365,5.912267,L
5,25299.768,0.000798,0.001417,0.023527,0.243635,1.490000,8.828671,NF
6,25256.058,0.000002,0.000005,0.000074,0.000886,0.019878,0.500691,LG
7,25251.827,0.000398,0.000825,0.009568,0.090224,1.015024,10.100867,LG
8,25685.338,0.000786,0.001381,0.022934,0.237643,1.456252,8.634304,NF
9,25256.057,0.000002,0.000005,0.000073,0.000886,0.019878,0.500691,LG


In [708]:
# HANDLING CATEGORICAL DATA
# The output column ('Fault') holds categorical string labels. The later processing
# steps work on numerical data, so the labels are mapped to integer codes with
# scikit-learn's LabelEncoder.
labelencoder = LabelEncoder()
# Learn the set of distinct fault labels first ...
labelencoder.fit(dataset['Fault'])
# ... then replace each label in the column by its integer code.
dataset['Fault'] = labelencoder.transform(dataset['Fault'])
dataset

Unnamed: 0,A6,D1,D2,D3,D4,D5,D6,Fault
0,26408.713,0.000005,0.000090,0.000873,0.002784,0.019874,0.500730,2
1,29662.064,0.000971,0.001747,0.029366,0.302926,1.828292,11.875256,4
2,27620.752,0.000777,0.001352,0.022467,0.233044,1.433539,8.511706,4
3,31733.711,0.000072,0.000704,0.011751,0.138808,0.482206,3.691056,1
4,30996.776,0.000554,0.000693,0.011970,0.125106,0.763365,5.912267,1
5,25299.768,0.000798,0.001417,0.023527,0.243635,1.490000,8.828671,4
6,25256.058,0.000002,0.000005,0.000074,0.000886,0.019878,0.500691,2
7,25251.827,0.000398,0.000825,0.009568,0.090224,1.015024,10.100867,2
8,25685.338,0.000786,0.001381,0.022934,0.237643,1.456252,8.634304,4
9,25256.057,0.000002,0.000005,0.000073,0.000886,0.019878,0.500691,2


In [709]:
# DEALING WITH MISSING DATA
# All feature values should be positive non-zero numbers, so a 0 is treated as a
# missing value and is marked as NaN (from the numpy library).
feature_columns = ['A6', 'D1', 'D2','D3','D4','D5','D6']
dataset[feature_columns] = dataset[feature_columns].replace(0, np.nan)
# Remove what cannot be imputed sensibly, and keep the result (dropna returns a new
# frame, it does not modify the dataset in place):
#   - thresh=2 keeps only the rows that have at least 2 non-NaN values;
#   - how='all' on axis=1 removes the columns in which every value is NaN.
# The index is reset so that later cells, which rebuild the frame positionally,
# stay aligned with the remaining rows.
# NOTE(review): later cells assume all seven feature columns are still present.
dataset = (
    dataset
    .dropna(thresh = 2)
    .dropna(axis=1, how='all')
    .reset_index(drop=True)
)
# Impute the remaining NaN values with the column mean using scikit-learn's
# SimpleImputer. Only the feature columns are imputed (the encoded 'Fault' label is
# left untouched) and the imputed values are written back into the dataset.
imputed_columns = [col for col in feature_columns if col in dataset.columns]
imp_mean = SimpleImputer(missing_values = np.nan, strategy='mean')
dataset[imputed_columns] = imp_mean.fit_transform(dataset[imputed_columns])
# Check that every NaN value in the features has been replaced
assert not dataset[imputed_columns].isnull().values.any(), 'NaN values remain after imputation'
dataset

Unnamed: 0,A6,D1,D2,D3,D4,D5,D6,Fault
0,26408.713,0.000005,0.000090,0.000873,0.002784,0.019874,0.500730,2
1,29662.064,0.000971,0.001747,0.029366,0.302926,1.828292,11.875256,4
2,27620.752,0.000777,0.001352,0.022467,0.233044,1.433539,8.511706,4
3,31733.711,0.000072,0.000704,0.011751,0.138808,0.482206,3.691056,1
4,30996.776,0.000554,0.000693,0.011970,0.125106,0.763365,5.912267,1
5,25299.768,0.000798,0.001417,0.023527,0.243635,1.490000,8.828671,4
6,25256.058,0.000002,0.000005,0.000074,0.000886,0.019878,0.500691,2
7,25251.827,0.000398,0.000825,0.009568,0.090224,1.015024,10.100867,2
8,25685.338,0.000786,0.001381,0.022934,0.237643,1.456252,8.634304,4
9,25256.057,0.000002,0.000005,0.000073,0.000886,0.019878,0.500691,2


In [710]:
# Standardising the inputs of the dataset (each column is rescaled to zero mean and
# unit variance; the values are NOT bounded to the range -1..1)
# Define target
target_y = dataset['Fault']
# Drop the output from the dataset
dataset = dataset.drop(['Fault'], axis = 1)
# Standardise the inputs by creating a Scaler object. 
scaler = preprocessing.StandardScaler()
# Fit the scaler on the inputs and transform them (returns a plain numpy array)
scaled_df = scaler.fit_transform(dataset)
# Define inputs: rebuild the DataFrame with the original column names and the
# original index, so that the index-aligned concat below pairs every row with its
# own target even when the index is not a default 0..n-1 range.
dataset = pd.DataFrame(scaled_df, columns = dataset.columns, index = dataset.index)
# Add the output to the dataset
dataset = pd.concat([dataset, target_y], axis = 1)
# Save the dataset 
dataset.to_excel(r'F:\Project\Final Year\Main Work\Data\Proccessed_Data.xlsx', sheet_name='ProcessedData')
dataset

Unnamed: 0,A6,D1,D2,D3,D4,D5,D6,Fault
0,-1.455089,-0.838222,-0.513463,-0.387204,-0.254217,-0.276760,-0.577959,2
1,-1.426091,-0.502916,-0.442388,-0.332265,-0.223730,-0.241703,-0.445879,4
2,-1.444286,-0.570312,-0.459349,-0.345568,-0.230829,-0.249355,-0.484936,4
3,-1.407626,-0.815078,-0.487150,-0.366229,-0.240400,-0.267797,-0.540913,1
4,-1.414194,-0.647863,-0.487585,-0.365807,-0.241792,-0.262347,-0.515121,1
5,-1.464973,-0.563033,-0.456572,-0.343522,-0.229753,-0.248261,-0.481256,4
6,-1.465363,-0.839246,-0.517084,-0.388746,-0.254410,-0.276760,-0.577960,2
7,-1.465401,-0.701915,-0.481951,-0.370438,-0.245335,-0.257468,-0.466483,2
8,-1.461537,-0.567077,-0.458115,-0.344668,-0.230361,-0.248915,-0.483513,4
9,-1.465363,-0.839249,-0.517086,-0.388746,-0.254410,-0.276760,-0.577960,2


In [711]:
# Splitting dataset into training and testing datasets
# A fixed seed makes the split (and therefore the files saved from it) reproducible
# across kernel restarts.
SPLIT_SEED = 42
# NOTE: despite their names, x_train and y_test are the complete training and
# testing frames (features plus the 'Fault' column), 80% / 20% of the rows.
x_train, y_test = train_test_split(dataset, test_size=0.2, random_state=SPLIT_SEED)
(train_data, test_data) = (x_train, y_test)
# Carve a tiny "check" sample out of the training frame. A float test_size is a
# proportion of the rows, so 0.001 selects very few rows.
# NOTE(review): presumably a single row is intended - confirm; an integer
# test_size would request an exact row count.
x, y = train_test_split(x_train, test_size=0.001, random_state=SPLIT_SEED)
# The check sample keeps only the inputs (the 'Fault' column is removed)
check_data = y.drop(['Fault'], axis = 1)
print(check_data)

          A6       D1        D2       D3        D4        D5        D6
65 -0.071251 -0.12111 -0.329926 -0.29466 -0.208723 -0.173515 -0.011399


In [712]:
# Saving the training, testing and checking datasets
# Each entry is (frame, CSV path, Excel path, Excel sheet name). Every frame is
# written twice: as a CSV without header or index, and as an Excel workbook.
export_plan = [
    # Training data
    (train_data,
     r'F:\Project\Final Year\Main Work\Data\Train_Data.csv',
     r'F:\Project\Final Year\Main Work\Data\Train_Data.xlsx',
     'Training_Data'),
    # Testing data
    (test_data,
     r'F:\Project\Final Year\Main Work\Data\Test_Data.csv',
     r'F:\Project\Final Year\Main Work\Data\Test_Data.xlsx',
     'Testing_Data'),
    # Checking data
    # NOTE(review): the sheet name reuses 'Testing_Data' - possibly a copy/paste
    # leftover; confirm before renaming, since readers of the file may rely on it.
    (check_data,
     r'F:\Project\Final Year\Main Work\Data\Check_Data.csv',
     r'F:\Project\Final Year\Main Work\Data\Check_Data.xlsx',
     'Testing_Data'),
]
for frame, csv_path, xlsx_path, sheet in export_plan:
    frame.to_csv(csv_path, header = False, index=False)
    frame.to_excel(xlsx_path, sheet_name=sheet)