# Milestone 01
# Peter Lorenz

## 0. Preliminaries

Import the required libraries:

In [2]:
import numpy as np
import pandas as pd

import matplotlib as mpl

Set global options:

In [3]:
# Display plots inline
%matplotlib inline

# Display multiple cell outputs
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Suppress scientific notation
np.set_printoptions(suppress=True)
np.set_printoptions(precision=3)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

Declare utility functions:

## 1. Refine data understanding / Prepare DFD of solving the manufacturing quality problem
In this section we refine our understanding of the data and prepare a dataflow diagram that describes a potential solution to the manufacturing quality problem.

## 2. Read and merge data
First we import the sensor data set and its accompanying labels, generating column names as necessary for convenient reference to specific features:

In [15]:
# Internet location of the data set and labels
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom.data"
labels_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/secom/secom_labels.data"

# Download sensor data and labels into a dataframe object, specify python engine for regex
sensor_data = pd.read_csv(url, sep='\s{1,}', engine='python')
sensor_labels_data = pd.read_csv(labels_url, sep='\s{1,}', engine='python')

# Generate index-based column names for the sensor data set
sensor_data.columns = list('s' + str(idx + 1) for idx in range(0, sensor_data.shape[1]))

# Assign column names to the labels
sensor_labels_data.columns = ['result', 'date', 'time']

# Display shape and initial values of the sensor data set
print('Sensor data set:')
sensor_data.shape
sensor_data.head()

# Display shape and data for labels
print('Sensor labels:')
sensor_labels_data.shape
sensor_labels_data.head()

Data set:


(1566, 590)

Unnamed: 0,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,...,s581,s582,s583,s584,s585,s586,s587,s588,s589,s590
0,3095.78,2465.14,2230.422,1463.661,0.829,100.0,102.343,0.125,1.497,-0.001,...,0.006,208.204,0.502,0.022,0.005,4.445,0.01,0.02,0.006,208.204
1,2932.61,2559.94,2186.411,1698.017,1.51,100.0,95.488,0.124,1.444,0.004,...,0.015,82.86,0.496,0.016,0.004,3.175,0.058,0.048,0.015,82.86
2,2988.72,2479.9,2199.033,909.793,1.32,100.0,104.237,0.122,1.488,-0.012,...,0.004,73.843,0.499,0.01,0.003,2.054,0.02,0.015,0.004,73.843
3,3032.24,2502.87,2233.367,1326.52,1.533,100.0,100.397,0.123,1.503,-0.003,...,,,0.48,0.477,0.104,99.303,0.02,0.015,0.004,73.843
4,2946.25,2432.84,2233.367,1326.52,1.533,100.0,100.397,0.123,1.529,0.017,...,0.005,44.008,0.495,0.019,0.004,3.828,0.034,0.015,0.005,44.008


Data set labels:


(1566, 3)

Unnamed: 0,result,date,time
0,-1,"""19/07/2008","12:32:00"""
1,1,"""19/07/2008","13:17:00"""
2,-1,"""19/07/2008","14:43:00"""
3,-1,"""19/07/2008","15:22:00"""
4,-1,"""19/07/2008","17:53:00"""


## 3. Clean and prepare data
We begin by examining the data types:

In [17]:
# List columns with data types
sensor_data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1566 entries, 0 to 1565
Data columns (total 590 columns):
 #   Column  Dtype  
---  ------  -----  
 0   s1      float64
 1   s2      float64
 2   s3      float64
 3   s4      float64
 4   s5      float64
 5   s6      float64
 6   s7      float64
 7   s8      float64
 8   s9      float64
 9   s10     float64
 10  s11     float64
 11  s12     float64
 12  s13     float64
 13  s14     float64
 14  s15     float64
 15  s16     float64
 16  s17     float64
 17  s18     float64
 18  s19     float64
 19  s20     float64
 20  s21     float64
 21  s22     float64
 22  s23     float64
 23  s24     float64
 24  s25     float64
 25  s26     float64
 26  s27     float64
 27  s28     float64
 28  s29     float64
 29  s30     float64
 30  s31     float64
 31  s32     float64
 32  s33     float64
 33  s34     float64
 34  s35     float64
 35  s36     float64
 36  s37     float64
 37  s38     float64
 38  s39     float64
 39  s40     float64
 40  s41  

All of the columns are float64, so there are no categorical columns to one-hot encode and no casting is needed. Now we look for missing data by finding the number of columns containing at least one NaN:

In [20]:
# Display number of columns with NaN in sensor data set
print("Number of columns with NaN:", len(sensor_data.columns[sensor_data.isna().any()]))

534

It appears that 534 columns have missing data that must be dealt with. We now impute and replace the missing values using the median value of the respective columns:

In [22]:
# Impute and replace missing values using column median
sensor_data = sensor_data.replace('?', np.NaN).apply(lambda x: x.fillna(x.median()))

# Verify imputation of missing values
print("Number of columns with NaN:", len(sensor_data.columns[sensor_data.isna().any()]))
sensor_data

Number of columns with NaN: 0


Unnamed: 0,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,...,s581,s582,s583,s584,s585,s586,s587,s588,s589,s590
0,3095.780,2465.140,2230.422,1463.661,0.829,100.000,102.343,0.125,1.497,-0.001,...,0.006,208.204,0.502,0.022,0.005,4.445,0.010,0.020,0.006,208.204
1,2932.610,2559.940,2186.411,1698.017,1.510,100.000,95.488,0.124,1.444,0.004,...,0.015,82.860,0.496,0.016,0.004,3.175,0.058,0.048,0.015,82.860
2,2988.720,2479.900,2199.033,909.793,1.320,100.000,104.237,0.122,1.488,-0.012,...,0.004,73.843,0.499,0.010,0.003,2.054,0.020,0.015,0.004,73.843
3,3032.240,2502.870,2233.367,1326.520,1.533,100.000,100.397,0.123,1.503,-0.003,...,0.005,72.289,0.480,0.477,0.104,99.303,0.020,0.015,0.004,73.843
4,2946.250,2432.840,2233.367,1326.520,1.533,100.000,100.397,0.123,1.529,0.017,...,0.005,44.008,0.495,0.019,0.004,3.828,0.034,0.015,0.005,44.008
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1561,2899.410,2464.360,2179.733,3085.378,1.484,100.000,82.247,0.125,1.342,-0.004,...,0.005,203.172,0.499,0.014,0.004,2.867,0.007,0.014,0.005,203.172
1562,3052.310,2522.550,2198.567,1124.659,0.876,100.000,98.469,0.120,1.433,-0.006,...,0.005,72.289,0.497,0.013,0.004,2.624,0.007,0.014,0.005,203.172
1563,2978.810,2379.780,2206.300,1110.497,0.824,100.000,99.412,0.121,1.462,-0.001,...,0.003,43.523,0.499,0.015,0.004,3.059,0.020,0.009,0.003,43.523
1564,2894.920,2532.010,2177.033,1183.729,1.573,100.000,98.798,0.121,1.462,-0.007,...,0.007,93.494,0.500,0.018,0.004,3.566,0.026,0.025,0.007,93.494


Our data set is now free of missing values. As a sanity check, we verify that the labels data is free of missing values by displaying the number of columns with missing values:

In [23]:
# Display number of columns with NaN in sensor labels
print("Number of columns with NaN:", 
      len(sensor_labels_data.columns[sensor_labels_data.isna().any()]))

Number of columns with NaN: 0


Since neither the main data set nor the labels is left with missing values, we can now proceed to data exploration.

## 4. Explore data visually
Next we explore the data visually to gain insight into how to approach creating a model.

## 5. Handle class imbalance problem
Before proceeding we need to address the class imbalance problem inherent in the data set.

## 6. Apply feature selection techniques to reduce dimensionality of data
Here we apply feature selection techniques to reduce the dimensionality of data.

## Conclusions
In lieu of final conclusions we discuss what we have accomplished thus far and the rationale for the various steps we have implemented in preparation for creating a model.