In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import sys
from io import StringIO

In [2]:
df_data = pd.read_csv('./data/secom.data', sep=' ', header=None)
df_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,580,581,582,583,584,585,586,587,588,589
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,,,0.5005,0.0118,0.0035,2.363,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,0.006,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,0.0044,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432


In [16]:
def row_completeness(df):
    return (df.isnull().sum()/len(df.columns)).sum() / len(df.index)

In [17]:
def column_completeness(df):
    return float(len(df.columns[df.isnull().any()])) / float(len(df.columns))

In [19]:
def columns_with_nan(df):
    return len(df.columns) - len(df.columns[ df.isnull().sum() / len( df.columns ) == 0 ])

# Data Cleansing 

### Check for completeness of dataset

In [39]:
print('Number of records: %d' % (len(df_data.index)))
print('Row Completeness: %f' % (row_completeness(df_data)))
print('Columns containg missing values: %d' % (columns_with_nan(df_data)))
print('Row Completeness: %f' % (column_completeness(df_data)))

Number of records: 1567
Row Completeness: 0.045375
Columns containg missing values: 538
Row Completeness: 0.911864


## 1. Drops examples (rows) with more than 6% of missing values (NaN)

In [40]:
df_complete_examples = df_data[ df_data.isnull().sum(axis=1)/len(df_data.columns) <= 0.06 ]

### Check for completeness of dataset

#### Rows 

In [30]:
print('Number of records: %d' % (len(df_complete_examples.index)))
print('Completeness: %f' % (row_completeness(df_complete_examples)))

Number of records: 1239
Completeness: 0.037135


#### Columns 

In [31]:
print('Columns containg missing values: %d' % (columns_with_nan(df_complete_examples)))
print('Completeness: %f' % (column_completeness(df_complete_examples)))

Columns containg missing values: 114
Completeness: 0.193220


## 2. Drops features (columns) with more that 10 missing values 

In [32]:
df_complete_features = df_complete_examples[df_complete_examples.columns[ df_complete_examples.isnull().sum() > 10 ]]

### Check for completeness of dataset

#### Rows 

In [36]:
print('Number of records: %d' % (len(df_complete_features)))
print('Completeness: %f' % (row_completeness(df_complete_features)))

Number of records: 1239
Completeness: 0.325383


#### Columns 

In [37]:
print('Number of records: %d' % (columns_with_nan(df_complete_features)))
print('Completeness: %f' % (column_completeness(df_complete_features)))

Number of records: 67
Completeness: 1.000000
