In [1]:
# Standard library
import os

# Pandas and NumPy
import numpy as np
import pandas as pd
# MySQL
import sqlalchemy

## Functions 

In [2]:
def parse_cols(col):
    """Turn a positional column label into a 1-based feature name.

    Parameters
    ----------
    col : object
        A column label. Anything `int()` accepts (e.g. 0, '41') is treated
        as a 0-based position.

    Returns
    -------
    str or object
        'v<position + 1>' when `col` converts to an integer, otherwise `col`
        itself, unchanged.
    """
    try:
        return 'v%d' % (int(col) + 1)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a positional label". A bare
        # `except:` would also swallow KeyboardInterrupt/SystemExit.
        return col

# Element-wise version of parse_cols for arrays of column labels.
parser = np.vectorize(parse_cols)

In [3]:
def row_completeness(df):
    """Return 1 minus the mean, over rows, of each row's fraction of null cells.

    For every row the number of null cells is divided by the number of
    columns; those per-row fractions are summed and divided by the number of
    rows, and the result is subtracted from 1.
    """
    n_columns = len(df.columns)
    n_rows = len(df.index)
    null_fraction_per_row = df.isnull().sum(axis=1) / n_columns
    return 1 - null_fraction_per_row.sum() / n_rows

In [4]:
def column_completeness(df):
    """Return the share of columns that contain no null value at all.

    A column counts as incomplete as soon as one of its cells is null.
    Raises ZeroDivisionError when `df` has no columns.
    """
    has_null = df.isnull().any()
    incomplete_columns = float(has_null.sum())
    total_columns = float(len(df.columns))
    return 1 - incomplete_columns / total_columns

In [5]:
def columns_with_nan(df):
    """Count the columns of `df` that hold at least one null value."""
    nulls_per_column = df.isnull().sum()
    return int((nulls_per_column > 0).sum())

## Connect to DB 

In [6]:
# Database credentials are taken from the environment so real secrets do not
# have to be written into the notebook. The fallbacks reproduce the local
# defaults this cell used before, so a plain "Run All" still connects.
# NOTE(review): the fallback password is still a hard-coded secret; remove it
# once SECOM_DB_PASSWORD is set wherever this notebook runs. A password with
# URL-special characters (e.g. '@', '/') must be URL-encoded before use.
con = sqlalchemy.create_engine(
    "mysql+mysqldb://%s:%s@localhost/secom" % (
        os.environ.get('SECOM_DB_USER', 'root'),
        os.environ.get('SECOM_DB_PASSWORD', '1234'),
    )
)

## Load data from csv 

### Data 

In [7]:
# The measurements file is space-separated and has no header row, so pandas
# labels the columns by position.
df_data = pd.read_csv('./data/secom.data', header=None, sep=' ')
# Relabel the positional column labels through `parser`.
df_data.columns = parser(df_data.columns.values)
df_data.head()

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,...,v581,v582,v583,v584,v585,v586,v587,v588,v589,v590
0,3030.93,2564.0,2187.7333,1411.1265,1.3602,100.0,97.6133,0.1242,1.5005,0.0162,...,,,0.5005,0.0118,0.0035,2.363,,,,
1,3095.78,2465.14,2230.4222,1463.6606,0.8294,100.0,102.3433,0.1247,1.4966,-0.0005,...,0.006,208.2045,0.5019,0.0223,0.0055,4.4447,0.0096,0.0201,0.006,208.2045
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,0.0148,82.8602,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,0.0044,73.8432,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
4,3032.24,2502.87,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5031,-0.0031,...,,,0.48,0.4766,0.1045,99.3032,0.0202,0.0149,0.0044,73.8432


### Labels 

In [8]:
# The labels file has no header row; its second column is parsed as dates.
df_labels = pd.read_csv('./data/labels.csv', parse_dates=[1], header=None)
df_labels.columns = ['results', 'timestamp']
# Sequential 0-based id, one per label row.
df_labels['id'] = range(len(df_labels))
df_labels.head()

Unnamed: 0,results,timestamp,id
0,-1,2008-07-19 11:55:00,0
1,-1,2008-07-19 12:32:00,1
2,1,2008-07-19 13:17:00,2
3,-1,2008-07-19 14:43:00,3
4,-1,2008-07-19 15:22:00,4


### Check for completeness of dataset

In [9]:
# Completeness figures for the raw measurements, before any cleansing.
print('Number of records: %d' % len(df_data))
print('Row Missing Values: %f' % (1 - row_completeness(df_data)))
print('Columns containg missing values: %d out of %d' % (
    columns_with_nan(df_data),
    df_data.shape[1],
))
print('Column Missing Values: %f' % (1 - column_completeness(df_data)))

Number of records: 1567
Row Missing Values: 0.045375
Columns containg missing values: 538 out of 590
Column Missing Values: 0.911864


# Data Cleansing 

## 1. Drops examples (rows) with more than 6% of missing values (NaN)

In [10]:
# Keep only the rows whose fraction of null cells is at most 6%.
df_complete_examples = df_data.loc[
    df_data.isnull().sum(axis=1) / len(df_data.columns) <= 0.06
]

### Check for completeness of dataset

In [11]:
# Completeness figures after dropping the rows with too many null cells.
print('Number of records: %d' % len(df_complete_examples))
print('Row Missing Values: %f' % (1 - row_completeness(df_complete_examples)))
print('Columns containg missing values: %d out of %d' % (
    columns_with_nan(df_complete_examples),
    df_complete_examples.shape[1],
))
print('Column Missing Values: %f' % (1 - column_completeness(df_complete_examples)))

Number of records: 1239
Row Missing Values: 0.037135
Columns containg missing values: 114 out of 590
Column Missing Values: 0.193220


## 2. Drops features (columns) with more than 10 missing values, then drops the remaining rows that still contain NaN

In [12]:
# First keep the columns with at most 10 null cells, then drop every row that
# still contains a null in the columns that remain.
df_complete_features = (
    df_complete_examples
    .loc[:, df_complete_examples.isnull().sum() <= 10]
    .dropna()
)

### Check for completeness of dataset

In [13]:
# Completeness figures after dropping sparse columns and leftover null rows.
print('Number of records: %d' % len(df_complete_features))
print('Row Missing Values: %f' % (1 - row_completeness(df_complete_features)))
print('Columns containg missing values: %d out of %d' % (
    columns_with_nan(df_complete_features),
    df_complete_features.shape[1],
))
print('Column Missing Values: %f' % (1 - column_completeness(df_complete_features)))

Number of records: 1209
Row Missing Values: 0.000000
Columns containg missing values: 0 out of 523
Column Missing Values: 0.000000


## Assigns cleansed dataframe

In [14]:
# Bind the fully cleansed frame to the name used by the database-loading cells.
# This is a second name for the same object, not a copy.
df_cleansed = df_complete_features
df_cleansed.head()

Unnamed: 0,v1,v2,v3,v4,v5,v6,v7,v8,v9,v10,...,v577,v578,v583,v584,v585,v586,v587,v588,v589,v590
2,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,1.4436,0.0041,...,2.0952,9.2721,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,1.4882,-0.0124,...,1.7585,8.5831,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
5,2946.25,2432.84,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,1.5287,0.0167,...,1.6679,13.7755,0.4949,0.0189,0.0044,3.8276,0.0342,0.0151,0.0052,44.0077
7,3058.88,2690.15,2248.9,1004.4692,0.7884,100.0,106.24,0.1185,1.5153,0.0157,...,56.4274,16.0862,0.4984,0.0106,0.0034,2.1261,0.0204,0.0194,0.0063,95.031
8,2967.68,2600.47,2248.9,1004.4692,0.7884,100.0,106.24,0.1185,1.5358,0.0111,...,1.3248,14.2892,0.4993,0.0172,0.0046,3.4456,0.0111,0.0124,0.0045,111.6525


# Load to database

In [15]:
# Inner-join labels and cleansed measurements on their row labels, so only
# rows present in both frames are kept; then make `id` the index.
df_table_secom = (
    df_labels
    .merge(df_cleansed, how='inner', left_index=True, right_index=True)
    .set_index('id')
)
df_table_secom.head()

Unnamed: 0_level_0,results,timestamp,v1,v2,v3,v4,v5,v6,v7,v8,...,v577,v578,v583,v584,v585,v586,v587,v588,v589,v590
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2,1,2008-07-19 13:17:00,2932.61,2559.94,2186.4111,1698.0172,1.5102,100.0,95.4878,0.1241,...,2.0952,9.2721,0.4958,0.0157,0.0039,3.1745,0.0584,0.0484,0.0148,82.8602
3,-1,2008-07-19 14:43:00,2988.72,2479.9,2199.0333,909.7926,1.3204,100.0,104.2367,0.1217,...,1.7585,8.5831,0.499,0.0103,0.0025,2.0544,0.0202,0.0149,0.0044,73.8432
5,-1,2008-07-19 17:53:00,2946.25,2432.84,2233.3667,1326.52,1.5334,100.0,100.3967,0.1235,...,1.6679,13.7755,0.4949,0.0189,0.0044,3.8276,0.0342,0.0151,0.0052,44.0077
7,-1,2008-07-19 19:45:00,3058.88,2690.15,2248.9,1004.4692,0.7884,100.0,106.24,0.1185,...,56.4274,16.0862,0.4984,0.0106,0.0034,2.1261,0.0204,0.0194,0.0063,95.031
8,-1,2008-07-19 20:24:00,2967.68,2600.47,2248.9,1004.4692,0.7884,100.0,106.24,0.1185,...,1.3248,14.2892,0.4993,0.0172,0.0046,3.4456,0.0111,0.0124,0.0045,111.6525


In [16]:
# Write the merged table through `con`, replacing `table_secom` if it exists.
df_table_secom.to_sql('table_secom', con, if_exists='replace')