In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib

import urllib.request # request library for downloading a url
import os.path

In [2]:
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10, 8) # set default figure size (width, height): 10in by 8in

# Example Download of UCI Data Set

Here is an example of downloading a file from a URL and then loading it into
a pandas DataFrame.

In [3]:
# create a report hook function, so that the urlretrieve() can display
# a status report while downloading
def urlretrieve_reporthook(block_number, read_size, total_file_size):
    """Print an in-place progress line for ``urllib.request.urlretrieve``.

    Parameters
    ----------
    block_number : int
        Count of blocks transferred so far (0 on the first call).
    read_size : int
        Size of a single block, in bytes.
    total_file_size : int
        Total size of the file in bytes.  A negative value is treated as
        "size unknown" (urlretrieve may pass -1 when the server does not
        report a size).

    Notes
    -----
    Output is throttled to every 100th block so the cell output is not
    flooded, but the final block is always reported so the last line shows
    the completed download.
    """
    bytes_read = block_number * read_size
    size_known = total_file_size >= 0
    finished = size_known and bytes_read >= total_file_size

    # only report every 100th block, plus once more when the download completes
    if block_number % 100 != 0 and not finished:
        return

    if size_known:
        # the last block is usually only partially filled, so block_number * read_size
        # can overshoot the real size; clamp it so we never report more than 100%
        bytes_read = min(bytes_read, total_file_size)
        print("\rReading %d / %d complete" % (bytes_read, total_file_size), end="")
    else:
        print("\rReading %d complete (total size unknown)" % bytes_read, end="")

In [4]:
# the UCI datasets have been pre-divided into test and training sets
test_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_test_set.csv'
test_file = './data/aps-failure-test-set.csv'
train_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00421/aps_failure_training_set.csv'
train_file = './data/aps-failure-train-set.csv'

# download the training data csv (comma separated values) file into our data folder
# I picked a relatively large dataset file/example here (45 MB), so this may take a bit of time to
# download on a slow connection.
# always good to check and only download if we don't already have the file, so we can more easily
# rerun all cells without causing a long download to be done every time
if not os.path.exists(train_file):
    # urlretrieve() does not create missing directories, so make sure the data folder exists first
    target_dir = os.path.dirname(train_file)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # download to a temporary name and rename only on success; otherwise an interrupted
    # download would leave a truncated csv behind that the exists() check above would
    # treat as a finished download on the next run
    partial_file = train_file + '.part'
    print('Beginning file download with urllib.request...')
    urllib.request.urlretrieve(train_url, partial_file, reporthook=urlretrieve_reporthook)
    os.replace(partial_file, train_file)
    print()  # terminate the in-place "\r" progress line

In [5]:
# Read the training csv into a pandas dataframe.
#  - skiprows: the file starts with 20 lines of copyright/header text that are not part
#    of the table and have to be skipped.
#  - na_values: the file marks missing data with the string 'na', which pandas does not treat
#    as missing by default.  Declaring it here lets the columns be parsed as numeric types,
#    with NaN standing in for the missing entries.
train = pd.read_csv(
    train_file,
    skiprows=20,
    na_values=['na'],
)

In [6]:
# Report the size of the table: rows are training samples, columns are features.
# Note that the column count covers every column of the frame, including the 'class' label.
num_samples = train.shape[0]
num_features = train.shape[1]
print("Number of features: %d" % num_features)
print("number of training samples: %d" % num_samples)

Number of features: 171
number of training samples: 60000


In [7]:
# If we correctly interpret 'na' as missing data, we get 170 of the 171 columns interpreted as numeric.
# The one remaining column, 'class', holds the string labels and so stays as dtype object.
print(train.dtypes)

# The "count" row of describe() is the number of present (non-missing) values for each feature,
# out of the 60000 samples.  This call is left as the last expression of the cell so that the
# summary table is rendered as the cell's output.
train.describe()

class      object
aa_000      int64
ab_000    float64
ac_000    float64
ad_000    float64
ae_000    float64
af_000    float64
ag_000    float64
ag_001    float64
ag_002    float64
ag_003    float64
ag_004    float64
ag_005    float64
ag_006    float64
ag_007    float64
ag_008    float64
ag_009    float64
ah_000    float64
ai_000    float64
aj_000    float64
ak_000    float64
al_000    float64
am_0      float64
an_000    float64
ao_000    float64
ap_000    float64
aq_000    float64
ar_000    float64
as_000    float64
at_000    float64
           ...   
dl_000    float64
dm_000    float64
dn_000    float64
do_000    float64
dp_000    float64
dq_000    float64
dr_000    float64
ds_000    float64
dt_000    float64
du_000    float64
dv_000    float64
dx_000    float64
dy_000    float64
dz_000    float64
ea_000    float64
eb_000    float64
ec_00     float64
ed_000    float64
ee_000    float64
ee_001    float64
ee_002    float64
ee_003    float64
ee_004    float64
ee_005    float64
ee_006    

Unnamed: 0,aa_000,ab_000,ac_000,ad_000,ae_000,af_000,ag_000,ag_001,ag_002,ag_003,...,ee_002,ee_003,ee_004,ee_005,ee_006,ee_007,ee_008,ee_009,ef_000,eg_000
count,60000.0,13671.0,56665.0,45139.0,57500.0,57500.0,59329.0,59329.0,59329.0,59329.0,...,59329.0,59329.0,59329.0,59329.0,59329.0,59329.0,59329.0,59329.0,57276.0,57277.0
mean,59336.5,0.713189,356014300.0,190620.6,6.81913,11.006817,221.6364,975.7223,8606.015,88591.28,...,445489.7,211126.4,445734.3,393946.2,333058.2,346271.4,138730.0,8388.915,0.090579,0.212756
std,145430.1,3.478962,794874900.0,40404410.0,161.543373,209.792592,20478.46,34200.53,150322.0,761731.2,...,1155540.0,543318.8,1168314.0,1121044.0,1069160.0,1728056.0,449510.0,47470.43,4.368855,8.830641
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,834.0,0.0,16.0,24.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2936.0,1166.0,2700.0,3584.0,512.0,110.0,0.0,0.0,0.0,0.0
50%,30776.0,0.0,152.0,126.0,0.0,0.0,0.0,0.0,0.0,0.0,...,233796.0,112086.0,221518.0,189988.0,92432.0,41098.0,3812.0,0.0,0.0,0.0
75%,48668.0,0.0,964.0,430.0,0.0,0.0,0.0,0.0,0.0,0.0,...,438396.0,218232.0,466614.0,403222.0,275094.0,167814.0,139724.0,2028.0,0.0,0.0
max,2746564.0,204.0,2130707000.0,8584298000.0,21050.0,20070.0,3376892.0,4109372.0,10552860.0,63402070.0,...,77933930.0,37758390.0,97152380.0,57435240.0,31607810.0,119580100.0,19267400.0,3810078.0,482.0,1146.0


In [8]:
# The first column, 'class', is the label/target we would use when building a classifier.
# List its distinct values; they are 'neg' and 'pos'.
np.unique(train.loc[:, 'class'])

array(['neg', 'pos'], dtype=object)

In [9]:
# we would want to remove this column from the training data, and create a y (training labels), that uses 0 for
# the 'neg' class, and 1 for the 'pos' class, for scikit-learn training.
# slice every column from position 1 onward (skipping the 'class' label at position 0);
# these 170 columns are the training features
X_train = train.iloc[:,1:]
print(X_train.shape)

(60000, 170)


In [10]:
# slice column 0 which are the training labels
y_train = train.iloc[:, 0]
print(y_train.shape)
print(y_train.dtype)

# make numeric by translating 'neg' to 0 and 'pos' to 1
# map() builds a brand new Series instead of modifying a slice of `train` in place, so the
# original 'class' column in `train` is left untouched and this cell gives the same result
# every time it is re-run.
mapping = {'neg': 0, 'pos': 1}
y_train = y_train.map(mapping)

# map() turns any label that is missing from `mapping` into NaN; fail loudly rather than
# silently carrying unmapped labels into training
assert y_train.notna().all(), "found class labels other than 'neg'/'pos'"
print(y_train)

(60000,)
object
0        0
1        0
2        0
3        0
4        0
5        0
6        0
7        0
8        0
9        1
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       0
20       0
21       0
22       0
23       1
24       0
25       0
26       0
27       0
28       0
29       0
        ..
59970    0
59971    0
59972    0
59973    0
59974    0
59975    0
59976    0
59977    0
59978    0
59979    0
59980    0
59981    0
59982    0
59983    0
59984    0
59985    0
59986    0
59987    0
59988    0
59989    0
59990    0
59991    0
59992    0
59993    0
59994    0
59995    0
59996    0
59997    0
59998    0
59999    0
Name: class, Length: 60000, dtype: int64
