In [1]:
# Descriptive analysis of handwritten ZIP digits 
# I will be using a dataset provided with “The Elements of Statistical Learning: 
# Data Mining, Inference, and Prediction” by Trevor Hastie, Robert Tibshirani & Jerome Friedman 
# 
# https://web.stanford.edu/~hastie/ElemStatLearn/datasets/zip.info.txt
# https://web.stanford.edu/~hastie/ElemStatLearn/datasets/zip.test.gz
# https://web.stanford.edu/~hastie/ElemStatLearn/datasets/zip.train.gz
#

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the training file into a pandas DataFrame.
# header=None: the file has no header line, so the first line is kept as data
# and the columns are labelled with integers.
# sep=" ": fields are split on a single space character.
# see https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html
data = pd.read_csv(
    "data/zip.train",
    sep=" ",
    header=None,
)

In [4]:
# Peek at the top ten rows to see how the file was parsed
data.head(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,248,249,250,251,252,253,254,255,256,257
0,6.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.631,0.862,...,0.823,1.0,0.482,-0.474,-0.991,-1.0,-1.0,-1.0,-1.0,
1,5.0,-1.0,-1.0,-1.0,-0.813,-0.671,-0.809,-0.887,-0.671,-0.853,...,-0.671,-0.033,0.761,0.762,0.126,-0.095,-0.671,-0.828,-1.0,
2,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-0.109,1.0,-0.179,-1.0,-1.0,-1.0,-1.0,
3,7.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.273,0.684,0.96,0.45,...,1.0,0.536,-0.987,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
4,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.928,-0.204,0.751,0.466,...,0.639,1.0,1.0,0.791,0.439,-0.199,-0.883,-1.0,-1.0,
5,6.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.397,0.983,-0.535,-1.0,...,0.015,-0.862,-0.871,-0.437,-1.0,-1.0,-1.0,-1.0,-1.0,
6,3.0,-1.0,-1.0,-1.0,-0.83,0.442,1.0,1.0,0.479,-0.328,...,0.671,0.345,-0.507,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
7,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.51,-0.213,...,0.666,0.301,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,
8,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.454,0.879,-0.745,-1.0,...,1.0,1.0,0.506,-0.174,-0.811,-1.0,-1.0,-1.0,-1.0,
9,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.909,0.801,...,-0.332,0.735,0.38,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,


In [5]:
# Column 0 holds the digit (integer label) that is represented by the following 256 gray-scale values 
# Column 257 is NaN because every line ends with a trailing space, which sep=" " reads as an empty final field

In [6]:
# Lose the last column (257), which the preview above shows as empty.
# drop() returns a new frame instead of mutating in place, and errors="ignore"
# turns a missing label into a no-op, so re-running this cell no longer raises
# KeyError the way a second `del data[257]` would.
data = data.drop(labels=257, axis="columns", errors="ignore")

In [7]:
# Look at the first ten rows again now that column 257 has been removed
data.head(n=10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,247,248,249,250,251,252,253,254,255,256
0,6.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.631,0.862,...,0.304,0.823,1.0,0.482,-0.474,-0.991,-1.0,-1.0,-1.0,-1.0
1,5.0,-1.0,-1.0,-1.0,-0.813,-0.671,-0.809,-0.887,-0.671,-0.853,...,-0.671,-0.671,-0.033,0.761,0.762,0.126,-0.095,-0.671,-0.828,-1.0
2,4.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-0.109,1.0,-0.179,-1.0,-1.0,-1.0,-1.0
3,7.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.273,0.684,0.96,0.45,...,-0.318,1.0,0.536,-0.987,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,3.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.928,-0.204,0.751,0.466,...,0.466,0.639,1.0,1.0,0.791,0.439,-0.199,-0.883,-1.0,-1.0
5,6.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.397,0.983,-0.535,-1.0,...,0.803,0.015,-0.862,-0.871,-0.437,-1.0,-1.0,-1.0,-1.0,-1.0
6,3.0,-1.0,-1.0,-1.0,-0.83,0.442,1.0,1.0,0.479,-0.328,...,1.0,0.671,0.345,-0.507,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
7,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.51,-0.213,...,-0.654,0.666,0.301,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
8,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.454,0.879,-0.745,-1.0,...,1.0,1.0,1.0,0.506,-0.174,-0.811,-1.0,-1.0,-1.0,-1.0
9,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-0.909,0.801,...,-1.0,-0.332,0.735,0.38,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [21]:
# Convert the digit labels in column 0 from floats to integers.
# data[0] selects the label column by its name. The earlier data[0, :] passed a
# tuple containing a slice as a column key, which is what raised
# "TypeError: unhashable type: 'slice'"; DataFrame.apply also expects a
# function rather than a Series.
# pd.to_numeric returns a new Series, so the result is assigned back to the column.
data[0] = pd.to_numeric(data[0], downcast="integer")

In [None]:
# Show the first ten rows of the frame in its current state
data.head(n=10)

In [None]:
# Take the first line of data as a sample: its pixel values without the label
# stored in column 0.
# iloc slices exclude the end position, so 1:257 is required to include the
# last pixel column (256); the previous 1:256 stopped at column 255 and
# returned only 255 of the 256 values.
sample = data.iloc[0, 1:257]

In [None]:
# Display the sample: the pixel values sliced from the first row of data
sample

In [None]:
# Keep the first record as a one-row frame containing only the pixel
# columns 1..256 (the label in column 0 is left out)
images = pd.DataFrame(data=data.iloc[:1, 1:257])

In [None]:
# Check the dimensions of the slice; one row by 256 columns is expected,
# since images was cut as rows 0:1 and columns 1:257 of data
images.shape

In [None]:
# Fold the flat pixel values of the single record into a 16 x 16 grid
image_reshaped = images.values.reshape(16, 16)

In [None]:
# Render the 16 x 16 grid as an image: jet colour map, bilinear interpolation
plt.imshow(
    image_reshaped,
    interpolation='bilinear',
    cmap='jet',
)
plt.show()

In [None]:
# Rows whose label in column 0 equals 6
data.loc[data[0] == 6.0]

In [None]:
# Collect every record labelled 6 into its own frame
digit_6 = pd.DataFrame(data=data.loc[data[0] == 6.0])

In [None]:
# Average each pixel column (1..256) over all records labelled 6; the
# resulting Series of per-pixel means is wrapped in a single-column frame
mean_image = pd.DataFrame(data=digit_6.iloc[:, 1:257].mean(axis=0))

In [None]:
# Fold the per-pixel means into a 16 x 16 grid (this rebinds image_reshaped)
image_reshaped = mean_image.values.reshape(16, 16)

In [None]:
# Render the mean image of the digit 6: jet colour map, lanczos interpolation
plt.imshow(
    image_reshaped,
    interpolation='lanczos',
    cmap='jet',
)
plt.show()