In [22]:
import pandas
import numpy
import csv

In [23]:
iris_file = 'iris_data.csv'
names = ['sepal_length', 'sepal_width', 'petal_length','petal_width','target']
iris = pandas.read_csv(iris_file, sep=',', decimal='.', header=None,names=names)

In [24]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [25]:
iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target'], dtype='object')

In [26]:
y = iris['target']
y.head()

0    Iris-setosa
1    Iris-setosa
2    Iris-setosa
3    Iris-setosa
4    Iris-setosa
Name: target, dtype: object

In [27]:
x = iris[['sepal_length', 'sepal_width']]
x.head()

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6


In [28]:
print(x.shape)
print(y.shape)

(150, 2)
(150,)


In [29]:
x.mean(axis=0)

sepal_length    5.843333
sepal_width     3.054000
dtype: float64

## Chunking Dataset

In [30]:
iris_chunks = pandas.read_csv(iris_file, sep=',', decimal='.', header=None,names=names, chunksize=10)

In [31]:
for chunk in iris_chunks:
    print('Shape: {}'.format(chunk.shape))
    print(chunk)

Shape: (10, 5)
   sepal_length  sepal_width  petal_length  petal_width       target
0           5.1          3.5           1.4          0.2  Iris-setosa
1           4.9          3.0           1.4          0.2  Iris-setosa
2           4.7          3.2           1.3          0.2  Iris-setosa
3           4.6          3.1           1.5          0.2  Iris-setosa
4           5.0          3.6           1.4          0.2  Iris-setosa
5           5.4          3.9           1.7          0.4  Iris-setosa
6           4.6          3.4           1.4          0.3  Iris-setosa
7           5.0          3.4           1.5          0.2  Iris-setosa
8           4.4          2.9           1.4          0.2  Iris-setosa
9           4.9          3.1           1.5          0.1  Iris-setosa
Shape: (10, 5)
    sepal_length  sepal_width  petal_length  petal_width       target
10           5.4          3.7           1.5          0.2  Iris-setosa
11           4.8          3.4           1.6          0.2  Iris-setosa
1

Shape: (10, 5)
     sepal_length  sepal_width  petal_length  petal_width          target
140           6.7          3.1           5.6          2.4  Iris-virginica
141           6.9          3.1           5.1          2.3  Iris-virginica
142           5.8          2.7           5.1          1.9  Iris-virginica
143           6.8          3.2           5.9          2.3  Iris-virginica
144           6.7          3.3           5.7          2.5  Iris-virginica
145           6.7          3.0           5.2          2.3  Iris-virginica
146           6.3          2.5           5.0          1.9  Iris-virginica
147           6.5          3.0           5.2          2.0  Iris-virginica
148           6.2          3.4           5.4          2.3  Iris-virginica
149           5.9          3.0           5.1          1.8  Iris-virginica


## Playing with CSV Library

In [32]:
with open(iris_file, 'rt') as data_stream:
    for n, row in enumerate(csv.DictReader(data_stream, fieldnames=names, dialect='excel')):
        if n==0:
            print(n, row)
        else:
            break


0 {'petal_width': '0.2', 'sepal_width': '3.5', 'sepal_length': '5.1', 'petal_length': '1.4', 'target': 'Iris-setosa'}


In [33]:
with open(iris_file, 'rt') as data_stream:
    for n, row in enumerate(csv.reader(data_stream, dialect='excel')):
        if n==0:
            print(n, row)
        else:
            break

0 ['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']


In [91]:
def batch_read(filename, batch=5):
    with open(filename, 'rt') as data_stream:
        batch_output=list()
        for n, row in enumerate(csv.reader(data_stream, dialect='excel')):
            if n > 0 and n % batch == 0:
                print(batch_output)
                yield(numpy.array(batch_output))
                batch_output = list()
            batch_output.append(row)
        if batch_output and not batch_output[-1]:
            batch_output.pop()
        if batch_output:
            yield(numpy.array(batch_output))

In [92]:
for n, batch in enumerate(batch_read(iris_file, 4)):
    print(n, batch)

[['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'], ['4.9', '3.0', '1.4', '0.2', 'Iris-setosa'], ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa'], ['4.6', '3.1', '1.5', '0.2', 'Iris-setosa']]
0 [['5.1' '3.5' '1.4' '0.2' 'Iris-setosa']
 ['4.9' '3.0' '1.4' '0.2' 'Iris-setosa']
 ['4.7' '3.2' '1.3' '0.2' 'Iris-setosa']
 ['4.6' '3.1' '1.5' '0.2' 'Iris-setosa']]
[['5.0', '3.6', '1.4', '0.2', 'Iris-setosa'], ['5.4', '3.9', '1.7', '0.4', 'Iris-setosa'], ['4.6', '3.4', '1.4', '0.3', 'Iris-setosa'], ['5.0', '3.4', '1.5', '0.2', 'Iris-setosa']]
1 [['5.0' '3.6' '1.4' '0.2' 'Iris-setosa']
 ['5.4' '3.9' '1.7' '0.4' 'Iris-setosa']
 ['4.6' '3.4' '1.4' '0.3' 'Iris-setosa']
 ['5.0' '3.4' '1.5' '0.2' 'Iris-setosa']]
[['4.4', '2.9', '1.4', '0.2', 'Iris-setosa'], ['4.9', '3.1', '1.5', '0.1', 'Iris-setosa'], ['5.4', '3.7', '1.5', '0.2', 'Iris-setosa'], ['4.8', '3.4', '1.6', '0.2', 'Iris-setosa']]
2 [['4.4' '2.9' '1.4' '0.2' 'Iris-setosa']
 ['4.9' '3.1' '1.5' '0.1' 'Iris-setosa']
 ['5.4' '3.7' '1.5' '0.2' 'Iris-setosa'

In [93]:
list(batch_read(iris_file, 10))

[['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'], ['4.9', '3.0', '1.4', '0.2', 'Iris-setosa'], ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa'], ['4.6', '3.1', '1.5', '0.2', 'Iris-setosa'], ['5.0', '3.6', '1.4', '0.2', 'Iris-setosa'], ['5.4', '3.9', '1.7', '0.4', 'Iris-setosa'], ['4.6', '3.4', '1.4', '0.3', 'Iris-setosa'], ['5.0', '3.4', '1.5', '0.2', 'Iris-setosa'], ['4.4', '2.9', '1.4', '0.2', 'Iris-setosa'], ['4.9', '3.1', '1.5', '0.1', 'Iris-setosa']]
[['5.4', '3.7', '1.5', '0.2', 'Iris-setosa'], ['4.8', '3.4', '1.6', '0.2', 'Iris-setosa'], ['4.8', '3.0', '1.4', '0.1', 'Iris-setosa'], ['4.3', '3.0', '1.1', '0.1', 'Iris-setosa'], ['5.8', '4.0', '1.2', '0.2', 'Iris-setosa'], ['5.7', '4.4', '1.5', '0.4', 'Iris-setosa'], ['5.4', '3.9', '1.3', '0.4', 'Iris-setosa'], ['5.1', '3.5', '1.4', '0.3', 'Iris-setosa'], ['5.7', '3.8', '1.7', '0.3', 'Iris-setosa'], ['5.1', '3.8', '1.5', '0.3', 'Iris-setosa']]
[['5.4', '3.4', '1.7', '0.2', 'Iris-setosa'], ['5.1', '3.7', '1.5', '0.4', 'Iris-setosa'], ['4.6',

[array([['5.1', '3.5', '1.4', '0.2', 'Iris-setosa'],
        ['4.9', '3.0', '1.4', '0.2', 'Iris-setosa'],
        ['4.7', '3.2', '1.3', '0.2', 'Iris-setosa'],
        ['4.6', '3.1', '1.5', '0.2', 'Iris-setosa'],
        ['5.0', '3.6', '1.4', '0.2', 'Iris-setosa'],
        ['5.4', '3.9', '1.7', '0.4', 'Iris-setosa'],
        ['4.6', '3.4', '1.4', '0.3', 'Iris-setosa'],
        ['5.0', '3.4', '1.5', '0.2', 'Iris-setosa'],
        ['4.4', '2.9', '1.4', '0.2', 'Iris-setosa'],
        ['4.9', '3.1', '1.5', '0.1', 'Iris-setosa']],
       dtype='<U11'), array([['5.4', '3.7', '1.5', '0.2', 'Iris-setosa'],
        ['4.8', '3.4', '1.6', '0.2', 'Iris-setosa'],
        ['4.8', '3.0', '1.4', '0.1', 'Iris-setosa'],
        ['4.3', '3.0', '1.1', '0.1', 'Iris-setosa'],
        ['5.8', '4.0', '1.2', '0.2', 'Iris-setosa'],
        ['5.7', '4.4', '1.5', '0.4', 'Iris-setosa'],
        ['5.4', '3.9', '1.3', '0.4', 'Iris-setosa'],
        ['5.1', '3.5', '1.4', '0.3', 'Iris-setosa'],
        ['5.7', '3.8', '

## Combine datasets

In [94]:
data_dict = {'Col1': range(5), 'Col2':[1.0]*5, 'Col3': 1.0, 'Col4': 'Hello World!'}
data_dict

{'Col1': range(0, 5),
 'Col2': [1.0, 1.0, 1.0, 1.0, 1.0],
 'Col3': 1.0,
 'Col4': 'Hello World!'}

In [95]:
my_dataset = pandas.DataFrame(data_dict)
my_dataset

Unnamed: 0,Col1,Col2,Col3,Col4
0,0,1.0,1.0,Hello World!
1,1,1.0,1.0,Hello World!
2,2,1.0,1.0,Hello World!
3,3,1.0,1.0,Hello World!
4,4,1.0,1.0,Hello World!


In [96]:
my_dataset.dtypes

Col1      int64
Col2    float64
Col3    float64
Col4     object
dtype: object

In [98]:
my_dataset.Col1 = my_dataset.Col1.astype(float)

In [99]:
my_dataset.dtypes

Col1    float64
Col2    float64
Col3    float64
Col4     object
dtype: object

In [102]:
try:
    my_dataset.Col4 = my_dataset.Col4.astype(float)
except ValueError as e:
    print('Unable to convert field: {}'.format(e))

Unable to convert field: could not convert string to float: 'Hello World!'
