# Data Munging

## Data loading and preprocessing with pandas

### Fast and easy data loading

In [4]:
import pandas as pd

# Path of the local Iris CSV; later cells reuse this name.
iris_filename = 'datasets-uci-iris.csv'
# header=None: the first line of the file is data, so column names are given
# explicitly. (',' and '.' are already read_csv's default separator and
# decimal mark, so they are not repeated here.)
iris = pd.read_csv(
    iris_filename,
    header=None,
    names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
           'target'],
)

In [5]:
# If the dataset is not available locally, you can follow these steps to
# download it from the Internet:
try:
    # Python 3 keeps Request/urlopen in urllib.request
    import urllib.request as urllib2
except ImportError:
    # Python 2 fallback
    import urllib2
url = "http://aima.cs.berkeley.edu/data/iris.csv"
set1 = urllib2.Request(url)
iris_p = urllib2.urlopen(set1)
# read_csv is handed the open HTTP response in place of a file name
iris_other = pd.read_csv(iris_p, sep=',', decimal='.',
header=None, names= ['sepal_length', 'sepal_width',
'petal_length', 'petal_width', 'target'])
iris_other.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [6]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [7]:
iris.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [8]:
iris.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa


In [9]:
iris.columns

Index([u'sepal_length', u'sepal_width', u'petal_length', u'petal_width',
       u'target'],
      dtype='object')

In [10]:
Y = iris['target']
Y

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
5         Iris-setosa
6         Iris-setosa
7         Iris-setosa
8         Iris-setosa
9         Iris-setosa
10        Iris-setosa
11        Iris-setosa
12        Iris-setosa
13        Iris-setosa
14        Iris-setosa
15        Iris-setosa
16        Iris-setosa
17        Iris-setosa
18        Iris-setosa
19        Iris-setosa
20        Iris-setosa
21        Iris-setosa
22        Iris-setosa
23        Iris-setosa
24        Iris-setosa
25        Iris-setosa
26        Iris-setosa
27        Iris-setosa
28        Iris-setosa
29        Iris-setosa
            ...      
120    Iris-virginica
121    Iris-virginica
122    Iris-virginica
123    Iris-virginica
124    Iris-virginica
125    Iris-virginica
126    Iris-virginica
127    Iris-virginica
128    Iris-virginica
129    Iris-virginica
130    Iris-virginica
131    Iris-virginica
132    Iris-virginica
133    Iris-virginica
134    Iri

In [11]:
X = iris[['sepal_length', 'sepal_width']]
X

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
5,5.4,3.9
6,4.6,3.4
7,5.0,3.4
8,4.4,2.9
9,4.9,3.1


In [12]:
X.shape

(150, 2)

In [13]:
Y.shape

(150,)

### Dealing with problematic data

In [14]:
import pandas as pd
fake_dataset = pd.read_csv('a_loading_example_1.csv', sep=',')
fake_dataset

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,20140910,80.0,32.0,40,1
1,20140911,100.0,50.0,36,2
2,20140912,102.0,55.0,46,1
3,20140913,60.0,20.0,35,3
4,20140914,60.0,,32,3
5,20140915,,57.0,42,2


In [15]:
fake_dataset = pd.read_csv('a_loading_example_1.csv',
parse_dates=[0])
fake_dataset

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,2014-09-10,80.0,32.0,40,1
1,2014-09-11,100.0,50.0,36,2
2,2014-09-12,102.0,55.0,46,1
3,2014-09-13,60.0,20.0,35,3
4,2014-09-14,60.0,,32,3
5,2014-09-15,,57.0,42,2


In [16]:
fake_dataset.fillna(50)

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,2014-09-10,80.0,32.0,40,1
1,2014-09-11,100.0,50.0,36,2
2,2014-09-12,102.0,55.0,46,1
3,2014-09-13,60.0,20.0,35,3
4,2014-09-14,60.0,50.0,32,3
5,2014-09-15,50.0,57.0,42,2


In [17]:
fake_dataset.fillna(-1)

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,2014-09-10,80.0,32.0,40,1
1,2014-09-11,100.0,50.0,36,2
2,2014-09-12,102.0,55.0,46,1
3,2014-09-13,60.0,20.0,35,3
4,2014-09-14,60.0,-1.0,32,3
5,2014-09-15,-1.0,57.0,42,2


In [18]:
# Replace each missing value with the mean of its own column.
# numeric_only=True keeps the parsed 'Date' column out of the averaging, so
# the result does not depend on how the installed pandas treats datetime
# columns in DataFrame.mean().
fake_dataset.fillna(fake_dataset.mean(axis=0, numeric_only=True))

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,2014-09-10,80.0,32.0,40,1
1,2014-09-11,100.0,50.0,36,2
2,2014-09-12,102.0,55.0,46,1
3,2014-09-13,60.0,20.0,35,3
4,2014-09-14,60.0,42.8,32,3
5,2014-09-15,80.4,57.0,42,2


In [19]:
# error_bad_lines=False makes the parser skip malformed rows (reporting each
# one, as in the message below) instead of raising an error.
# NOTE(review): pandas 1.3 deprecated this flag and pandas 2.0 removed it;
# on those versions the equivalent argument is on_bad_lines='skip'
# (or 'warn' to keep the report).
bad_dataset = pd.read_csv('a_loading_example_2.csv',
error_bad_lines=False)

Skipping line 4: expected 3 fields, saw 4



### Dealing with big datasets

In [20]:
import pandas as pd

# With chunksize set, read_csv returns an iterator of DataFrames holding up
# to 10 rows each instead of loading the whole file at once.
iris_chunks = pd.read_csv(iris_filename, header=None,
                          names=['C1', 'C2', 'C3', 'C4', 'C5'], chunksize=10)
for chunk in iris_chunks:
    # Single-argument print(...) gives the same output on Python 2 and 3;
    # the bare print statement is a SyntaxError on Python 3.
    print(chunk.shape)
    print(chunk)

(10, 5)
    C1   C2   C3   C4           C5
0  5.1  3.5  1.4  0.2  Iris-setosa
1  4.9  3.0  1.4  0.2  Iris-setosa
2  4.7  3.2  1.3  0.2  Iris-setosa
3  4.6  3.1  1.5  0.2  Iris-setosa
4  5.0  3.6  1.4  0.2  Iris-setosa
5  5.4  3.9  1.7  0.4  Iris-setosa
6  4.6  3.4  1.4  0.3  Iris-setosa
7  5.0  3.4  1.5  0.2  Iris-setosa
8  4.4  2.9  1.4  0.2  Iris-setosa
9  4.9  3.1  1.5  0.1  Iris-setosa
(10, 5)
    C1   C2   C3   C4           C5
0  5.4  3.7  1.5  0.2  Iris-setosa
1  4.8  3.4  1.6  0.2  Iris-setosa
2  4.8  3.0  1.4  0.1  Iris-setosa
3  4.3  3.0  1.1  0.1  Iris-setosa
4  5.8  4.0  1.2  0.2  Iris-setosa
5  5.7  4.4  1.5  0.4  Iris-setosa
6  5.4  3.9  1.3  0.4  Iris-setosa
7  5.1  3.5  1.4  0.3  Iris-setosa
8  5.7  3.8  1.7  0.3  Iris-setosa
9  5.1  3.8  1.5  0.3  Iris-setosa
(10, 5)
    C1   C2   C3   C4           C5
0  5.4  3.4  1.7  0.2  Iris-setosa
1  5.1  3.7  1.5  0.4  Iris-setosa
2  4.6  3.6  1.0  0.2  Iris-setosa
3  5.1  3.3  1.7  0.5  Iris-setosa
4  4.8  3.4  1.9  0.2  Iris-set

In [21]:
iris_iterator = pd.read_csv(iris_filename, header=None,
names=['C1', 'C2', 'C3', 'C4', 'C5'], iterator=True)

In [22]:
# Ask the iterator for the next 10 rows; print(...) works on Python 2 and 3.
print(iris_iterator.get_chunk(10).shape)

(10, 5)


In [23]:
# The chunk size can change from call to call: here the next 20 rows.
print(iris_iterator.get_chunk(20).shape)

(20, 5)


In [24]:
piece = iris_iterator.get_chunk(2)
piece

Unnamed: 0,C1,C2,C3,C4,C5
0,4.8,3.1,1.6,0.2,Iris-setosa
1,5.4,3.4,1.5,0.4,Iris-setosa


In [25]:
import csv
import sys

# The csv module needs a binary stream on Python 2 but a text stream opened
# with newline='' on Python 3 (where a 'rb' stream makes the reader raise).
if sys.version_info[0] < 3:
    csv_open_kwargs = {'mode': 'rb'}
else:
    csv_open_kwargs = {'mode': 'r', 'newline': ''}

with open(iris_filename, **csv_open_kwargs) as data_stream:
    # DictReader maps every row onto the supplied field names
    for n, row in enumerate(csv.DictReader(
            data_stream,
            fieldnames=['sepal_length', 'sepal_width',
                        'petal_length', 'petal_width', 'target'],
            dialect='excel')):
        if n == 0:
            # Same text as the old `print n,row`, valid on Python 2 and 3
            print("%s %s" % (n, row))
        else:
            # Only the first row is shown
            break

0 {'sepal_width': '3.5', 'petal_width': '0.2', 'target': 'Iris-setosa', 'sepal_length': '5.1', 'petal_length': '1.4'}


In [26]:
import sys

# The csv module needs a binary stream on Python 2 but a text stream opened
# with newline='' on Python 3 (where a 'rb' stream makes the reader raise).
if sys.version_info[0] < 3:
    csv_open_kwargs = {'mode': 'rb'}
else:
    csv_open_kwargs = {'mode': 'r', 'newline': ''}

with open(iris_filename, **csv_open_kwargs) as data_stream:
    # csv.reader returns every row as a plain list of strings
    for n, row in enumerate(csv.reader(data_stream, dialect='excel')):
        if n == 0:
            print(row)
        else:
            # Only the first row is shown
            break

['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']


In [27]:
def batch_read(filename, batch=5):
    """Lazily read a CSV file, yielding its rows in batches.

    Parameters
    ----------
    filename : str
        Path of the CSV file; it is parsed with the csv 'excel' dialect.
    batch : int, optional
        Number of rows collected before a batch is yielded (default 5).

    Yields
    ------
    numpy.ndarray
        The rows of one batch. Fields are left as the strings produced by
        csv.reader. The final batch holds whatever rows remain and may be
        shorter than `batch`. Nothing is yielded for an empty file.
    """
    import sys  # local import: only needed to choose the open() mode

    # csv.reader needs a binary stream on Python 2 but a text stream opened
    # with newline='' on Python 3 (where a 'rb' stream makes it raise).
    if sys.version_info[0] < 3:
        open_kwargs = {'mode': 'rb'}
    else:
        open_kwargs = {'mode': 'r', 'newline': ''}

    # open the data stream
    with open(filename, **open_kwargs) as data_stream:
        # reset the batch
        batch_output = list()
        # iterate over the file
        for n, row in enumerate(csv.reader(data_stream, dialect='excel')):
            # a full batch has been collected once n reaches a multiple of
            # the batch size (n == 0 is skipped: nothing is collected yet)
            if n > 0 and n % batch == 0:
                # yield back the batch as an ndarray
                yield np.array(batch_output)
                # reset the batch and restart
                batch_output = list()
            # add the current row to the (possibly fresh) batch
            batch_output.append(row)
        # when the loop is over, yield what's left -- but not an empty
        # array when the file had no rows at all
        if batch_output:
            yield np.array(batch_output)

In [28]:
import numpy as np

# Pull only the first batch of three rows to show what the generator yields
for batch_input in batch_read(iris_filename, batch=3):
    # print(...) with one argument works on Python 2 and 3
    print(batch_input)
    break

[['5.1' '3.5' '1.4' '0.2' 'Iris-setosa']
 ['4.9' '3.0' '1.4' '0.2' 'Iris-setosa']
 ['4.7' '3.2' '1.3' '0.2' 'Iris-setosa']]


### Accessing other data formats

In [29]:
import pandas as pd

# Build a small frame from a dict of columns: sequences supply one value per
# row, while scalars are repeated on every row.
my_own_dataset = pd.DataFrame({
    'Col1': range(5),
    'Col2': [1.0] * 5,
    'Col3': 1.0,
    'Col4': 'Hello World!',
})
my_own_dataset

Unnamed: 0,Col1,Col2,Col3,Col4
0,0,1.0,1.0,Hello World!
1,1,1.0,1.0,Hello World!
2,2,1.0,1.0,Hello World!
3,3,1.0,1.0,Hello World!
4,4,1.0,1.0,Hello World!


In [30]:
# Columns of different lengths (5 values vs. 2 values) cannot be combined, so
# this constructor call is expected to fail. The error is caught and shown so
# that the demonstration does not stop a "Run All" of the notebook.
try:
    my_wrong_own_dataset = pd.DataFrame({'Col1': range(5), 'Col2': 'string',
                                         'Col3': range(2)})
except ValueError as error:
    print("ValueError: %s" % error)

ValueError: arrays must all be same length

In [31]:
my_own_dataset.dtypes

Col1      int64
Col2    float64
Col3    float64
Col4     object
dtype: object

In [32]:
my_own_dataset['Col1'] = my_own_dataset['Col1'].astype(float)
my_own_dataset.dtypes

Col1    float64
Col2    float64
Col3    float64
Col4     object
dtype: object

### Data preprocessing

In [33]:
mask_feature = iris['sepal_length'] > 6.0
mask_feature

0      False
1      False
2      False
3      False
4      False
5      False
6      False
7      False
8      False
9      False
10     False
11     False
12     False
13     False
14     False
15     False
16     False
17     False
18     False
19     False
20     False
21     False
22     False
23     False
24     False
25     False
26     False
27     False
28     False
29     False
       ...  
120     True
121    False
122     True
123     True
124     True
125     True
126     True
127     True
128     True
129     True
130     True
131     True
132     True
133     True
134     True
135     True
136     True
137     True
138    False
139     True
140     True
141     True
142    False
143     True
144     True
145     True
146     True
147     True
148     True
149    False
Name: sepal_length, dtype: bool

In [34]:
mask_target = iris['target'] == 'Iris-virginica'

In [35]:
iris.loc[mask_target, 'target'] = 'New label'

In [36]:
iris['target'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'New label'], dtype=object)

In [37]:
grouped_targets_mean = iris.groupby(['target']).mean()
grouped_targets_mean

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,5.006,3.418,1.464,0.244
Iris-versicolor,5.936,2.77,4.26,1.326
New label,6.588,2.974,5.552,2.026


In [38]:
grouped_targets_var = iris.groupby(['target']).var()
grouped_targets_var

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width
target,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Iris-setosa,0.124249,0.14518,0.030106,0.011494
Iris-versicolor,0.266433,0.098469,0.220816,0.039106
New label,0.404343,0.104004,0.304588,0.075433


In [39]:
# Order the rows by sepal length and show the five smallest.
# sort_values() is the supported call for sorting by a column's values;
# sort_index(by=...) is deprecated for that purpose.
iris.sort_values(by='sepal_length').head()

  if __name__ == '__main__':


Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
13,4.3,3.0,1.1,0.1,Iris-setosa
42,4.4,3.2,1.3,0.2,Iris-setosa
38,4.4,3.0,1.3,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
41,4.5,2.3,1.3,0.3,Iris-setosa


In [40]:
# This is just an example, with no time_series data
# smooth_time_series = pd.rolling_mean(time_series, 5)

In [41]:
# This is just an example, with no time_series data
# median_time_series = pd.rolling_median(time_series, 5)

### Data selection

In [42]:
import pandas as pd
dataset = pd.read_csv('a_selection_example_1.csv')
dataset

Unnamed: 0,n,val1,val2,val3
0,100,10,10,C
1,101,10,20,C
2,102,10,30,B
3,103,10,40,B
4,104,10,50,A


In [43]:
dataset = pd.read_csv('a_selection_example_1.csv', index_col=0)
dataset

Unnamed: 0_level_0,val1,val2,val3
n,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,10,10,C
101,10,20,C
102,10,30,B
103,10,40,B
104,10,50,A


In [44]:
dataset['val3'][104]

'A'

In [45]:
dataset.loc[104, 'val3']

'A'

In [46]:
# .ix used with two labels: row label 104, column label 'val3'.
# NOTE(review): .ix was deprecated in pandas 0.20 and removed in 1.0;
# .loc is the label-based replacement.
dataset.ix[104, 'val3']

'A'

In [47]:
# .ix mixing a row label (104) with a column position (2, i.e. 'val3').
# NOTE(review): .ix was deprecated in pandas 0.20 and removed in 1.0; on newer
# versions write dataset.loc[104, dataset.columns[2]] instead.
dataset.ix[104, 2]

'A'

In [48]:
dataset.iloc[4, 2]

'A'

In [49]:
dataset[['val3', 'val2']][0:2]

Unnamed: 0_level_0,val3,val2
n,Unnamed: 1_level_1,Unnamed: 2_level_1
100,C,10
101,C,20


In [50]:
dataset.loc[range(100, 102), ['val3', 'val2']]

Unnamed: 0_level_0,val3,val2
n,Unnamed: 1_level_1,Unnamed: 2_level_1
100,C,10
101,C,20


In [51]:
# Same selection as the .loc cell before it, written with .ix and labels only.
# NOTE(review): .ix was deprecated in pandas 0.20 and removed in 1.0.
dataset.ix[range(100, 102), ['val3', 'val2']]

Unnamed: 0_level_0,val3,val2
n,Unnamed: 1_level_1,Unnamed: 2_level_1
100,C,10
101,C,20


In [52]:
# .ix with row labels (100, 101) and column positions (2 -> 'val3', 1 -> 'val2').
# NOTE(review): .ix was deprecated in pandas 0.20 and removed in 1.0; use .loc
# (labels) or .iloc (positions) on newer versions.
dataset.ix[range(100, 102), [2, 1]]

Unnamed: 0_level_0,val3,val2
n,Unnamed: 1_level_1,Unnamed: 2_level_1
100,C,10
101,C,20


In [53]:
dataset.iloc[range(2), [2,1]]

Unnamed: 0_level_0,val3,val2
n,Unnamed: 1_level_1,Unnamed: 2_level_1
100,C,10
101,C,20


## Working with categorical and textual data

In [54]:
import pandas as pd
categorical_feature = pd.Series(['sunny', 'cloudy', 'snowy',
'rainy', 'foggy'])
mapping = pd.get_dummies(categorical_feature)
mapping

Unnamed: 0,cloudy,foggy,rainy,snowy,sunny
0,0.0,0.0,0.0,0.0,1.0
1,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0


In [55]:
mapping['sunny']

0    1.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: sunny, dtype: float64

In [56]:
mapping['cloudy']

0    0.0
1    1.0
2    0.0
3    0.0
4    0.0
Name: cloudy, dtype: float64

In [57]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
ohe = OneHotEncoder()
levels = ['sunny', 'cloudy', 'snowy', 'rainy', 'foggy']
# Step 1: map each level name to an integer code
fit_levs = le.fit_transform(levels)
# Step 2: fit the one-hot encoder on those codes, one single-feature sample
# per level
ohe.fit([[fit_levs[0]], [fit_levs[1]], [fit_levs[2]], [fit_levs[3]],
         [fit_levs[4]]])
# print(...) with one argument works on Python 2 and 3
print(ohe.transform([le.transform(['sunny'])]).toarray())
print(ohe.transform([le.transform(['cloudy'])]).toarray())

[[ 0.  0.  0.  0.  1.]]
[[ 1.  0.  0.  0.  0.]]


## A special type of data: text

In [60]:
from sklearn.datasets import fetch_20newsgroups
categories = ['sci.med', 'sci.space']
twenty_sci_news = fetch_20newsgroups(categories=categories)

KeyboardInterrupt: 

In [59]:
twenty_sci_news.data[0]

NameError: name 'twenty_sci_news' is not defined

In [None]:
twenty_sci_news.filenames

In [None]:
# Numeric label of the first post, then the newsgroup name it stands for.
# print(...) with one argument works on Python 2 and 3.
print(twenty_sci_news.target[0])
print(twenty_sci_news.target_names[twenty_sci_news.target[0]])

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
word_count = count_vect.fit_transform(twenty_sci_news.data)
word_count.shape

In [None]:
# Sparse count vector of the first document (print(...) works on Python 2 and 3)
print(word_count[0])

In [None]:
# NOTE(review): scikit-learn 1.2 removed get_feature_names() in favour of
# get_feature_names_out(); adjust if running on a recent version.
word_list = count_vect.get_feature_names()
for n in word_count[0].indices:
    # %-formatting reproduces the old comma-separated print statement's text
    # and is valid on both Python 2 and 3
    print("Word: %s appears %s times" % (word_list[n], word_count[0, n]))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# use_idf=False with L1 normalisation turns raw counts into per-document
# term frequencies
tf_vect = TfidfVectorizer(use_idf=False, norm='l1')
word_freq = tf_vect.fit_transform(twenty_sci_news.data)
word_list = tf_vect.get_feature_names()
for n in word_freq[0].indices:
    # Same text as the old print statement, valid on Python 2 and 3
    print("Word: %s has frequency %s" % (word_list[n], word_freq[0, n]))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()  # Default: use_idf=True
word_tfidf = tfidf_vect.fit_transform(twenty_sci_news.data)
word_list = tfidf_vect.get_feature_names()
for n in word_tfidf[0].indices:
    # Same text as the old print statement, valid on Python 2 and 3
    print("Word: %s has tfidf %s" % (word_list[n], word_tfidf[0, n]))

In [None]:
text_1 = 'we love data science'
text_2 = 'data science is hard'
documents = [text_1, text_2]
documents

In [None]:
# That is what we saw above, the default one
count_vect_1_grams = CountVectorizer(ngram_range=(1, 1),
                                     stop_words=[], min_df=1)
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()
# %-formatting keeps the exact text of the old print statements (including
# the double space after '=') and is valid on both Python 2 and 3
print("Word list =  %s" % (word_list,))
print("text_1 is described with %s" % (
    [word_list[n] + "(" + str(word_count[0, n]) + ")"
     for n in word_count[0].indices],))

In [None]:
# Now a bi-gram count vectorizer
# (the variable keeps its '1_grams' name although it now counts bi-grams)
count_vect_1_grams = CountVectorizer(ngram_range=(2, 2))
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()
# %-formatting keeps the exact text of the old print statements (including
# the double space after '=') and is valid on both Python 2 and 3
print("Word list =  %s" % (word_list,))
print("text_1 is described with %s" % (
    [word_list[n] + "(" + str(word_count[0, n]) + ")"
     for n in word_count[0].indices],))

In [None]:
# Now a uni- and bi-gram count vectorizer
# (the variable keeps its '1_grams' name although it now counts both sizes)
count_vect_1_grams = CountVectorizer(ngram_range=(1, 2))
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()
# %-formatting keeps the exact text of the old print statements (including
# the double space after '=') and is valid on both Python 2 and 3
print("Word list =  %s" % (word_list,))
print("text_1 is described with %s" % (
    [word_list[n] + "(" + str(word_count[0, n]) + ")"
     for n in word_count[0].indices],))

In [None]:
from sklearn.feature_extraction.text import HashingVectorizer
hash_vect = HashingVectorizer(n_features=1000)
word_hashed = hash_vect.fit_transform(twenty_sci_news.data)
word_hashed.shape

## Creating NumPy arrays

### From lists to unidimensional arrays

In [None]:
import numpy as np
# Transform a list into a uni-dimensional array
list_of_ints = [1,2,3]
Array_1 = np.array(list_of_ints)
Array_1

In [None]:
Array_1[1] # let's output the second value

In [None]:
type(Array_1)

In [None]:
Array_1.dtype # Note: the default integer dtype depends on the platform you are running on.

### Controlling the memory size

In [None]:
import numpy as np
Array_1.nbytes # Please note that on 64bit platforms the result will be 24.

In [None]:
Array_1 = np.array(list_of_ints, dtype= 'int8')

In [None]:
Array_1b = Array_1.astype('float32')
Array_1b

### Heterogeneous lists

In [None]:
import numpy as np

# One list mixing ints, floats and strings; NumPy picks a single dtype able
# to hold every element it is given.
complex_list = [1,2,3] + [1.,2.,3.] + ['a','b','c']
Array_2 = np.array(complex_list[:3])  # at first the input list is just ints
# %-formatting reproduces the old print statements' text and is valid on
# both Python 2 and 3
print('complex_list[:3] %s' % Array_2.dtype)
Array_2 = np.array(complex_list[:6])  # then it is ints and floats
print('complex_list[:6] %s' % Array_2.dtype)
Array_2 = np.array(complex_list)  # finally we add strings
print('complex_list[:]  %s' % Array_2.dtype)

In [None]:
# Check whether an element of a NumPy array is of a numeric type
# (print(...) with one argument works on Python 2 and 3)
print(isinstance(Array_2[0], np.number))

### From lists to multidimensional arrays

In [None]:
import numpy as np
# Transform a list into a bidimensional array
a_list_of_lists = [[1,2,3],[4,5,6],[7,8,9]]
Array_2D = np.array(a_list_of_lists )
Array_2D

In [None]:
Array_2D[1,1]

In [None]:
# Transform a list into a multi-dimensional array
a_list_of_lists_of_lists = [[[1,2],[3,4],[5,6]],
[[7,8],[9,10],[11,12]]]
Array_3D = np.array(a_list_of_lists_of_lists)
Array_3D

In [None]:
Array_3D[0,2,0] # Accessing the 5th element

In [None]:
# dict.items() is a list on Python 2 but a view object on Python 3, where
# np.array() would wrap it as a single 0-d object; list() materialises the
# (key, value) pairs so the result is a 3x2 array on both versions.
np.array(list({1:2,3:4,5:6}.items()))

### Resizing arrays

In [None]:
import numpy as np

# Restructuring a NumPy array shape
original_array = np.arange(1, 9)
# reshape hands back views onto original_array's data ...
Array_a = original_array.reshape((4, 2))
Array_c = original_array.reshape((2, 2, 2))
# ... whereas an explicit copy() gets its own, independent data
Array_b = Array_a.copy()
# Writing through the original therefore shows up in the views (Array_a,
# Array_c) but not in the copy (Array_b)
original_array[0] = -1

In [None]:
Array_a

In [None]:
Array_c

In [None]:
Array_b

In [None]:
original_array.resize(4,2)
original_array

In [None]:
original_array.shape = (4,2)

In [None]:
original_array

### Arrays derived from NumPy functions

In [None]:
import numpy as np
ordinal_values = np.arange(9).reshape(3,3)
ordinal_values

In [None]:
np.arange(9)[::-1]

In [None]:
np.random.randint(low=1,high=10,size=(3,3)).reshape(3,3)

In [None]:
np.zeros((3,3))

In [None]:
np.ones((3,3))

In [None]:
np.eye(3)

In [None]:
fractions = np.linspace(start=0, stop=1, num=10)
fractions

In [None]:
growth = np.logspace(start=0, stop=1, num=10, base=10.0)
growth

In [None]:
std_gaussian = np.random.normal(size=(3,3))
std_gaussian

In [None]:
gaussian = np.random.normal(loc=1.0, scale= 3.0, size=(3,3))
gaussian

In [None]:
np.random.uniform(low=0.0, high=1.0, size=(3,3))

### Getting an array directly from a file

In [None]:
import numpy as np
housing = np.loadtxt('regression-datasets-housing.csv',delimiter=',', dtype=float)

In [None]:
np.loadtxt('datasets-uci-iris.csv',delimiter=',',dtype=float)

### Extracting data from pandas

In [None]:
import pandas as pd
import numpy as np
housing_filename = 'regression-datasets-housing.csv'
housing = pd.read_csv(housing_filename, header=None)

In [None]:
housing_array = housing.values
housing_array.dtype

In [None]:
housing.dtypes

## NumPy fast operation and computations

In [None]:
import numpy as np
a = np.arange(5).reshape(1,5)
a += 1
a*a

In [None]:
# A 1x5 row times a 5x1 column: broadcasting stretches both to 5x5, giving
# the multiplication table of 1..5
a = np.arange(1, 6).reshape(1, 5)
b = np.arange(1, 6).reshape(5, 1)
a * b

In [None]:
# Five identical rows 1..5, and the transpose (five identical columns);
# their element-wise product is again the multiplication table of 1..5
a2 = np.array([[1, 2, 3, 4, 5]] * 5)
b2 = a2.transpose()
a2 * b2

In [None]:
# print(...) with one argument works on Python 2 and 3
print(a2)

In [None]:
np.sum(a2, axis=0)

In [None]:
np.sum(a2, axis=1)

In [None]:
%timeit -n 1 -r 3 [i+1.0 for i in range(10**6)]
%timeit -n 1 -r 3 np.arange(10**6)+1.0

In [None]:
import math
%timeit -n 1 -r 3 [math.sqrt(i) for i in range(10**6)]

In [None]:
%timeit -n 1 -r 3 np.sqrt(np.arange(10**6))

### Matrix operations

In [None]:
import numpy as np
M = np.arange(5*5, dtype=float).reshape(5,5)
M

In [None]:
coefs = np.array([1., 0.5, 0.5, 0.5, 0.5])
# Two columns: the coefficients, and the same coefficients in reverse order
coefs_matrix = np.column_stack((coefs, coefs[::-1]))
# print(...) with one argument works on Python 2 and 3
print(coefs_matrix)

In [None]:
np.dot(M,coefs)

In [None]:
np.dot(coefs,M)

In [None]:
np.dot(M,coefs_matrix)

## Slicing and indexing with NumPy arrays

In [None]:
import numpy as np
M = np.arange(10*10, dtype=int).reshape(10,10)

In [None]:
M[2:9:2,:]

In [None]:
M[2:9:2,5:]

In [None]:
M[2:9:2,5::-1]

In [None]:
# In the book the output of this cell is wrong.
# Here is reported the correct output.

row_index = (M[:,0]>=20) & (M[:,0]<=80)
col_index = M[0,:]>=5
M[row_index,:][:,col_index]

In [None]:
mask = (M>=20) & (M<=90) & ((M / 10.) % 1 >= 0.5)
M[mask]

In [None]:
row_index = [1,1,2,7]
col_index = [0,2,4,8]

In [None]:
M[row_index,col_index]

In [None]:
M[row_index,:][:,col_index]

In [None]:
N = M[2:9:2,5:].copy()

### Stacking NumPy arrays

In [None]:
import numpy as np
dataset = np.arange(10*5).reshape(10,5)

In [None]:
single_line = np.arange(1*5).reshape(1,5)
a_few_lines = np.arange(3*5).reshape(3,5)

In [None]:
np.vstack((dataset,single_line))

In [None]:
np.vstack((dataset,a_few_lines))

In [None]:
np.vstack((dataset,single_line,single_line))

In [None]:
bias = np.ones(10).reshape(10,1)
np.hstack((dataset,bias))

In [None]:
bias = np.ones(10)
np.column_stack((dataset,bias))

In [None]:
np.dstack((dataset*1,dataset*2,dataset*3))

In [None]:
np.insert(dataset, 3, bias, axis=1)

In [None]:
np.insert(dataset, 3, dataset.T, axis=1)

In [None]:
np.insert(dataset, 3, np.ones(5), axis=0)