## Here I´m documenting some exercises using the book _Python DataScience HandBook_, I will start from the second chapter.

# **Chapter 2: Introduction to NumPy**

In [1]:
import numpy as np

In [None]:
x = 1000

In [None]:
# Creating arrays from python lists:

np.array([1,2,3,4,5,6])

np.array([1,2.3,4,5,6])

np.array(['a',2,3])

# All values inside a numpy array have the same type, if not it will try to upcats them

array(['a', '2', '3'], dtype='<U21')

In [None]:
np.full((3,5), r.randint(1,9))

array([[2, 2, 2, 2, 2],
       [2, 2, 2, 2, 2],
       [2, 2, 2, 2, 2]])

In [None]:
np.zeros((2,3), dtype = int)

array([[0, 0, 0],
       [0, 0, 0]])

In [None]:
np.arange(0,10,2)

array([0, 2, 4, 6, 8])

In [None]:
x = np.arange(0,10,2.5)
x

array([0. , 2.5, 5. , 7.5])

In [None]:
x[::2]

array([0., 5.])

In [None]:
%timeit 3*3

5.31 ns ± 0.412 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)


In [None]:
%run "/content/drive/MyDrive/test_script.py"

1 squared is 1
2 squared is 4
3 squared is 9


In [None]:
!ls /content/drive/MyDrive/Datasets

'AdventureWorks Sales.gsheet'	 FY18_4050_FMRs.csv
 bike_share.csv			 insurance_premiums.csv
 college_datav3.csv		 schoolimprovement2010grants.csv
 daily_show_guests_cleaned.csv


In [None]:
# Reshaping Arrays

grid = np.arange(1,10).reshape((3,3))
grid


array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [None]:
x = np.array([1,2,3])
x.reshape((1,3))

array([[1, 2, 3]])

In [None]:
horizontal = x[np.newaxis,:]
vertical = x[:,np.newaxis]

print(horizontal, "\n",vertical)

[[1 2 3]] 
 [[1]
 [2]
 [3]]


In [None]:
# Concatenation

y = np.arange(1,4)

z = np.concatenate([x,y])

z

array([1, 2, 3, 1, 2, 3])

### NumPy Universal Functions

Computation on NumPy arrays can be very fast, or it can be very slow. The key to
making it fast is to use vectorized operations, generally implemented through Num‐
Py’s universal functions (ufuncs)

In [None]:
# To have fastest results, we applied vectorized ufuncs, it´s applied to each entry

x= np.arange(4)

print(x + 5)
print(x - 5)
print(x * 2)
print(x / 2)
print(x // 2)
print(-x)
print(x ** 2)
print(x % 2)

[5 6 7 8]
[-5 -4 -3 -2]
[0 2 4 6]
[0.  0.5 1.  1.5]
[0 0 1 1]
[ 0 -1 -2 -3]
[0 1 4 9]
[0 1 0 1]


In [None]:
big_array = np.random.randint(1,1000, size = 1000_000)
big_array

array([262, 332, 460, ..., 126, 643, 940], shape=(1000000,), dtype=int32)

In [None]:
%timeit (1.0 / big_array)

2.46 ms ± 165 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
1.0 / big_array

array([0.00381679, 0.00301205, 0.00217391, ..., 0.00793651, 0.00155521,
       0.00106383], shape=(1000000,))

In [None]:
second_big_array = np.random.randint(1,200, size = 1000_000)
second_big_array

array([ 32,  91, 181, ..., 150, 119,  34], shape=(1000000,), dtype=int32)

In [None]:
%timeit (second_big_array / big_array)

2.9 ms ± 4.06 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [None]:
%timeit np.divide(second_big_array,big_array)

3.26 ms ± 20.7 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# **Chapter 3: Data Manipulation with Pandas**

In [4]:
import pandas as pd
import numpy as np

In [5]:
pd.__version__

'2.2.2'

## Introducing Pandas Objects

1. The Pandas Series Object

In [None]:
data = pd.Series([0.25,0.5,0.75,1.0])
data

Unnamed: 0,0
0,0.25
1,0.5
2,0.75
3,1.0


In [None]:
np_arr = np.array([0.25,0.5,0.75,1.0])
np_arr

array([0.25, 0.5 , 0.75, 1.  ])

In [None]:
data.values

array([0.25, 0.5 , 0.75, 1.  ])

In [None]:
data.index

RangeIndex(start=0, stop=4, step=1)

In [None]:
print(data[0])
print(data[:3])

0.25
0    0.25
1    0.50
2    0.75
dtype: float64


In [None]:
# Using alternative indexing

data = pd.Series(
    [0.25,0.5,0.75,1.0],
    index = ['a','b','c','d']
)
data

Unnamed: 0,0
a,0.25
b,0.5
c,0.75
d,1.0


In [None]:
data['b']

np.float64(0.5)

In [None]:
# Series as specialized dictionary

population = {
    'California': 38332521,
    'Texas': 26448193,
    'New York': 19651127,
    'Florida': 195528
}
pop_series = pd.Series(population)
pop_series

Unnamed: 0,0
California,38332521
Texas,26448193
New York,19651127
Florida,195528


In [None]:
for i in pop_series:
    print(i)

38332521
26448193
19651127
195528


In [None]:
pop_series['California':'New York']

Unnamed: 0,0
California,38332521
Texas,26448193
New York,19651127


In [None]:
# Construction Series Objects

pd.Series(5, index = [100,200,300])

Unnamed: 0,0
100,5
200,5
300,5


In [None]:
pd.Series({2:'a', 1:'b', 3:'c'}, index=[3, 2])

Unnamed: 0,0
3,c
2,a


2. The Pandas DataFrame Object

In [None]:
area_dict = {
    'California': 423967,
    'Texas': 695662,
    'New York': 141297,
    'Florida': 170312,
}

area = pd.Series(area_dict)
area

Unnamed: 0,0
California,423967
Texas,695662
New York,141297
Florida,170312


In [None]:
states = pd.DataFrame({
    'population': pop_series,
    'area': area
})
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,195528,170312


In [None]:
states.index

Index(['California', 'Texas', 'New York', 'Florida'], dtype='object')

In [None]:
df = states.reset_index().rename(columns={'index':'state'}).copy()
df

Unnamed: 0,state,population,area
0,California,38332521,423967
1,Texas,26448193,695662
2,New York,19651127,141297
3,Florida,195528,170312


In [None]:
states.columns

Index(['population', 'area'], dtype='object')

In [None]:
# Constructing DataFrame objects

# From a single Series object

pd.DataFrame(
    data = pop_series,
    columns = ['population']
)

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,195528


In [None]:
# From a list of dicts

data = [{'a':i+1, 'b': 2 * (i+1)} for i in range(3)]

pd.DataFrame(data)

Unnamed: 0,a,b
0,1,2
1,2,4
2,3,6


In [None]:
# From a two-dimensional NumPy array

pd.DataFrame(
    data = np.random.randint(10, size = (3,2)),
    columns = ['foo', 'bar'],
    index = ['a','b','c']
)

Unnamed: 0,foo,bar
a,0,9
b,3,6
c,9,2


In [None]:
# From a NumPy structured array

A = np.zeros(
    3,
    dtype = [('A', 'i8'), ('B', 'f8')]
)
A

array([(0, 0.), (0, 0.), (0, 0.)], dtype=[('A', '<i8'), ('B', '<f8')])

In [None]:
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


3. The Pandas Index Object

In [6]:
ind = pd.Index([2,3,5,7,11])
ind

Index([2, 3, 5, 7, 11], dtype='int64')

In [None]:
# Index as immutable array

ind[1]

np.int64(3)

In [None]:
ind[::2]

Index([2, 5, 11], dtype='int64')

In [7]:
# Different from ap arrays, pd Indexes are not mutable, will prove it with the next example

ind[1] = 56

TypeError: Index does not support mutable operations

In [None]:
# Index as ordered set

indA = pd.Index([1,3,5,7,9])
indB = pd.Index([2,3,5,7,11])

In [None]:
indA.intersection(indB) # intersection

Index([3, 5, 7], dtype='int64')

In [None]:
indA.union(indB)  # union

Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

In [None]:
indA.symmetric_difference(indB) # symmetric difference

Index([1, 2, 9, 11], dtype='int64')

## Data Indexing and Selection

1. Data Selection in Series

In [None]:
data = pd.Series(
    [0.25,0.5,0.75,1],
    index = ['a','b','c','d']
)
data

Unnamed: 0,0
a,0.25
b,0.5
c,0.75
d,1.0


In [None]:
'a' in data

True

In [None]:
# Series as one-dimensional array

# slicing by explicit index
data['a':'c']

Unnamed: 0,0
a,0.25
b,0.5
c,0.75


In [None]:
# slicing by implicit integer index
data[0:2]

Unnamed: 0,0
a,0.25
b,0.5


In [None]:
# masking
data[(data > 0.3) & (data < 0.8)]

Unnamed: 0,0
b,0.5
c,0.75


In [None]:
# fancy indexing
data[['a', 'd']]

Unnamed: 0,0
a,0.25
d,1.0


Indexers: loc, iloc, and ix

These slicing and indexing conventions can be a source of confusion. For example, if
your Series has an explicit integer index, an indexing operation such as data[1] will
use the explicit indices, while a slicing operation like data[1:3] will use the implicit
Python-style index.

In [None]:
data = pd.Series(
    ['a','b','c'],
    index = [1,3,5]
)
data

Unnamed: 0,0
1,a
3,b
5,c


In [None]:
data[1]

'a'

In [None]:
data[1:3]

Unnamed: 0,0
3,b
5,c


In [None]:
# the loc attribute allows indexing and slicing that always references the explicit index
data.loc[1]

'a'

In [None]:
data.loc[1:3]

Unnamed: 0,0
1,a
3,b


In [None]:
data.loc[1:5]

Unnamed: 0,0
1,a
3,b
5,c


In [None]:
# The iloc attribute allows indexing and slicing that always references the implicit Python-style index:

data.iloc[1]

'b'

In [None]:
data.iloc[1:3]

Unnamed: 0,0
3,b
5,c


2. Data Selection in DataFrame

In [None]:
area = pd.Series({'California': 423967, 'Texas': 695662,
'New York': 141297, 'Florida': 170312,
'Illinois': 149995})

pop = pd.Series({'California': 38332521, 'Texas': 26448193,
'New York': 19651127, 'Florida': 19552860,
'Illinois': 12882135})

data = pd.DataFrame({
    'area':area,
    'pop':pop
})

data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [None]:
data['area']

Unnamed: 0,area
California,423967
Texas,695662
New York,141297
Florida,170312
Illinois,149995


In [None]:
data.area

Unnamed: 0,area
California,423967
Texas,695662
New York,141297
Florida,170312
Illinois,149995


In [None]:
data.index

Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

In [None]:
data.area is data['area']

True

Though this is a useful shorthand, keep in mind that it does not work for all cases!
For example, if the column names are not strings, or if the column names conflict
with methods of the DataFrame, this attribute-style access is not possible. For example,
the DataFrame has a pop() method, so data.pop will point to this rather than the
"pop" column:

In [None]:
data.pop

In [None]:
data.pop is data['pop']

False

3. DataFrame as two-dimensional array

In [None]:
data.values

array([[  423967, 38332521],
       [  695662, 26448193],
       [  141297, 19651127],
       [  170312, 19552860],
       [  149995, 12882135]])

In [None]:
# With this picture in mind, we can do many familiar array-like observations on the DataFrame itself. For example, we can transpose the full DataFrame to swap rows and columns:

T = data.T
T

Unnamed: 0,California,Texas,New York,Florida,Illinois
area,423967,695662,141297,170312,149995
pop,38332521,26448193,19651127,19552860,12882135


In [None]:
data.iloc[:3,:1]

Unnamed: 0,area
California,423967
Texas,695662
New York,141297


In [None]:
data.loc[:'Illinois', :'pop']

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [None]:
data.ix[:3,:'pop'] # Deprecated

AttributeError: 'DataFrame' object has no attribute 'ix'

In [None]:
data.loc[data.area > 150000, ['pop']]

Unnamed: 0,pop
California,38332521
Texas,26448193
Florida,19552860


4. Additional indexing conventions

There are a couple extra indexing conventions that might seem at odds with the preceding
discussion, but nevertheless can be very useful in practice. First, while indexing
refers to columns, slicing refers to rows:

In [None]:
data['Florida':'Illinois']

Unnamed: 0,area,pop
Florida,170312,19552860
Illinois,149995,12882135


In [None]:
# Such slices can also refer to rows by number rather than by index:
data[1:3]

Unnamed: 0,area,pop
Texas,695662,26448193
New York,141297,19651127


In [None]:
# Similarly, direct masking operations are also interpreted row-wise rather than column-wise:
data[data.area > 150000]

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
Florida,170312,19552860
