# Querying Series

In [1]:
# import pandas
import pandas as pd

In [5]:
# map each city name to its temperature
temperatures = dict([
    ('Athens', 45.4),
    ('Volos', 33.3),
    ('Patra', 22.0),
    ('Ioannina', -5.3),
])

# build a Series out of the mapping (the dictionary keys become the index labels)
pd.Series(data=temperatures)

# In the output, the left column is the index and the right column holds the values

Athens      45.4
Volos       33.3
Patra       22.0
Ioannina    -5.3
dtype: float64

In [10]:
# loc vs iloc
# Note that neither is a function (i.e. they are not called with parentheses): both are
# attributes of a Series object and are used with square brackets.
# iloc is the position attribute (i.e. used to find data based on integer position)
# loc is the index attribute (i.e. used to find data based on index value)

s = pd.Series(temperatures)

# example 1: Find the temperature of the third city
# Do not forget - positions start at zero
print(s.iloc[2])

# example 2: Find the temperature of 'Athens'
print(s.loc['Athens'])

# Catch: plain [] also performs label lookups, so loc can be omitted for labels.
# Plain [] with an integer position (e.g. s[2]) on a label-indexed Series is
# deprecated since pandas 2.1, so positional access should always go through iloc.
# Example 3: 'Patra' is the label of the third item (position 2)
print(s['Patra'] == s.iloc[2])
print(s['Athens'] == s.loc['Athens'])

22.0
45.4
True
True


In [30]:
# Caveat: plain [] lookups need care when the index labels are themselves integers

# build a Series from a list, labelling the items 1, 2, 3
ls = [1, 2, 45]
s = pd.Series(data=ls, index=[1, 2, 3])
s

# Trying to get the first item by position without the iloc attribute
# raises a KeyError, because no item carries the index label 0
# s[0] # keyerror

# Likewise, the lookup below is not querying for the 4th item - there are only three items in the Series -
# BUT looking for the item whose index label is the number 3 - which in our case is 45.
s[3]


45

# Series broadcasting

In [35]:
import numpy as np

# Build a Series of random integers:
# 100K draws from 1 up to (but not including) 1000
s = pd.Series(np.random.randint(low=1, high=1000, size=100000))

# peek at the first 5 values
print(s.head())

# Double every item in the Series.
# Broadcasting => the scalar operand is applied to all elements at once, without an explicit loop
s = s * 2
print(s.head())

0    519
1    680
2    565
3    274
4    838
dtype: int64
0    1038
1    1360
2    1130
3     548
4    1676
dtype: int64


# Time it

In [54]:
%%timeit -n 100
# Calculate the average of all items and time how long the aggregation takes

# Case 1: Looping over the Series the traditional way
# Remember that we had doubled the value of each element in Series s
total_sum = 0
# Series.iteritems() was removed in pandas 2.0; Series.items() yields the same (label, value) pairs
for label, value in s.items():
    # Add the current value to the running total
    total_sum += value

# Divide by the number of items so this cell really produces the average it is timing
average = total_sum / len(s)

9.77 ms ± 219 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [57]:
%%timeit -n 100

# Calculate the average of all items and time how long the aggregation takes.
# Same result as the looping approach, but the sum of all items in the Series
# comes from the built-in Series.sum() rather than a Python-level loop

n_items = len(s)
total_sum = s.sum() / n_items

457 µs ± 119 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


# Appending a Series

In [62]:
# Let's look at how appending to a Series works

# start with a Series of 5 random integers from 1 to 9 (the upper bound 10 is exclusive)
s = pd.Series(np.random.randint(low=1, high=10, size=5))
s

0    9
1    4
2    3
3    1
4    9
dtype: int64

In [65]:
# a second Series holding 3 strings that all share the same index label
s2 = pd.Series(data=['John', 'James', 'Jack'], index=[5] * 3)
s2

5     John
5    James
5     Jack
dtype: object

In [67]:
# Append the last Series to the first one
# See that:
# a. A Series object may have items with the same index
# b. A Series object as we said is like a list, i.e. may contain items of different types
# c. Appending to the original Series does not change it, but creates a new Series object
# Note: Series.append() was removed in pandas 2.0; pd.concat() is the supported
# way to join Series and likewise returns a new object.
s3 = pd.concat([s, s2])
s3

0        9
1        4
2        3
3        1
4        9
5     John
5    James
5     Jack
dtype: object