# Series Data Structure

In [2]:
import pandas as pd

In [2]:
# When one creates a series by passing in a list of values
# Pandas automatically assigns an index starting with zero and
# sets the name of the series to None.

students = ['Alice', 'Jack', 'Molly']

pd.Series(students)

0    Alice
1     Jack
2    Molly
dtype: object

In [3]:
numbers = [1,2,3]

pd.Series(numbers)

0    1
1    2
2    3
dtype: int64

In [4]:
# If we create a list of strings and one of the elements is None,
# pandas stores it as None and uses the object dtype for the
# underlying array.

students = ['Alice', 'Jack', None]

pd.Series(students)

0    Alice
1     Jack
2     None
dtype: object

In [3]:
# However, if we create a list of numbers, integers or floats, and put in the None type,
# pandas automatically converts this to a special floating point value designated as NaN,
# which stands for "Not a Number".

numbers = [1, 2, None]

pd.Series(numbers)

0    1.0
1    2.0
2    NaN
dtype: float64

In [6]:
# NaN is *NOT* equivalent to None, and when we try the equality test, the result is False.

import numpy as np

np.nan == None

False

In [7]:
# One actually can't do an equality test of NaN to itself.

np.nan == np.nan

False

In [8]:
# To test the presence of not a number one needs to use special functions

np.isnan(np.nan)

True

In [10]:
# A Series can be created directly from dictionary data. If one does this,
# the index is automatically assigned to the keys of the provided dictionary
# and not just incrementing integers.

students_scores = {'Alice': 'Physics',
                   'Jack': 'Chemistry',
                   'Molly': 'English'}
s = pd.Series(students_scores)
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [11]:
s.index

Index(['Alice', 'Jack', 'Molly'], dtype='object')

In [12]:
# The dtype of "object" is not just for strings, but for arbitrary objects. 

students =[('Alice', 'Brown'), ('Jack', 'White'), ('Molly', 'Green')]
pd.Series(students)

0    (Alice, Brown)
1     (Jack, White)
2    (Molly, Green)
dtype: object

In [14]:
# One can separate the index creation from the data by passing in the index
# as a list explicitly to the series

s = pd.Series(['Physics', 'Chemistry', 'English'], index=['Alice', 'Jack', 'Molly'])
s

Alice      Physics
Jack     Chemistry
Molly      English
dtype: object

In [16]:
# So what happens if the list of values in the index object is not aligned with the keys
# in the dictionary used for creating the series?
# Well, Pandas overrides the automatic creation to favor only and all of the index values
# provided. So it will ignore all dictionary keys that are not in the index, and Pandas
# will add None or NaN values for any index value you provide that is not in your
# dictionary key list.

students_scores = {'Alice': 'Physics',
                   'Jack': 'Chemistry',
                   'Molly': 'English'}
# Creating a series object with only three students and excluding Jack
s = pd.Series(students_scores, index=['Alice', 'Molly', 'Sam'])
s

Alice    Physics
Molly    English
Sam          NaN
dtype: object

In [None]:
# The result is that the Series object doesn't have Jack in it, even though he was in the
# original dataset, but it explicitly does have Sam in it as a missing value.

# Querying Series

In [17]:
students_classes = {'Alice': 'Physics',
                   'Jack': 'Chemistry',
                   'Molly': 'English',
                   'Sam': 'History'}
s = pd.Series(students_classes)
s

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [18]:
# To see the fourth entry we would use the iloc
# attribute with the parameter 3.
s.iloc[3]

'History'

In [19]:
# To see what class Molly has, we would use the loc attribute with a parameter 
# of Molly.
s.loc['Molly']

'English'

In [21]:
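# If an integer parameter is passed and the index is not integer-based, the operator
# has historically behaved as if you wanted to query via the iloc attribute. This
# positional fallback is deprecated in recent pandas releases (2.1+), so prefer
# s.iloc[3] in new code.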
s[3]

'History'

In [22]:
# If an object is passed, it will query as if you wanted to use the label based loc attribute.
s['Molly']

'English'

In [23]:
# If the index is a list of integers, Pandas can't determine automatically whether you're
# intending to query by index position or index label. So the safer option
# is to be more explicit and use the iloc or loc attributes directly.

class_code = {99: 'Physics',
              100: 'Chemistry',
              101: 'English',
              102: 'History'}
s = pd.Series(class_code)

In [24]:
# If we try to call s[0] we get a KeyError because there's no item in the Series with
# an index label of zero; instead we have to call iloc explicitly if we want the first item.

s[0]

KeyError: 0

In [25]:
# A common task is to want to consider all of the values inside of a series and do some sort of 
# operation. This could be trying to find a certain number, or summarizing data or transforming 
# the data in some way. For instance, we could create a Series of integers representing
# student grades, and just try and get an average grade

grades = pd.Series([90, 80, 70, 60])

# Walk the Series one element at a time, keeping a running sum in `total`.
total = 0
for grade in grades:
    total += grade

# Divide the accumulated sum by the number of grades to get the mean.
print(total / len(grades))

75.0


In [None]:
# This works, but it's slow. Modern computers can do many tasks simultaneously, especially, 
# but not only, tasks involving mathematics.

# Pandas and the underlying numpy libraries support a method of computation called vectorization. 
# Vectorization works with most of the functions in the numpy library, including the sum function.

In [26]:
# Using numpy

total = np.sum(grades)
print(total/len(grades))

75.0


In [29]:
# Creating a big series of random numbers. 
numbers = pd.Series(np.random.randint(0,1000,10000))
# Now let's look at the top five items in that series to make sure they actually seem random. We
# can do this with the head() function
numbers.head()

0    563
1    153
2    758
3    997
4    634
dtype: int64

In [30]:
# Here, we're actually going to use what's called a cell magic function. These start with two
# percentage signs and wrap the code in the current Jupyter cell. The function we're going to use
# is called timeit. This function will run our code a few times to determine, on average, how long
# it takes.

In [37]:
%%timeit -n 100
total = 0
for number in numbers:
    total+=number
    
total/len(numbers)

1.67 ms ± 673 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [33]:
# Now, trying with vectorization

In [36]:
%%timeit -n 100
total = np.sum(numbers)
total/len(numbers)

162 µs ± 31.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [38]:
# Until now, every example has used a series whose index values were unique.
# The index does not have to be unique, as the following cells show.

students_classes = pd.Series({'Alice': 'Physics',
                   'Jack': 'Chemistry',
                   'Molly': 'English',
                   'Sam': 'History'})
students_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [39]:
kelly_classes = pd.Series(['Philosophy', 'Arts', 'Math'], index=['Kelly', 'Kelly', 'Kelly'])
kelly_classes

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [40]:
# Series.append() was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat() is the supported way to combine Series. It returns a new Series
# containing the rows of both inputs in order and leaves the inputs untouched.
all_students_classes = pd.concat([students_classes, kelly_classes])
all_students_classes

Alice       Physics
Jack      Chemistry
Molly       English
Sam         History
Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object

In [41]:
# By printing the original series we can see that that series hasn't changed.
students_classes

Alice      Physics
Jack     Chemistry
Molly      English
Sam        History
dtype: object

In [42]:
# Finally, we see that when we query the appended series for Kelly, we don't get a single value, 
# but a series itself. 
all_students_classes.loc['Kelly']

Kelly    Philosophy
Kelly          Arts
Kelly          Math
dtype: object