In [2]:
!pip install numpy
!pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m23.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/usr/local/opt/python@3.10/bin/python3.10 -m pip install --upgrade pip[0m
Collecting pandas
  Downloading pandas-2.0.1-cp310-cp310-macosx_10_9_x86_64.whl (11.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11.8 MB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting tzdata>=2022.1
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m[31m7.7 MB/s[0m eta [36m0:00:01[0m
Installing collected packages: tzdata, pandas
Successfully installed pandas-2.0.1 tzdata-2023.3

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m

# Series in pandas
A set of examples that exhibit some of the core features of the [Series](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.Series.html) data type in the `pandas` module.

## import

In [3]:
import numpy as np
import pandas as pd

## Create a Series

In [4]:
# a basic Series built from a list; pandas supplies the integer index 0..3
x = pd.Series(data=[22, 44, 66, 88])
x

0    22
1    44
2    66
3    88
dtype: int64

In [5]:
# look up a single value by its index label; with the automatic integer
# index created above, label 2 is also the third position
x[2]

66

In [8]:
# the same values, this time labelled with custom string indices
y = pd.Series(data=[22, 44, 66, 88], index=list('abcd'))
y

a    22
b    44
c    66
d    88
dtype: int64

In [11]:
# look up a single value by its custom string label
y['c']

66

In [12]:
# build a Series from a dictionary: keys become labels, values become data
z = pd.Series(dict(a=22, b=44, c=66, d=88))
z

a    22
b    44
c    66
d    88
dtype: int64

In [13]:
# build a Series from a scalar: the value is repeated once per index label
a = pd.Series(5, index=list(range(6)))
a

0    5
1    5
2    5
3    5
4    5
5    5
dtype: int64

## Data types
Unlike `numpy` ndarrays, a single `pandas` Series can contain a variety of data types.

In [14]:
# a Series holding a string, an int, a bool, a float and a list
s = pd.Series([
    'hello',
    44,
    True,
    3.14,
    [1, 2, 3],
])
s

0        hello
1           44
2         True
3         3.14
4    [1, 2, 3]
dtype: object

Convert data types with `astype()`.

In [15]:
# convert every value to a string; note that `s` is rebound to the converted
# Series, so the cells below operate on strings
s = s.astype(str)
s

0        hello
1           44
2         True
3         3.14
4    [1, 2, 3]
dtype: object

## Lambda expressions

In [16]:
# call a function on every value; apply() returns the results as a new Series
s.apply(lambda value: 'hello ' + value)

0        hello hello
1           hello 44
2         hello True
3         hello 3.14
4    hello [1, 2, 3]
dtype: object

In [None]:
# s itself is unchanged: the result of apply() above was not assigned back
s

## Naming Series
Series can be given a custom name.

In [17]:
# attach a name to the Series at construction time
x = pd.Series(name='non-anonymous Series', data=[22, 44, 66, 88])
x

0    22
1    44
2    66
3    88
Name: non-anonymous Series, dtype: int64

In [None]:
# read back the name given to the Series when it was created
x.name

## Indexing
Accessing values from a `pandas` Series.

In [18]:
# example Series for the indexing demos, labelled with strings
x = pd.Series([22, 44, 66, 88], index=['foo', 'bar', 'baz', 'bum'])

In [19]:
# display the string-labelled Series used in the indexing examples
x

foo    22
bar    44
baz    66
bum    88
dtype: int64

In [20]:
# access a single value by its label using plain brackets
x['bar']

44

In [21]:
# access by position (even with a custom-labeled Series): use .iloc.
# Plain x[1] on a Series with non-integer labels relies on a positional
# fallback that is deprecated since pandas 2.1.
x.iloc[1]

44

In [22]:
# access by integer position with .iloc (0-based, independent of the labels)
x.iloc[1]

44

In [23]:
# access by label with .loc (always interprets the key as an index label)
x.loc['bar']

44

In [26]:
# accessing a subset by positions: use .iloc.
# Passing integer positions to plain brackets on a label-indexed Series
# relies on a positional fallback that is deprecated since pandas 2.1.
x.iloc[[0, 1, 2]]

foo    22
bar    44
baz    66
dtype: int64

In [27]:
# select several values at once by passing a list of integer positions to .iloc
x.iloc[ [0, 1, 2] ]

foo    22
bar    44
baz    66
dtype: int64

In [28]:
# select several values at once by passing a list of labels to .loc
x.loc[ ['foo', 'bar', 'baz'] ]

foo    22
bar    44
baz    66
dtype: int64

### `loc` versus `iloc`

In [30]:
# integer labels that deliberately do NOT match the positions,
# so that .loc (label) and .iloc (position) give different answers
x = pd.Series([22, 44, 66, 88], index=[1, 0, 3, 2])
x

1    22
0    44
3    66
2    88
dtype: int64

In [33]:
# .loc looks up by label: the value stored under the label 1
x.loc[1]

22

In [36]:
# .iloc looks up by position: the second value, whatever its label is
x.iloc[1]

44

## Slicing
Unlike `numpy` ndarrays, slicing a `pandas` Series will also slice the index.

In [None]:
# take everything from the third element onward of a default-indexed Series
x = pd.Series(data=[22, 44, 66, 88])
x[2:]

In [None]:
# the same slice written explicitly as a positional slice with .iloc
x.iloc[2 : ]

In [37]:
# slicing a Series with string labels: the labels are sliced with the values
y = pd.Series(data=[22, 44, 66, 88], index=list('abcd'))
y[2:]

c    66
d    88
dtype: int64

Slice syntax within the brackets, `[` and `]`, generally works the same way as regular Python list slices and `numpy` slices.

## Sorting

In [41]:
# a Series whose values are deliberately out of order
y = pd.Series(data=[44, 88, 22, 66], index=list('abcd'))
y

a    44
b    88
c    22
d    66
dtype: int64

In [42]:
# sorted by index label, in descending order (ascending=False);
# the result is returned and y itself is not reassigned here
y.sort_index(ascending=False)

d    66
c    22
b    88
a    44
dtype: int64

In [43]:
# sorted by value, smallest first; each value keeps its original label
y.sort_values(ascending=True)

c    22
a    44
d    66
b    88
dtype: int64

## Introspection
Accessing some metadata about a Series.

In [46]:
# dtype reports the type of the values stored in the Series
x = pd.Series(data=[22, 44, 66, 88])
x.dtype

dtype('int64')

In [47]:
# shape is a one-element tuple: a Series is one-dimensional, here of length 4
y = pd.Series(data=[22, 44, 66, 88], index=list('abcd'))
y.shape

(4,)

In [48]:
# name returns the label given to the Series at construction
x = pd.Series(name="non-anonymous Series", data=[22, 44, 66, 88])
x.name

'non-anonymous Series'

## Simple math operations

In [49]:
# adding a scalar is broadcast to every value of the Series
x = pd.Series(data=[22, 44, 66, 88])
y = 2 + x
y

0    24
1    46
2    68
3    90
dtype: int64

In [50]:
# subtracting a scalar is likewise applied to every value
x = pd.Series(data=[22, 44, 66, 88])
x - 2

0    20
1    42
2    64
3    86
dtype: int64

In [51]:
# divide all values in a Series by a scalar (the result is a float Series)
x = pd.Series(data=[22, 44, 66, 88])
x / 11

0    2.0
1    4.0
2    6.0
3    8.0
dtype: float64

... and so on. Comparison operators are applied element-wise in the same way, and return a Series of booleans.

In [52]:
# comparisons are applied element-wise and produce a boolean Series
x > 50

0    False
1    False
2     True
3     True
dtype: bool

In [53]:
# element-wise inequality: True wherever the value differs from 44
x != 44

0     True
1    False
2     True
3     True
dtype: bool

In [54]:
# adding a Series to itself doubles each value
x = pd.Series(data=[22, 44, 66, 88])
x + x

0     44
1     88
2    132
3    176
dtype: int64

In [55]:
# add two Series whose labels match but are ordered differently:
# values are paired up by label, not by position
x = pd.Series({'a': 22, 'b': 44, 'c': 66, 'd': 88})
y = pd.Series({'d': 1, 'c': 2, 'a': 3, 'b': 4})
x + y

a    25
b    48
c    68
d    89
dtype: int64

## Math operations and the alignment of labels
Unlike `numpy` ndarrays, operations on Series automatically align by labels.

In [None]:
# two Series sharing one set of labels, listed in opposite orders
a = pd.Series([22, 44, 66, 88], index=['foo', 'bar', 'baz', 'bum'])
b = pd.Series([1, 2, 3, 4], index=['bum', 'baz', 'bar', 'foo'])

In [None]:
# the addition pairs values that share the same label, regardless of the
# order in which the labels appear in each Series
a + b

Besides this difference, all the basic math operations (+, -, *, /) between two Series work the same way as in `numpy` ndarrays.

## Heads and tails
When dealing with large amounts of data, it's sometimes useful to see a sample of the data, without viewing the entire data set. The `head()`, `tail()`, and `sample()` methods can help with this.

In [57]:
# first, let's generate a large Series
# (numpy is already imported as np in the import cell at the top)

# seed the random generator so that the values shown in this section are
# the same on every Restart & Run All
rng = np.random.default_rng(42)

# make a really big Series from a random numpy ndarray of floats in [0, 1)
x = pd.Series(rng.random(5000))
x

0       0.922407
1       0.060105
2       0.505175
3       0.442808
4       0.069241
          ...   
4995    0.649207
4996    0.053018
4997    0.861794
4998    0.940657
4999    0.603541
Length: 5000, dtype: float64

In [58]:
# the default display of a large Series is truncated:
# only the first and last few rows are shown, plus the total length
x

0       0.922407
1       0.060105
2       0.505175
3       0.442808
4       0.069241
          ...   
4995    0.649207
4996    0.053018
4997    0.861794
4998    0.940657
4999    0.603541
Length: 5000, dtype: float64

In [59]:
# get the head... the first 5 values
x.head(5)

0    0.922407
1    0.060105
2    0.505175
3    0.442808
4    0.069241
dtype: float64

In [61]:
# get the tail... the last 2 values
x.tail(2)

4998    0.940657
4999    0.603541
dtype: float64

In [62]:
# get a sample of a few random values;
# random_state fixes the draw so the same rows are shown on every run
x.sample(5, random_state=42)

4428    0.383560
202     0.154986
2032    0.905526
3794    0.726723
3657    0.789633
dtype: float64

## Basic statistics
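Basic statistical functions, like `mean()`, `median()`, `min()`, `max()`, and `std()` work much like their `numpy` equivalents. One difference to be aware of: `Series.std()` computes the sample standard deviation by default (`ddof=1`), whereas `numpy.std()` defaults to the population standard deviation (`ddof=0`).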

In [63]:
# 50 evenly spaced values running from 1 to 100 (both ends included)
x = pd.Series(np.linspace(start=1, stop=100, num=50))
x.head()

0    1.000000
1    3.020408
2    5.040816
3    7.061224
4    9.081633
dtype: float64

In [None]:
# get an overview of the most common summary statistics in one call
x.describe()

In [64]:
# calculate the mean of all values in the Series
x.mean()

50.49999999999999

In [65]:
# calculate the mean value for only those values in the Series that are less than 5
# (x < 5 builds a boolean mask; indexing with it keeps the matching values)
x[ x < 5 ].mean()

2.010204081632653

The other statistics functions - `min()`, `max()`, `median()`, `std()`, `count()` - work similarly.

In [None]:
# display the Series the statistics above were computed on
x