In [1]:
import datetime

import numpy as np
import pandas as pd

## Series

Series: A fixed length one dimensional array with a labeled index. 
- The pandas array can hold data/values of different types.  

### Creating a pandas series

Series can be created from:
- an ndarray
- a dictionary
- a scalar value

#### Series from an ndarray

In [2]:
# Draw 10 samples from the standard normal distribution.
# No seed is set in the visible cells, so these values change on every fresh run.
random_data = np.random.randn(10)
random_data

array([-1.03354597, -0.39545713, -1.18984346,  1.02266335,  0.87370677,
        0.34736437, -0.43991983, -0.72961565, -0.71173673,  2.79089807])

> Numerical zero based indices are provided as index labels by default if not specified

In [3]:
pd.Series(random_data)

0   -1.033546
1   -0.395457
2   -1.189843
3    1.022663
4    0.873707
5    0.347364
6   -0.439920
7   -0.729616
8   -0.711737
9    2.790898
dtype: float64

In [4]:
ages = np.array([4,10,15,23,27,30])
pd.Series(ages)

0     4
1    10
2    15
3    23
4    27
5    30
dtype: int64

> The specified index becomes the labels for each data/value in the series(pandas array)

> Note: When creating a pandas series from an ndarray, the index, if specified, must be a list of labels with the same length as the ndarray.

In [5]:
y = pd.Series(random_data, index=[1,2,3,4,5,6,7,8,9,10])
y

1    -1.033546
2    -0.395457
3    -1.189843
4     1.022663
5     0.873707
6     0.347364
7    -0.439920
8    -0.729616
9    -0.711737
10    2.790898
dtype: float64

#### Series from a dictionary

> When a dictionary is used, the keys become the index labels and the values are the series values

In [6]:
scores = {'math':67, 'science':75, 'english':80, 'social studies':59} # dtype is int because the values are integers
pd.Series(scores)

math              67
science           75
english           80
social studies    59
dtype: int64

In [7]:
# If the value types are mixed, the series dtype defaults to object
unsorted = {'usa':'100', 'china':250, 'india':'3200', 'korea':np.nan, 'malawi':234.56}
pd.Series(unsorted)

usa          100
china        250
india       3200
korea        NaN
malawi    234.56
dtype: object

#### Series from a scalar 

In [8]:
pi = np.pi
pd.Series(pi)

0    3.141593
dtype: float64

In [9]:
pd.Series('person')

0    person
dtype: object

In [10]:
# A datetime object is also accepted as a scalar value
pd.Series(datetime.datetime.now())

0   2025-05-13 13:12:27.938889
dtype: datetime64[ns]

> In order to distinguish between series, each can be given its own name using the `name` parameter (or renamed later with the `rename` method).

In [11]:
z = np.random.randn(8)
pd.Series(z, name='floats')

0    1.161787
1   -0.627834
2   -1.920944
3    0.991237
4    0.417177
5    0.258782
6   -0.118874
7   -0.337516
Name: floats, dtype: float64

In [12]:
t = z[:3]
pd.Series(t, name='sample data')

0    1.161787
1   -0.627834
2   -1.920944
Name: sample data, dtype: float64

In [13]:
# rename returns a new Series named 'selected data'; b itself keeps the name 'sample data'
b = pd.Series(t, name='sample data')
b.rename('selected data')

0    1.161787
1   -0.627834
2   -1.920944
Name: selected data, dtype: float64

In [14]:
# b is a Series and t is the ndarray it was built from, so they are different objects
b is t

False

In [15]:
# but the data in them is the same: == compares element-wise and returns a boolean Series
b == t

0    True
1    True
2    True
Name: sample data, dtype: bool

In [16]:
# the comparison gives the same result with the operands swapped (ndarray == Series)
t == b

0    True
1    True
2    True
Name: sample data, dtype: bool

### Slicing

> Slicing a pandas series works in a similar way as slicing a list, string, etc with an exception that it also slices the index and includes it in the output.

In [17]:
# Same values, two labelings: letters 'a'-'j' for x, integers 1-10 for y
x = pd.Series(random_data, index=list('abcdefghij'))
y = pd.Series(random_data, index=list(range(1, 11)))

#### Accessing a subset of the series

In [18]:
x[:2]

a   -1.033546
b   -0.395457
dtype: float64

In [19]:
x[5:]

f    0.347364
g   -0.439920
h   -0.729616
i   -0.711737
j    2.790898
dtype: float64

In [20]:
y[2:5]

3   -1.189843
4    1.022663
5    0.873707
dtype: float64

#### Accessing individual values using index labels

In [21]:
x['c']

np.float64(-1.1898434587373645)

In [22]:
y[4]

np.float64(1.0226633457447198)

### Converting a series into an array

> A series can be converted to a pandas array with `pd.array` or to a NumPy ndarray with `to_numpy`

In [23]:
# Using the array function from pandas (returns a pandas extension array, not an ndarray)
pd.array(x)

<NumpyExtensionArray>
[ np.float64(-1.0335459699654528),  np.float64(-0.3954571347216033),
  np.float64(-1.1898434587373645),   np.float64(1.0226633457447198),
   np.float64(0.8737067652382086),  np.float64(0.34736436696416134),
 np.float64(-0.43991982594699086),  np.float64(-0.7296156457927127),
  np.float64(-0.7117367278044872),   np.float64(2.7908980680184894)]
Length: 10, dtype: float64

In [24]:
# Using the to_numpy method from pandas
# y is already a pandas series, so it can be converted to a numpy array directly
y.to_numpy()

array([-1.03354597, -0.39545713, -1.18984346,  1.02266335,  0.87370677,
        0.34736437, -0.43991983, -0.72961565, -0.71173673,  2.79089807])

> Note: If you would like to perform some calculations/any other numerical computations, converting a pandas series to a numpy array is recommended.

### Vectorization and data alignment

> Pandas series support element-wise operations just like numpy arrays, so there is no need to loop through the series to perform operations.  
- Unsupported operations between different values default to `NaN`
- Element-wise operations also align the data using the index label of each value.
  - if an index label is missing from one series, the result for that label is `NaN`
  - if the indices of the two series don't match, the result is indexed by the **union** of their labels, with `NaN` for every label that is not present in both

In [25]:
# Same x and y as in the Slicing section: identical values, letter labels vs. integer labels 1-10
x = pd.Series(random_data, index=['a','b','c','d','e','f','g','h','i','j'])
y = pd.Series(random_data, index=[1,2,3,4,5,6,7,8,9,10])

#### Numerical calculations

In [26]:
x, np.sin(x)

(a   -1.033546
 b   -0.395457
 c   -1.189843
 d    1.022663
 e    0.873707
 f    0.347364
 g   -0.439920
 h   -0.729616
 i   -0.711737
 j    2.790898
 dtype: float64,
 a   -0.859119
 b   -0.385230
 c   -0.928311
 d    0.853499
 e    0.766714
 f    0.340421
 g   -0.425867
 h   -0.666583
 i   -0.653150
 j    0.343550
 dtype: float64)

In [27]:
x, x + 5

(a   -1.033546
 b   -0.395457
 c   -1.189843
 d    1.022663
 e    0.873707
 f    0.347364
 g   -0.439920
 h   -0.729616
 i   -0.711737
 j    2.790898
 dtype: float64,
 a    3.966454
 b    4.604543
 c    3.810157
 d    6.022663
 e    5.873707
 f    5.347364
 g    4.560080
 h    4.270384
 i    4.288263
 j    7.790898
 dtype: float64)

In [28]:
x, np.exp(x)

(a   -1.033546
 b   -0.395457
 c   -1.189843
 d    1.022663
 e    0.873707
 f    0.347364
 g   -0.439920
 h   -0.729616
 i   -0.711737
 j    2.790898
 dtype: float64,
 a     0.355743
 b     0.673372
 c     0.304269
 d     2.780591
 e     2.395775
 f     1.415332
 g     0.644088
 h     0.482094
 i     0.490791
 j    16.295648
 dtype: float64)

In [29]:
x, x * 10

(a   -1.033546
 b   -0.395457
 c   -1.189843
 d    1.022663
 e    0.873707
 f    0.347364
 g   -0.439920
 h   -0.729616
 i   -0.711737
 j    2.790898
 dtype: float64,
 a   -10.335460
 b    -3.954571
 c   -11.898435
 d    10.226633
 e     8.737068
 f     3.473644
 g    -4.399198
 h    -7.296156
 i    -7.117367
 j    27.908981
 dtype: float64)

In [30]:
x, x > 0.5

(a   -1.033546
 b   -0.395457
 c   -1.189843
 d    1.022663
 e    0.873707
 f    0.347364
 g   -0.439920
 h   -0.729616
 i   -0.711737
 j    2.790898
 dtype: float64,
 a    False
 b    False
 c    False
 d     True
 e     True
 f    False
 g    False
 h    False
 i    False
 j     True
 dtype: bool)

#### Data Alignment

> If an index label is present in both series and the operation between the elements is supported, the computation is successful; otherwise the value is replaced with `NaN`

In [31]:
# x and x have the same indices and the values support the addition operation
x + x

a   -2.067092
b   -0.790914
c   -2.379687
d    2.045327
e    1.747414
f    0.694729
g   -0.879840
h   -1.459231
i   -1.423473
j    5.581796
dtype: float64

In [32]:
# Only the first four elements will be successful because their index labels match and the elements support addition
union = x + x[:4]
union

a   -2.067092
b   -0.790914
c   -2.379687
d    2.045327
e         NaN
f         NaN
g         NaN
h         NaN
i         NaN
j         NaN
dtype: float64

In [33]:
# Every label without a match in x[:4] is NaN; positions 5 to the end ('f' to 'j') are shown here
union[5:]

f   NaN
g   NaN
h   NaN
i   NaN
j   NaN
dtype: float64

In [34]:
# union is already a Series, so the NaN values can be dropped directly with dropna()
union.dropna()

a   -2.067092
b   -0.790914
c   -2.379687
d    2.045327
dtype: float64

In [35]:
# Each NaN value is a np.float64
union['j']

np.float64(nan)

In [36]:
# None of the index labels match, so the result is indexed by the union of both indexes and every value is NaN
c = x + y
c

a    NaN
b    NaN
c    NaN
d    NaN
e    NaN
f    NaN
g    NaN
h    NaN
i    NaN
j    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
10   NaN
dtype: float64

In [37]:
c['a'], c[4]

(np.float64(nan), np.float64(nan))

In [38]:
# Mixed index: string labels 'a'-'d' followed by integer labels 5-7
z = pd.Series(
    data=range(1, 8),
    index=['a', 'b', 'c', 'd', 5, 6, 7],
)

In [39]:
x + z

5         NaN
6         NaN
7         NaN
a   -0.033546
b    1.604543
c    1.810157
d    5.022663
e         NaN
f         NaN
g         NaN
h         NaN
i         NaN
j         NaN
dtype: float64

In [40]:
y + z

1          NaN
2          NaN
3          NaN
4          NaN
5     5.873707
6     6.347364
7     6.560080
8          NaN
9          NaN
10         NaN
a          NaN
b          NaN
c          NaN
d          NaN
dtype: float64

## DataFrame