# Pandas

----

## Data Structures

- Series
- DataFrame

----

In [1]:
import pandas as pd

import numpy as np

## A) Series

Series: A fixed length one dimensional array with a labeled index. 
- The pandas array can hold data/values of different types.  

### 1.0 Creating a pandas series

A Series can be created from:
- an ndarray
- a dictionary
- a scalar value

#### 1.1 Series from an ndarray

In [2]:
# Draw 10 samples with np.random.randn and show the raw ndarray and its type.
# NOTE: no seed is set, so these values differ on every fresh run.
random_data = np.random.randn(10)

print(random_data)

print(f'\n {type(random_data)}')

[ 0.28991615 -1.06557627  0.68563844  0.73337913  0.25309043 -1.10996651
  2.60563495  0.1854414  -0.12994872 -0.84124894]

 <class 'numpy.ndarray'>


> Numerical zero based indices are provided as index labels by default if not specified

In [3]:
# Wrap the ndarray in a Series; no index is passed, so default labels are used
s = pd.Series(random_data)
print(f'Random data: \n{s}\n\nType: {type(s)}')

Random data: 
0    0.289916
1   -1.065576
2    0.685638
3    0.733379
4    0.253090
5   -1.109967
6    2.605635
7    0.185441
8   -0.129949
9   -0.841249
dtype: float64

Type: <class 'pandas.core.series.Series'>


In [4]:
# Integer ndarray -> Series with the default 0-based index
ages = np.array([4, 10, 15, 23, 27, 30])
s = pd.Series(data=ages)
print(s)

0     4
1    10
2    15
3    23
4    27
5    30
dtype: int64


> The specified index provides the label for each value in the Series (the pandas array)

In [5]:
# Attach one letter label ('a'..'j') to each of the ten values
letter_labels = list('abcdefghij')
x = pd.Series(random_data, index=letter_labels)

print(x)

a    0.289916
b   -1.065576
c    0.685638
d    0.733379
e    0.253090
f   -1.109967
g    2.605635
h    0.185441
i   -0.129949
j   -0.841249
dtype: float64


> Note: When creating a pandas Series from an ndarray, the index, if specified, must be a list of labels with the same length as the ndarray.

In [6]:
# Label the ten values with the integers 1..10 instead of the default 0..9
integer_labels = list(range(1, 11))
y = pd.Series(random_data, index=integer_labels)

print(y)

1     0.289916
2    -1.065576
3     0.685638
4     0.733379
5     0.253090
6    -1.109967
7     2.605635
8     0.185441
9    -0.129949
10   -0.841249
dtype: float64


#### 1.2 Series from a dictionary

> When a dictionary is used, the keys become the index labels and the values are the series values

In [7]:
# Dictionary keys become the index labels; every value is an integer,
# so the resulting Series has an integer dtype
scores = {
    'math': 67,
    'science': 75,
    'english': 80,
    'social studies': 59,
}

print(pd.Series(scores))

math              67
science           75
english           80
social studies    59
dtype: int64


In [8]:
# Strings, an integer, a float and NaN in one dictionary: the values share
# no single numeric type, so the Series is stored with dtype object
unsorted = {
    'usa': '100',
    'china': 250,
    'india': '3200',
    'korea': np.nan,
    'malawi': 234.56,
}

print(pd.Series(unsorted))

usa          100
china        250
india       3200
korea        NaN
malawi    234.56
dtype: object


#### 1.3 Series from a scalar 

In [9]:
# A scalar produces a one-element Series with the default label 0
pi = np.pi
pi_series = pd.Series(pi)
print(pi_series)

0    3.141593
dtype: float64


In [10]:
# A string scalar also becomes a one-element Series
print(pd.Series('person'))

0    person
dtype: object


In [11]:
import datetime

# A single datetime scalar becomes a one-element Series with a datetime64 dtype
current_time = datetime.datetime.now()
s = pd.Series(current_time)
print(s)

0   2025-05-14 22:37:38.445861
dtype: datetime64[ns]


> In order to distinguish between Series, each one can be given its own name using the `name` attribute.

In [12]:
# Eight unseeded random values; `name` labels the Series in its printed footer
z = np.random.randn(8)

print(pd.Series(z, name='floats'))

0   -2.295335
1    2.052442
2    0.055288
3    1.301486
4    0.380851
5   -0.909100
6    1.193701
7    1.003027
Name: floats, dtype: float64


In [13]:
# Take the first three values of the ndarray `z` and build a Series with a different name
t = z[:3]

print(pd.Series(t, name='sample data'))

0   -2.295335
1    2.052442
2    0.055288
Name: sample data, dtype: float64


In [14]:
# `rename` returns a Series carrying the new name; the result is only printed
# here, so `b` is not reassigned
b = pd.Series(t, name='sample data')

print(b.rename('selected data'))

0   -2.295335
1    2.052442
2    0.055288
Name: selected data, dtype: float64


In [15]:
# `rename` returns a new Series object instead of modifying `b` in place,
# so the renamed Series and `b` are different objects.
# (`t` is the source ndarray, so comparing `b is t` would be False for an
# unrelated reason and would not demonstrate this.)
print(b.rename('selected data') is b)

False


In [16]:
# Element-wise comparison of the Series `b` with the ndarray `t` it was built from: all values are equal
print(b == t)

0    True
1    True
2    True
Name: sample data, dtype: bool


In [17]:
# The same comparison with the operands swapped gives the same element-wise result
print(t == b)

0    True
1    True
2    True
Name: sample data, dtype: bool


### 2.0 Slicing

> Slicing a pandas series works in a similar way as slicing a list, string, etc with an exception that it also slices the index and includes it in the output.

In [18]:
# Re-create the letter-labelled (x) and integer-labelled (y) Series used in the slicing examples
x = pd.Series(random_data, index=['a','b','c','d','e','f','g','h','i','j'])
y = pd.Series(random_data, index=[1,2,3,4,5,6,7,8,9,10])

#### 2.1 Accessing a subset of the series

In [19]:
# First two elements by position; their index labels come along in the result
x[:2]

a    0.289916
b   -1.065576
dtype: float64

In [20]:
# Everything from position 5 to the end (labels 'f'..'j')
x[5:]

f   -1.109967
g    2.605635
h    0.185441
i   -0.129949
j   -0.841249
dtype: float64

In [21]:
# Positional slice: takes the elements at positions 2, 3 and 4.
# `y` is labelled 1..10, so a bare `y[2:5]` is easy to misread as a label
# slice; `.iloc` makes the positional intent explicit.
y.iloc[2:5]

3    0.685638
4    0.733379
5    0.253090
dtype: float64

#### 2.2 Accessing individual values using index labels

In [22]:
# Look up a single value by its index label
x['c']

np.float64(0.6856384418500504)

In [23]:
# Label lookup: returns the value stored under index label 4 (the fourth
# element, since the labels start at 1). `.loc` states explicitly that 4 is a
# label and not a position.
y.loc[4]

np.float64(0.7333791329607311)

### 3.0 Converting a series into an array

> Converting a Series with `Series.to_numpy()` yields a NumPy ndarray, while `pd.array()` yields a pandas extension array

In [24]:
# Using the pandas `array` function
# Note: this produces a pandas extension array (NumpyExtensionArray in the output), not a NumPy ndarray
print(pd.array(x))

<NumpyExtensionArray>
[ np.float64(0.28991615262105336),  np.float64(-1.0655762679214613),
   np.float64(0.6856384418500504),   np.float64(0.7333791329607311),
  np.float64(0.25309042773370827),  np.float64(-1.1099665086233503),
   np.float64(2.6056349476300737),  np.float64(0.18544140024393005),
 np.float64(-0.12994872408967786),   np.float64(-0.841248935348726)]
Length: 10, dtype: float64


In [25]:
# Using the Series.to_numpy method
# `y` is already a Series, so it is converted directly; the result is a
# plain NumPy ndarray and the index labels are dropped
s = y.to_numpy()

print(f"{s} \n\n{type(s)}")

[ 0.28991615 -1.06557627  0.68563844  0.73337913  0.25309043 -1.10996651
  2.60563495  0.1854414  -0.12994872 -0.84124894] 

<class 'numpy.ndarray'>


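> Note: If you need a plain NumPy array for numerical computations (for example, to pass to a library that expects an ndarray), convert the pandas Series with `to_numpy()`. Simple element-wise calculations can also be performed directly on the Series, as shown in the next section.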

### 4.0 Vectorization and data alignment

> Pandas Series support element-wise operations just like NumPy arrays, so there is no need to loop through a Series to perform an operation.  
- Unsupported operations between different values default to `NaN`
- Element-wise operations also align the data using the index label of each value.
  - if an index label is missing from one operand, the resulting value is `NaN`
  - if the indices of the two Series don't match, the result is indexed by the **union** of both indices, with `NaN` for every label that is not present in both

In [26]:
# Re-create the letter-labelled (x) and integer-labelled (y) Series used in the examples of this section
x = pd.Series(random_data, index=['a','b','c','d','e','f','g','h','i','j'])
y = pd.Series(random_data, index=[1,2,3,4,5,6,7,8,9,10])

#### 4.1 Numerical calculations

In [27]:
# Show x next to sin(x): the NumPy function is applied element-wise and the index labels are kept
x, np.sin(x)

(a    0.289916
 b   -1.065576
 c    0.685638
 d    0.733379
 e    0.253090
 f   -1.109967
 g    2.605635
 h    0.185441
 i   -0.129949
 j   -0.841249
 dtype: float64,
 a    0.285872
 b   -0.875068
 c    0.633167
 d    0.669384
 e    0.250397
 f   -0.895684
 g    0.510665
 h    0.184380
 i   -0.129583
 j   -0.745476
 dtype: float64)

In [28]:
# Show x next to x + 5: the scalar is added to every element
x, x + 5

(a    0.289916
 b   -1.065576
 c    0.685638
 d    0.733379
 e    0.253090
 f   -1.109967
 g    2.605635
 h    0.185441
 i   -0.129949
 j   -0.841249
 dtype: float64,
 a    5.289916
 b    3.934424
 c    5.685638
 d    5.733379
 e    5.253090
 f    3.890033
 g    7.605635
 h    5.185441
 i    4.870051
 j    4.158751
 dtype: float64)

In [29]:
# Show x next to exp(x), computed element-wise
x, np.exp(x)

(a    0.289916
 b   -1.065576
 c    0.685638
 d    0.733379
 e    0.253090
 f   -1.109967
 g    2.605635
 h    0.185441
 i   -0.129949
 j   -0.841249
 dtype: float64,
 a     1.336315
 b     0.344529
 c     1.985039
 d     2.082104
 e     1.288000
 f     0.329570
 g    13.539820
 h     1.203750
 i     0.878140
 j     0.431172
 dtype: float64)

In [30]:
# Show x next to x * 10: every element is multiplied by the scalar
x, x * 10

(a    0.289916
 b   -1.065576
 c    0.685638
 d    0.733379
 e    0.253090
 f   -1.109967
 g    2.605635
 h    0.185441
 i   -0.129949
 j   -0.841249
 dtype: float64,
 a     2.899162
 b   -10.655763
 c     6.856384
 d     7.333791
 e     2.530904
 f   -11.099665
 g    26.056349
 h     1.854414
 i    -1.299487
 j    -8.412489
 dtype: float64)

In [31]:
# Show x next to the element-wise comparison x > 0.5, which gives a boolean Series
x, x > 0.5

(a    0.289916
 b   -1.065576
 c    0.685638
 d    0.733379
 e    0.253090
 f   -1.109967
 g    2.605635
 h    0.185441
 i   -0.129949
 j   -0.841249
 dtype: float64,
 a    False
 b    False
 c     True
 d     True
 e    False
 f    False
 g     True
 h    False
 i    False
 j    False
 dtype: bool)

#### 4.2 Data Alignment

> If the indices of both Series are the same and the operation between the elements is supported, the computation succeeds; otherwise the value is replaced with `NaN`

In [32]:
# Both operands are x, so every index label matches and each value is added to itself
x + x

a    0.579832
b   -2.131153
c    1.371277
d    1.466758
e    0.506181
f   -2.219933
g    5.211270
h    0.370883
i   -0.259897
j   -1.682498
dtype: float64

In [33]:
# x[:4] only holds labels 'a'..'d', so only those four additions succeed;
# labels that exist in just one operand come out as NaN
union = x + x[:4]
union

a    0.579832
b   -2.131153
c    1.371277
d    1.466758
e         NaN
f         NaN
g         NaN
h         NaN
i         NaN
j         NaN
dtype: float64

In [34]:
# Positions 5 onwards (labels 'f'..'j') of the result are all NaN.
# Label 'e' at position 4 is NaN as well, but it is not part of this slice.
union[5:]

f   NaN
g   NaN
h   NaN
i   NaN
j   NaN
dtype: float64

In [35]:
# NaN values can be dropped with Series.dropna(); `union` is already a
# Series, so the method is called on it directly
union.dropna()

a    0.579832
b   -2.131153
c    1.371277
d    1.466758
dtype: float64

In [36]:
# Each missing value is a NaN held as np.float64 (see the output)
union['j']

np.float64(nan)

In [37]:
# None of the labels of x ('a'..'j') and y (1..10) match, so the result holds
# the union of both sets of labels with every value NaN
c = x + y
print(c)

a    NaN
b    NaN
c    NaN
d    NaN
e    NaN
f    NaN
g    NaN
h    NaN
i    NaN
j    NaN
1    NaN
2    NaN
3    NaN
4    NaN
5    NaN
6    NaN
7    NaN
8    NaN
9    NaN
10   NaN
dtype: float64


In [38]:
# `c` carries both string and integer labels, so use `.loc` to make it
# explicit that 'a' and 4 are looked up as labels (not positions)
c.loc['a'], c.loc[4]

(np.float64(nan), np.float64(nan))

In [39]:
# Mixed labels: 'a'..'d' overlap with x, and 5..7 overlap with y
# NOTE: this rebinds `z`, which earlier held a NumPy array
z = pd.Series([1,2,3,4,5,6,7], index=['a','b','c','d',5,6,7])

In [40]:
# Only 'a'..'d' are present in both Series; every other label gives NaN
x + z

5         NaN
6         NaN
7         NaN
a    1.289916
b    0.934424
c    3.685638
d    4.733379
e         NaN
f         NaN
g         NaN
h         NaN
i         NaN
j         NaN
dtype: float64

In [41]:
# Only labels 5, 6 and 7 are present in both Series; every other label gives NaN
y + z

1          NaN
2          NaN
3          NaN
4          NaN
5     5.253090
6     4.890033
7     9.605635
8          NaN
9          NaN
10         NaN
a          NaN
b          NaN
c          NaN
d          NaN
dtype: float64

------

## B) DataFrame

DataFrame: A size-mutable 2D (two-dimensional) labeled data structure.
- Unlike Series, DataFrames have both index and column values with values within the column being the same or of varying types.

### 1.0 Creating a DataFrame

A DataFrame can be created from:
- A Series
- A dictionary
- A 2D numpy ndarray
- A Structured or record ndarray
- A DataFrame

#### 1.1 DataFrame from Series

In [42]:
# A Series built from a dictionary: the keys 'r', 'g', 'b' become the row labels
s1 = pd.Series({
    'r': 'RED',
    'g': 'GREEN',
    'b': 'BLUE',
})

# The unnamed Series becomes the single column, titled via `columns`
pd.DataFrame(data=s1, columns=['COLOR'])

Unnamed: 0,COLOR
r,RED
g,GREEN
b,BLUE


In [43]:
# A Series built from a list of values plus an explicit list of labels
operator_symbols = ['&&', '||', '!']
s2 = pd.Series(['AND', 'OR', 'NOT'], index=operator_symbols)

# The unnamed Series becomes the single column, titled via `columns`
pd.DataFrame(data=s2, columns=['Operation'])

Unnamed: 0,Operation
&&,AND
||,OR
!,NOT


#### 1.2 DataFrame from a dictionary

> To create multiple columns of data, pass a dictionary of several 1D arrays or Series; each one becomes a column

##### 1.21 Using a dictionary of Series

In [44]:
# Each dictionary key becomes a column name and each Series becomes a column
dict_series = {'First Series': s1, 'Second Series': s2}

pd.DataFrame(dict_series) # Values for indices that don't match are replaced with NaN

Unnamed: 0,First Series,Second Series
!,,NOT
&&,,AND
b,BLUE,
g,GREEN,
r,RED,
||,,OR


In [45]:
# Build the DataFrame again and inspect its row labels (index) and column labels
dict_series = {'First Series': s1, 'Second Series': s2}
df = pd.DataFrame(dict_series)

print(f'Index: {df.index}\nColumns: {df.columns}')

Index: Index(['!', '&&', 'b', 'g', 'r', '||'], dtype='object')
Columns: Index(['First Series', 'Second Series'], dtype='object')


> To override the order of columns or indices, specify them using the `columns` and `index` parameters respectively

In [46]:
# `index` and `columns` set the order of the rows and columns explicitly
pd.DataFrame(dict_series, index=['r', 'g', 'b', '&&', '||', '!'], columns=['Second Series', 'First Series'])

Unnamed: 0,Second Series,First Series
r,,RED
g,,GREEN
b,,BLUE
&&,AND,
||,OR,
!,NOT,


##### 1.22 Using a dictionary of dictionaries

In [47]:
# Outer keys name the columns; inner keys become the row labels.
# The two inner dictionaries share no keys, so each column has gaps (NaN).
first_mapping = {'one': 1, 'two': 2}
second_mapping = {'three': 3, 'four': 4}
dict_dict = {'Dict One': first_mapping, 'Dict Two': second_mapping}

pd.DataFrame(dict_dict)

Unnamed: 0,Dict One,Dict Two
one,1.0,
two,2.0,
three,,3.0
four,,4.0


##### 1.23 Using a dictionary of lists

In [48]:
# Each list becomes a column; `index` supplies one label per row
dict_list = {
    'list-one': list(range(1, 5)),
    'list-two': list(range(5, 9)),
    'list-three': list(range(9, 13)),
}
row_labels = [f'row-{number}' for number in range(1, 5)]

pd.DataFrame(dict_list, index=row_labels)

Unnamed: 0,list-one,list-two,list-three
row-1,1,5,9
row-2,2,6,10
row-3,3,7,11
row-4,4,8,12


##### 1.24 Using a dictionary of ndarrays

In [49]:
# Three equally long 1D arrays: fixed integers, a range, and unseeded random floats
array_1 = np.array([1, 2, 3, 4])
array_2 = np.arange(4)
array_3 = np.random.randn(4)

# Pair each column name with its array; every array becomes one column
column_names = ['Array 1', 'Array 2', 'Array 3']
dict_ndarrays = dict(zip(column_names, [array_1, array_2, array_3]))

pd.DataFrame(dict_ndarrays)

Unnamed: 0,Array 1,Array 2,Array 3
0,1,0,-0.544321
1,2,1,1.4866
2,3,2,-0.512407
3,4,3,-0.168033


#### 1.3 DataFrame from a 2D numpy ndarray

In [50]:
# NumPy stores a single dtype per array, so mixing integers and strings here
# converts every element to a string before the DataFrame is built
array_2D = np.array([
    [1, 2, 3, 4],
    ['a', 'b', 'c', 'd'],
])

pd.DataFrame(array_2D)

Unnamed: 0,0,1,2,3
0,1,2,3,4
1,a,b,c,d


In [51]:
# 2 x 4 integer array: one row per team, one column per labelled figure
table = np.array([[1, 2, 3, 9], [3, 0, 1, 3]])

pd.DataFrame(
    table,
    index=['TEAM 1', 'TEAM 2'],
    columns=['L', 'W', 'D', 'P'],
)

Unnamed: 0,L,W,D,P
TEAM 1,1,2,3,9
TEAM 2,3,0,1,3


#### 1.4 DataFrame from a Structured or record ndarray

In [52]:
# Two rows of data: a tuple of five subject names and the five integers from np.arange(75, 100, 5).
# NOTE(review): `record` is a plain tuple holding a tuple and an ndarray, not a
# NumPy structured/record array as the section title suggests.
record = ('MATH', 'PHYSICS', 'COMPUTER SCIENCE', 'ENGINEERING', 'LITERATURE'), np.arange(75, 100, 5)

pd.DataFrame(record, index=['DATA-1', 'DATA-2'])

Unnamed: 0,0,1,2,3,4
DATA-1,MATH,PHYSICS,COMPUTER SCIENCE,ENGINEERING,LITERATURE
DATA-2,75,80,85,90,95


In [53]:
from dataclasses import dataclass


@dataclass
class Student:
    """Simple record holding a student's name, age and year."""

    name: str
    age: int
    year: int


student = Student(name='Peter', age=27, year=4)

print(student, end='\n\n')

# One row per field, collected under a single 'Profile' column
profile_values = (student.name, student.age, student.year)
pd.DataFrame(profile_values, index=['Name', 'Age', 'Year'], columns=['Profile'])

Student(name='Peter', age=27, year=4)



Unnamed: 0,Profile
Name,Peter
Age,27
Year,4


#### 1.5 DataFrame from a DataFrame

In [54]:
# Passing an existing DataFrame (df1) to the constructor gives a DataFrame (df2)
# with the same labels and values
df1 = pd.DataFrame(dict_series, index=['r', 'g', 'b', '&&', '||', '!'], columns=['Second Series', 'First Series'])
df2 = pd.DataFrame(df1)

df2

Unnamed: 0,Second Series,First Series
r,,RED
g,,GREEN
b,,BLUE
&&,AND,
||,OR,
!,NOT,


> Note: Data alignment in a DataFrame works in a similar way as in Series. The values of columns are a union of the data provided and any missing values are replaced with `NaN`.

### 2.0 Operations on a DataFrame

In [55]:
# Three integer columns of four rows each; no index is given, so the rows are labelled 0..3
columns_data = {
    'list-one': [1, 2, 3, 4],
    'list-two': [5, 6, 7, 8],
    'list-three': [9, 10, 11, 12],
}
df = pd.DataFrame(columns_data)

df

Unnamed: 0,list-one,list-two,list-three
0,1,5,9
1,2,6,10
2,3,7,11
3,4,8,12


#### 2.1 Selecting a column

In [56]:
df['list-one'] # Returns a Series with the column label as its name.

0    1
1    2
2    3
3    4
Name: list-one, dtype: int64

#### 2.2 Selecting a row

##### 2.21 Single row

> Using loc and iloc:
> - The value passed to `loc` is the row `label`, whereas the value passed to `iloc` is the integer `position` of the row (starting at 0).
> - In both cases, a Series is returned.

In [57]:
df.loc[0] # Returns a Series containing the values of all columns in the row with label 0

list-one      1
list-two      5
list-three    9
Name: 0, dtype: int64

In [58]:
# Same row, this time selected by its integer position 0
df.iloc[0]

list-one      1
list-two      5
list-three    9
Name: 0, dtype: int64

In [59]:
# Same two-row data as in section 1.4, kept in `dframe` for the row-selection examples
record = ('MATH', 'PHYSICS', 'COMPUTER SCIENCE', 'ENGINEERING', 'LITERATURE'), np.arange(75, 100, 5)
dframe = pd.DataFrame(record, index=['DATA-1', 'DATA-2'])

dframe

Unnamed: 0,0,1,2,3,4
DATA-1,MATH,PHYSICS,COMPUTER SCIENCE,ENGINEERING,LITERATURE
DATA-2,75,80,85,90,95


In [60]:
# Select the row labelled 'DATA-2'
dframe.loc['DATA-2']

0    75
1    80
2    85
3    90
4    95
Name: DATA-2, dtype: object

In [61]:
# Select the same row by its integer position (1 = second row)
dframe.iloc[1]

0    75
1    80
2    85
3    90
4    95
Name: DATA-2, dtype: object

##### 2.22 Multiple rows

> Selecting multiple rows yields a DataFrame

In [62]:
# Re-create the three-column, four-row DataFrame for the multi-row examples
df = pd.DataFrame({
    'list-one': [1, 2, 3, 4],
    'list-two': [5, 6, 7, 8],
    'list-three': [9, 10, 11, 12]
})

df

Unnamed: 0,list-one,list-two,list-three
0,1,5,9
1,2,6,10
2,3,7,11
3,4,8,12


In [63]:
# Select all rows with a full slice
df[:]

Unnamed: 0,list-one,list-two,list-three
0,1,5,9
1,2,6,10
2,3,7,11
3,4,8,12


In [64]:
# Select the last row; the slice returns a one-row DataFrame
df[-1:]

Unnamed: 0,list-one,list-two,list-three
3,4,8,12


In [65]:
# Select the second and third rows (positions 1 and 2)
df[1:3] # The row at position 3 is not included

Unnamed: 0,list-one,list-two,list-three
1,2,6,10
2,3,7,11


In [66]:
# Transpose the DataFrame: the rows become columns and the columns become rows
df.transpose() # or df.T

Unnamed: 0,0,1,2,3
list-one,1,2,3,4
list-two,5,6,7,8
list-three,9,10,11,12


In [67]:
# Shorthand for df.transpose()
df.T

Unnamed: 0,0,1,2,3
list-one,1,2,3,4
list-two,5,6,7,8
list-three,9,10,11,12
