 # Pandas Intro

In [1]:
import pandas as pd
import numpy as np

 ### Built-in Data Structures
 A `Series` has `array` and `index` attributes

In [2]:
test_obj = pd.Series([1, 2, -1, -8], index=["a", "b", "c", "d"])
test_obj

a    1
b    2
c   -1
d   -8
dtype: int64

In [3]:
# `.array` returns a PandasArray: a thin wrapper around the underlying NumPy array
test_obj.array

<PandasArray>
[1, 2, -1, -8]
Length: 4, dtype: int64

In [4]:
test_obj.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [5]:
test_obj["c"]

-1

In [6]:
test_obj["a"] = 3

In [7]:
test_obj[["a", "d"]]

a    3
d   -8
dtype: int64

 **Note**: Using NumPy-like operations, such as filtering, scalar multiplication, or applying math operations, will preserve the index-value link:

In [8]:
test_obj[test_obj > 0]

a    3
b    2
dtype: int64

In [9]:
test_obj * 2

a     6
b     4
c    -2
d   -16
dtype: int64

In [10]:
np.exp(test_obj)

a    20.085537
b     7.389056
c     0.367879
d     0.000335
dtype: float64

In [11]:
s_data = {"Assam": 2000, "Delhi": 7000, "Bengal": 5000, "Kerla": 4000}
obj0 = pd.Series(s_data)
obj0

Assam     2000
Delhi     7000
Bengal    5000
Kerla     4000
dtype: int64

In [12]:
# Convert it back to dict:
obj0.to_dict()

{'Assam': 2000, 'Delhi': 7000, 'Bengal': 5000, 'Kerla': 4000}

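 **Note**: When a `Series` is built from a dict alone, the result follows the dict's key insertion order. We can override this by passing an `index=` param; index labels missing from the dict become `NaN`, and dict keys not listed in the index are dropped: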

In [13]:
states = ["Kerla", "Goa", "Delhi", "Assam"]
obj1 = pd.Series(s_data, index=states)
obj1

Kerla    4000.0
Goa         NaN
Delhi    7000.0
Assam    2000.0
dtype: float64

 We can see one `NaN` value in our Series. `NaN` (Not a Number) is the marker pandas uses for missing values; since `NaN` is a float, the dtype changed from `int64` to `float64`.

In [14]:
obj1.isna()
# OR
# obj1.notna() # to get the opposite

Kerla    False
Goa       True
Delhi    False
Assam    False
dtype: bool

 Series automatically aligns by index label for arithmetic operations:

In [15]:
# Labels are matched much like an outer JOIN in SQL:
# a label present in only one of the two Series yields NaN in the result
obj2 = obj0 + obj1

 Both Series object & its index have a name property:

In [16]:
obj2.name = "Covid Cases"
obj2.index.name = "State"

In [17]:
test_obj.index = ["Rick", "Bilbo", "Thorin", "Elrond"]

 ### DataFrame

In [18]:
data_set = {
  "state": [
    "Utah", "California", "Ohio", "California", "Utah", "Ohio"
  ], 
  "year": [2000, 2001, 2002, 2001, 2002, 2003], 
  "pop": [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]
}

# Columns param rearranges the cols
df = pd.DataFrame(
  data_set, 
  columns=("year", "state", "pop", "debt")
)

In [19]:
# First 5 records
df.head()

Unnamed: 0,year,state,pop,debt
0,2000,Utah,1.5,
1,2001,California,1.7,
2,2002,Ohio,3.6,
3,2001,California,2.4,
4,2002,Utah,2.9,


In [20]:
# Last 5 records
df.tail()

Unnamed: 0,year,state,pop,debt
1,2001,California,1.7,
2,2002,Ohio,3.6,
3,2001,California,2.4,
4,2002,Utah,2.9,
5,2003,Ohio,3.2,


In [21]:
df.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

 The column can be accessed in a dict-like notation:

In [22]:
df["pop"]

0    1.5
1    1.7
2    3.6
3    2.4
4    2.9
5    3.2
Name: pop, dtype: float64

In [23]:
df["debt"] = (np.abs(np.random.standard_normal(len(df))) * 500).round(0).astype(np.uint64)

In [24]:
df["is_eastern"] = df["state"] == "Ohio"
df

In [25]:
# Delete col:
del df["is_eastern"]
df.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

In [26]:
p_data = {"Utah": {2000: 2.5, 2001: 1.1, 2002: 3.2}, "California": {2001: 3.7, 2002: 4.2}}

frame = pd.DataFrame(p_data)

In [27]:
frame.T  # Transpose

Unnamed: 0,2000,2001,2002
Utah,2.5,1.1,3.2
California,,3.7,4.2


In [28]:
# A DataFrame has no `name` attribute of its own, but its index and columns do:
frame.index.name = "year"
frame.columns.name = "state"

 We can convert a DataFrame to ndarray:

In [29]:
frame.to_numpy()

array([[2.5, nan],
       [1.1, 3.7],
       [3.2, 4.2]])

 ### Index Object

In [30]:
index_obj = pd.DataFrame(np.arange(3), index=["a", "b", "c"], columns=["test"])

 Both the index and columns are instances of Index Object

In [31]:
index_obj.index

Index(['a', 'b', 'c'], dtype='object')

In [32]:
index_obj.columns

Index(['test'], dtype='object')

 Index objects are immutable and thus can’t be modified by the user:

In [33]:
# Item assignment on an Index raises TypeError; catch only that so any
# unrelated failure (e.g. a typo in the variable name) is not hidden.
try:
  index_obj.index[0] = "z"
except TypeError as err:
  print(err)

Index does not support mutable operations


 ### Reindexing

In [34]:
re_obj = pd.Series(np.random.standard_normal(3), index=("a", "b", "c"))

In [35]:
re_obj.reindex(["c", "a", "d"])

c    1.214998
a    0.260544
d         NaN
dtype: float64

In [36]:
colors = pd.Series(np.random.standard_normal(3), index=(0,2,3))

In [37]:
colors.reindex(np.arange(6), method="ffill")

0   -0.499660
1   -0.499660
2   -1.420593
3   -0.598103
4   -0.598103
5   -0.598103
dtype: float64

In [38]:
df_count = pd.DataFrame(np.random.standard_normal((3,3)), index=("a", "b", "c"), columns=("Goa", "Kerla", "Delhi"))
df_count = df_count * 50
cool_index = pd.Index(("b", "d", "a", "c"))

df_count.reindex(index=cool_index)
#OR
df_count.reindex(cool_index, axis="index")

Unnamed: 0,Goa,Kerla,Delhi
b,-115.668455,-3.459634,-91.496379
d,,,
a,7.933249,-38.680213,25.565475
c,120.29061,2.294133,18.028667


In [39]:
cool_states = pd.Index(["Goa", "Assam", "Kerla"])

df_count.reindex(columns=cool_states)
# OR
# { "index": 0, "columns": 1 }
df_count.reindex(cool_states, axis=1)

Unnamed: 0,Goa,Assam,Kerla
a,7.933249,,-38.680213
b,-115.668455,,-3.459634
c,120.29061,,2.294133


 **Note**: In the above examples, we used the `reindex` method to both select existing labels and insert new ones (filled with `NaN`) along either axis. If we only want to select existing labels from the `df`, we can use the `loc` operator.

In [40]:
# This makes slicing super easy!
df_count.loc[["a", "b"], ["Goa", "Kerla"]]

Unnamed: 0,Goa,Kerla
a,7.933249,-38.680213
b,-115.668455,-3.459634


In [41]:
df_count.drop("c")

Unnamed: 0,Goa,Kerla,Delhi
a,7.933249,-38.680213,25.565475
b,-115.668455,-3.459634,-91.496379


In [42]:
df_count.drop(["c", "a"])

Unnamed: 0,Goa,Kerla,Delhi
b,-115.668455,-3.459634,-91.496379


In [43]:
df_count.drop(["Goa", "Delhi"], axis=1)

Unnamed: 0,Kerla
a,-38.680213
b,-3.459634
c,2.294133


 ### Indexing, Selection, & Filtering

In [44]:
# Series indexing works similar to NumPy indexing:
ser1 = pd.Series(np.arange(4), index=["a", "b", "c", "d"])

In [45]:
# Label lookup vs. explicit positional lookup reach the same element.
# (`ser1[1]` on a string-labelled Series relies on a positional fallback
# that is deprecated since pandas 2.1, so `iloc` is used instead.)
ser1["b"] == ser1.iloc[1]

True

In [46]:
ser1[:-1]

a    0
b    1
c    2
dtype: int64

In [47]:
ser1[["a", "c"]]

a    0
c    2
dtype: int64

In [48]:
ser1[ser1 > 1]

c    2
d    3
dtype: int64

 **Note**: While we can select data in this manner, the `loc` operator is preferred.

In [49]:
ser1.loc[["a", "d"]]

a    0
d    3
dtype: int64

In [50]:
ser1.loc[ser1 > 2]

d    3
dtype: int64

In [51]:
test_ser1 = pd.Series(np.arange(3), index=[1, 2, 0])
test_ser2 = pd.Series(np.arange(3), index=["m", "a", "d"])

test_ser1[[0, 2]]

0    2
2    1
dtype: int64

In [52]:
test_ser2[[0, 2]]

m    0
d    2
dtype: int64

In [53]:
test_ser1.loc[[0, 1]]

0    2
1    0
dtype: int64

In [54]:
# The loc operator works exclusively with labels, as opposed to the integers:
try:
  test_ser2.loc[[0, 1]]
except KeyError:
  # 0 and 1 are not labels of test_ser2, so loc fails;
  # positional selection has to go through iloc instead.
  print(test_ser2.iloc[[0, 1]])

m    0
a    1
dtype: int64


 When slicing with `loc`, the end label is inclusive (unlike positional slicing, which excludes the end).

In [55]:
ser1.loc["a": "c"]

a    0
b    1
c    2
dtype: int64

In [56]:
ser1.loc["a": "b"] = 5

In [57]:
dt = pd.DataFrame(
  np.arange(16).reshape((4,4)), 
  index=["one", "two", "three", "four"], 
  columns=[
    "New York", "San Francisco", "Fargo", "Los Angeles"
  ]
).T

In [58]:
dt["two"]

New York         4
San Francisco    5
Fargo            6
Los Angeles      7
Name: two, dtype: int64

In [59]:
dt.loc["Fargo"]

one       2
two       6
three    10
four     14
Name: Fargo, dtype: int64

In [60]:
dt.sum(axis=1)

New York         24
San Francisco    28
Fargo            32
Los Angeles      36
dtype: int64

In [61]:
dt[["three", "one"]].sum(axis=1)

New York          8
San Francisco    10
Fargo            12
Los Angeles      14
dtype: int64

In [62]:
dt.iloc[2:]
# OR
dt["Fargo":]

Unnamed: 0,one,two,three,four
Fargo,2,6,10,14
Los Angeles,3,7,11,15


 **Note**: DataFrame's default `[]` operator:
 * With a single label or a list of labels, it selects columns
 * With a slice (index labels or ints) or a boolean mask, it selects rows

In [63]:
dt[dt["two"] > 5]

Unnamed: 0,one,two,three,four
Fargo,2,6,10,14
Los Angeles,3,7,11,15


In [64]:
dt[dt < 3] = np.nan

In [65]:
dt[dt.isna()] = dt.mean(axis=1).mean()

In [66]:
dt = dt.astype(np.int64)

 Using `loc`:

In [67]:
# Returns a Series when only one index:
dt.loc["New York", ["one", "four"]]

one      9
four    12
Name: New York, dtype: int64

In [68]:
dt.loc[
  ["Fargo", "Los Angeles"], 
  ["one", "three"]
]

Unnamed: 0,one,three
Fargo,9,10
Los Angeles,3,11


 Using `iloc`:

In [69]:
# Returns a Series when only one index:
dt.iloc[2, [0, 1]]

one    9
two    6
Name: Fargo, dtype: int64

In [70]:
dt.iloc[[1, 3], [2, 3, 0]]

Unnamed: 0,three,four,one
San Francisco,9,13,9
Los Angeles,11,15,3


 A boolean Series can be used as a mask in `loc`, but not in `iloc` (`iloc` only accepts a plain boolean array, e.g. `mask.to_numpy()`):

In [71]:
dt.loc[dt["two"] > 5]

Unnamed: 0,one,two,three,four
Fargo,9,6,10,14
Los Angeles,3,7,11,15


In [72]:
# iloc rejects a boolean Series (an "indexable") used as a mask with a
# ValueError; catch only that so unrelated errors still surface.
try:
  dt.iloc[dt["one"] > 5]
except ValueError as err:
  print(err)

iLocation based boolean indexing cannot use an indexable as a mask


In [73]:
dt.iloc[:, :2][dt["one"] < 5]

Unnamed: 0,one,two
Los Angeles,3,7


In [74]:
dt.at["Fargo", "three"]

10

In [75]:
dt.iat[2, 2]

10

 **Note**: To avoid ambiguity in code, always use the `loc` & `iloc` operators.

 ### Pitfalls with chained indexing

In [76]:
dt.loc[:, "one"] = 1

In [77]:
dt.iloc[2] = 3

In [78]:
# Bad practice (chained indexing): the first indexing step may return a copy,
# so the assignment can be lost and pandas emits a SettingWithCopyWarning:
# dt.loc[dt["three"] < 5]["three"] = 5
# Good practice: select the rows and the column in a single loc call.
dt.loc[dt["three"] < 5, "three"] = 5

 ### Arithmetic & Data Alignment

In [79]:
s1 = pd.Series(
  np.random.standard_normal(3).round(1)*8,
  index=list("acd")
)
s2 = pd.Series(
  np.random.standard_normal(4).round(1)*8,
  index=list("abcd")
)

In [80]:
s1+s2

a   -2.4
b    NaN
c    6.4
d   -4.0
dtype: float64

 The internal data alignment introduces missing values in the label locations that don’t overlap.

In [81]:
df1 = pd.DataFrame(
  (np.random.random((4,3)) * 10).round(1),
  columns=list("abc"),
  index=["Joe", "Mary", "Carl", "Vik"]
)
df2 = pd.DataFrame(
  (np.random.random((3,3)) * 10).round(1),
  columns=list("acd"),
  index=["Joe", "Noah", "Mary"]
)

In [82]:
df1+df2

Unnamed: 0,a,b,c,d
Carl,,,,
Joe,9.7,,16.2,
Mary,11.7,,7.1,
Noah,,,,
Vik,,,,


 Since column `"b"` and `"d"` don't appear in both DataFrame objects, all their results become `nan`. The same holds true for `Carl`, `Noah`, & `Vik`.

In [83]:
def rand_df(shape, columns, round=1, rng=None):
  """
  Build a DataFrame of uniform random values scaled to the range 0-10

  Args:
    shape: (rows, cols) tuple describing the generated values
    columns: column labels, one per generated column
    round: number of decimals the values are rounded to
    rng: optional `np.random.Generator` for reproducible output;
      when None, NumPy's global random state is used (previous behaviour)

  Returns: DataFrame with a default integer index
  """
  # Note: the `round` parameter shadows the built-in inside this function,
  # the name is kept so existing keyword callers keep working.
  sample = np.random.random if rng is None else rng.random
  return pd.DataFrame(
    (sample(shape) * 10).round(round),
    columns=columns,
  )

In [84]:
df_1 = rand_df((3,4), list("abcd"))
df_1

Unnamed: 0,a,b,c,d
0,1.6,7.5,5.7,8.7
1,0.3,7.9,4.3,4.4
2,9.5,4.0,7.2,8.4


In [85]:
df_2 = rand_df((4,5), list("abcde"))
df_2

Unnamed: 0,a,b,c,d,e
0,5.1,3.9,3.1,6.5,1.5
1,7.6,2.5,8.1,0.2,2.6
2,8.7,1.3,1.5,2.5,8.8
3,5.8,3.6,4.8,3.4,6.4


In [86]:
df_1+df_2

Unnamed: 0,a,b,c,d,e
0,6.7,11.4,8.8,15.2,
1,7.9,10.4,12.4,4.6,
2,18.2,5.3,8.7,10.9,
3,,,,,


In [87]:
df_1.add(df_2, fill_value=0)

Unnamed: 0,a,b,c,d,e
0,6.7,11.4,8.8,15.2,1.5
1,7.9,10.4,12.4,4.6,2.6
2,18.2,5.3,8.7,10.9,8.8
3,5.8,3.6,4.8,3.4,6.4


 The `add` method is one of several arithmetic methods available in the pandas library.

 In simple words, it adds two `DataFrame` objects together, similar to the simple `+` operator.

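 The `fill_value` parameter is important: it defaults to `None` and optionally takes a float. Before the operation, it replaces existing missing (`NaN`) values and any new element needed for successful DataFrame alignment; if a value is missing in both frames at the same location, the result stays `NaN`.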

In [88]:
# fill_value=1, 1 * num == num
df_1.mul(df_2, fill_value=1)

Unnamed: 0,a,b,c,d,e
0,8.16,29.25,17.67,56.55,1.5
1,2.28,19.75,34.83,0.88,2.6
2,82.65,5.2,10.8,21.0,8.8
3,5.8,3.6,4.8,3.4,6.4


In [89]:
a = np.arange(12.0).reshape((3, 4))
a[0]

array([0., 1., 2., 3.])

In [90]:
a - a[0]
# The subtraction is performed on every row; this is called broadcasting.

array([[0., 0., 0., 0.],
       [4., 4., 4., 4.],
       [8., 8., 8., 8.]])

In [91]:
dt_frame = pd.DataFrame(
  np.arange(12.).reshape((3,4)), columns=list("abcd"),index=cool_states
)
dt_frame

Unnamed: 0,a,b,c,d
Goa,0.0,1.0,2.0,3.0
Assam,4.0,5.0,6.0,7.0
Kerla,8.0,9.0,10.0,11.0


In [92]:
ser1 = dt_frame.iloc[0]
ser1

a    0.0
b    1.0
c    2.0
d    3.0
Name: Goa, dtype: float64

In [93]:
dt_frame - ser1

Unnamed: 0,a,b,c,d
Goa,0.0,0.0,0.0,0.0
Assam,4.0,4.0,4.0,4.0
Kerla,8.0,8.0,8.0,8.0


In [94]:
dt_frame

Unnamed: 0,a,b,c,d
Goa,0.0,1.0,2.0,3.0
Assam,4.0,5.0,6.0,7.0
Kerla,8.0,9.0,10.0,11.0


In [95]:
ser2 = dt_frame["a"]
dt_frame.sub(ser2, axis=0)

Unnamed: 0,a,b,c,d
Goa,0.0,1.0,2.0,3.0
Assam,0.0,1.0,2.0,3.0
Kerla,0.0,1.0,2.0,3.0


In [96]:
ser3 = pd.Series(np.array([4, 2, 3, 5]), index=list("abcd"))
dt_frame.mul(ser3, axis=1)

Unnamed: 0,a,b,c,d
Goa,0.0,2.0,6.0,15.0
Assam,16.0,10.0,18.0,35.0
Kerla,32.0,18.0,30.0,55.0


In [97]:
df_3 = pd.DataFrame((np.random.standard_normal((4, 3)) * 5).round(2),columns=list("bde"),index=["Utah", "Ohio", "Texas", "Oregon"])
df_3

Unnamed: 0,b,d,e
Utah,7.57,6.68,-2.92
Ohio,7.67,-6.67,-0.37
Texas,-7.57,8.53,-3.12
Oregon,2.97,7.47,7.5


In [98]:
df_3.apply(np.std, axis=1)

Utah      4.749178
Ohio      5.868628
Texas     6.788348
Oregon    2.128427
dtype: float64

In [99]:
df_3.std(axis=1)

Utah      5.816531
Ohio      7.187573
Texas     8.313994
Oregon    2.606780
dtype: float64

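 The two results differ because NumPy's `std` uses `ddof=0` (population standard deviation), whereas pandas' `std` defaults to `ddof=1` (sample standard deviation).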

 ### Function Application & Mapping

 We can use the `apply` method to apply a custom or pre-built function along an axis of the DataFrame. Most common mathematical functions are built in, so the use of this is rare.

In [100]:
def minmax_diff(x):
  """
  Compute the spread (range) of a collection of values

  Args:
    x: Series (anything exposing `max()` and `min()` works)

  Returns: largest element minus smallest element
  """
  largest = x.max()
  smallest = x.min()
  return largest - smallest

In [101]:
df_3.apply(minmax_diff)

b    15.24
d    15.20
e    10.62
dtype: float64

In [102]:
df_3.apply(minmax_diff, axis=1)

Utah      10.49
Ohio      14.34
Texas     16.10
Oregon     4.53
dtype: float64

In [103]:
# We can even pass the function as a lambda fn:
df_3.apply(lambda x: x.max() - x.min())

b    15.24
d    15.20
e    10.62
dtype: float64

In [104]:
def both_minmax(x):
  """
  Collect the extremes of a collection of values

  Args:
    x: Series (anything exposing `min()` and `max()` works)

  Returns: Series holding the smallest value under "min" and the largest under "max"
  """
  lowest, highest = x.min(), x.max()
  labels = ("min", "max")
  return pd.Series([lowest, highest], index=labels)

In [105]:
df_3.apply(both_minmax)

Unnamed: 0,b,d,e
min,-7.57,-6.67,-3.12
max,7.67,8.53,7.5


In [106]:
df_3.apply(both_minmax, axis=1)

Unnamed: 0,min,max
Utah,-2.92,7.57
Ohio,-6.67,7.67
Texas,-7.57,8.53
Oregon,2.97,7.5


 For an element-wise function, use the `applymap` method (renamed to `DataFrame.map` in pandas 2.1, where `applymap` is deprecated).

In [107]:
df_3.applymap(lambda x: f"{x:.1f}")

Unnamed: 0,b,d,e
Utah,7.6,6.7,-2.9
Ohio,7.7,-6.7,-0.4
Texas,-7.6,8.5,-3.1
Oregon,3.0,7.5,7.5


 `map` is an array/Series method which iterates over the elements & applies the function to each one:

In [108]:
df_3["e"].map(lambda x: int(x))

Utah     -2
Ohio      0
Texas    -3
Oregon    7
Name: e, dtype: int64

 ### Sorting & Ranking

In [109]:
ser_s = pd.Series(np.random.standard_normal(4)*8, index=("d", "a", "c", "b"))
ser_s

d   -7.534639
a   -8.273436
c    9.044045
b   -3.125442
dtype: float64

In [110]:
ser_s.sort_index()

a   -8.273436
b   -3.125442
c    9.044045
d   -7.534639
dtype: float64

In [111]:
# Descending order:
ser_s.sort_index(ascending=False)

d   -7.534639
c    9.044045
b   -3.125442
a   -8.273436
dtype: float64

In [112]:
ser_s.sort_values()

a   -8.273436
d   -7.534639
b   -3.125442
c    9.044045
dtype: float64

In [113]:
df_s = pd.DataFrame(np.random.standard_normal((2,3)) * 8, index=["two", "one"], columns=list("cab"))
df_s

Unnamed: 0,c,a,b
two,-0.190806,14.863309,0.808571
one,-0.800211,-13.055976,6.132208


In [114]:
df_s.sort_index(axis=1, ascending=False)

Unnamed: 0,c,b,a
two,-0.190806,0.808571,14.863309
one,-0.800211,6.132208,-13.055976


In [115]:
df_s.sort_index()

Unnamed: 0,c,a,b
one,-0.800211,-13.055976,6.132208
two,-0.190806,14.863309,0.808571


In [116]:
ser_s["c"] = np.nan
ser_s

d   -7.534639
a   -8.273436
c         NaN
b   -3.125442
dtype: float64

In [117]:
# na_position: {‘first’, ‘last’}, default ‘last’
ser_s.sort_values(na_position="first")

c         NaN
a   -8.273436
d   -7.534639
b   -3.125442
dtype: float64

In [118]:
df_s1 = pd.DataFrame({
  "a": [0,1,5,0],
  "b": [6,-3,0,4]
})
df_s1

Unnamed: 0,a,b
0,0,6
1,1,-3
2,5,0
3,0,4


In [119]:
df_s1.sort_values("a")

Unnamed: 0,a,b
0,0,6
3,0,4
1,1,-3
2,5,0


In [120]:
df_s1.sort_values(["a","b"])

Unnamed: 0,a,b
3,0,4
0,0,6
1,1,-3
2,5,0


In [121]:
ser_r = pd.Series([2, 4, -3, 4, 5, 2])

In [122]:
ser_r.rank()

0    2.5
1    4.5
2    1.0
3    4.5
4    6.0
5    2.5
dtype: float64

In [123]:
# By default (`ser_r.rank()`), equal values share the average of their ranks.
# Other tie-breaking methods exist; "first" ranks ties in their order of appearance:
ser_r.rank(method="first")

0    2.0
1    4.0
2    1.0
3    5.0
4    6.0
5    3.0
dtype: float64

In [124]:
ser_r.rank(method="dense")

0    2.0
1    3.0
2    1.0
3    3.0
4    4.0
5    2.0
dtype: float64

In [125]:
df_r = pd.DataFrame({
  "a": (np.random.standard_normal(4)*8).round(0),
  "b": (np.random.standard_normal(4)*8).round(0),
  "c": (np.random.standard_normal(4)*8).round(0),
})
df_r

Unnamed: 0,a,b,c
0,-5.0,16.0,-2.0
1,4.0,-20.0,12.0
2,-13.0,-0.0,-9.0
3,-9.0,15.0,6.0


In [126]:
df_r.rank(axis=1, method="min")

Unnamed: 0,a,b,c
0,1.0,3.0,2.0
1,2.0,1.0,3.0
2,1.0,3.0,2.0
3,1.0,3.0,2.0


 ### Duplicate Labels

In [127]:
dup = pd.Series(np.arange(5), index=["a", "a", "b", "b", "c"])

In [128]:
dup.index.is_unique

False

In [129]:
dup["b"]

b    2
b    3
dtype: int64

In [130]:
dup_df = pd.DataFrame(np.arange(15).reshape((5,3)), index=["a", "a", "b", "c", "b"])

In [131]:
dup_df = dup_df.sort_index()

In [132]:
dup_df.loc["a"]

Unnamed: 0,0,1,2
a,0,1,2
a,3,4,5


In [133]:
dup_df.loc["c"]

0     9
1    10
2    11
Name: c, dtype: int64