In [2]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Panda Series

## Constructor

In [10]:
data_list = ["East", "West", "North", "south"]

data_dictionary = {"Grade-1": 25, "Grade-2": 40, "Grade-3": 30, "Grade-4": 28, "Grade-5": 40}

# Create empty Serie
s_empty = pd.Series()
print(f"Empty serie:\n{s_empty}\n")

# Create Serie from List
s_from_list = pd.Series(data=data_list)
print(f"Serie from List:\n{s_from_list}\n")

# Create Serie from Dict
s_from_dict = pd.Series(data=data_dictionary)
print(f"Serie from Dict:\n{s_from_dict}\n")

# Create Serie with custom index
s_custom_index = pd.Series(data=data_list, index=["E","W","S","N"])
print(f"Serie with custom index:\n{s_custom_index}\n")


Empty serie:
Series([], dtype: object)

Serie from List:
0     East
1     West
2    North
3    south
dtype: object

Serie from Dict:
Grade-1    25
Grade-2    40
Grade-3    30
Grade-4    28
Grade-5    40
dtype: int64

Serie with custom index:
E     East
W     West
S    North
N    south
dtype: object



## Attributes

index - serie index
values - numpy array (ndarray)
dtype - data type of element in the serie
shape - as tuple
size - number of elements in serie
nbytes - memory usage
name - serie label
empty - boolean
hasnans - if NaN (null) values are present

In [2]:
emp_name = ["Bob", "Mike", "Mary", "Anna", "Dave"]
emp_age = [25, 28, 34, 42, 56]

s = pd.Series(data=emp_age, index=emp_name, name="Employee_Information")

s

Bob     25
Mike    28
Mary    34
Anna    42
Dave    56
Name: Employee_Information, dtype: int64

In [4]:
print(f"Index: {s.index} type -> {type(s.index)}")
print(f"Values: {s.values} type -> {type(s.values)}")
print(f"Data type: {s.dtype} type -> {type(s.dtype)}")
print(f"Shape: {s.shape} type -> {type(s.shape)}")
print(f"Size: {s.size} type -> {type(s.size)}")
print(f"Bytes: {s.nbytes} type -> {type(s.nbytes)}")
print(f"Name: {s.name} type -> {type(s.name)}")
print(f"Is empty: {s.empty} type -> {type(s.empty)}")
print(f"Has NaN: {s.hasnans} type -> {type(s.hasnans)}")


Index: Index(['Bob', 'Mike', 'Mary', 'Anna', 'Dave'], dtype='object') type -> <class 'pandas.core.indexes.base.Index'>
Values: [25 28 34 42 56] type -> <class 'numpy.ndarray'>
Data type: int64 type -> <class 'numpy.dtypes.Int64DType'>
Shape: (5,) type -> <class 'tuple'>
Size: 5 type -> <class 'int'>
Bytes: 40 type -> <class 'int'>
Name: Employee_Information type -> <class 'str'>
Is empty: False type -> <class 'bool'>
Has NaN: False type -> <class 'bool'>


## Index

In [14]:
grades = ["Grade-1", "Grade-2", "Grade-3", "Grade-4", "Grade-5", "Grade-6"]
studenst = [1, 2, 4, 20, 15, 1]

s = pd.Series(data=studenst, index=grades, name="Student grades")

s

Grade-1     1
Grade-2     2
Grade-3     4
Grade-4    20
Grade-5    15
Grade-6     1
Name: Student grades, dtype: int64

In [17]:
# retrieving multiple elements based on labels (index)

lb_to_get = ["Grade-1", "Grade-5"]
lb_data = s.loc[lb_to_get]

print(f"Type: {type(lb_data)}")

lb_data

Type: <class 'pandas.core.series.Series'>


Grade-1     1
Grade-5    15
Name: Student grades, dtype: int64

In [19]:
# retrieving single element based on labels (index)

lb_data = s.at["Grade-6"]

print(f"Type: {type(lb_data)}")

lb_data

Type: <class 'numpy.int64'>


1

In [20]:
# retrieving multiple element based on position

pb_data = s.iloc[1:3]

print(f"Type: {type(pb_data)}")

pb_data

Type: <class 'pandas.core.series.Series'>


Grade-2    2
Grade-3    4
Name: Student grades, dtype: int64

In [21]:
# retrieving single element based on position

pb_data = s.iat[5]

print(f"Type: {type(pb_data)}")

pb_data

Type: <class 'numpy.int64'>


1

## MultiIndex

In [37]:
index = [
    ("CA", 2000), ("CA", 2010), ("CA", 2020),
    ("TX", 2000), ("TX", 2010), ("TX", 2020),
    ("FL", 2000), ("FL", 2010), ("FL", 2020),
]
# index = [
#     ("CA", 2000), ("TX", 2000), ("FL", 2000),
#     ("CA", 2010), ("TX", 2010), ("FL", 2010),
#     ("CA", 2020), ("TX", 2020), ("FL", 2020),
# ]

population = [25, 26, 27, 40, 42, 44, 10, 11, 12]

multindex = pd.MultiIndex.from_tuples(index)

s = pd.Series(data=population, index=multindex)

s

CA  2000    25
    2010    26
    2020    27
TX  2000    40
    2010    42
    2020    44
FL  2000    10
    2010    11
    2020    12
dtype: int64

In [38]:
# retrieving data by labels of first level of MultiIndex
print(type(s.loc["CA"]))

s.loc["CA"]

<class 'pandas.core.series.Series'>


2000    25
2010    26
2020    27
dtype: int64

In [39]:
# retrieving data by labels of second level of MultiIndex
print(type(s.loc[:, 2000]))

s.loc[:, 2000]

<class 'pandas.core.series.Series'>


CA    25
TX    40
FL    10
dtype: int64

In [40]:
# retrieving data by labels of second level of MultiIndex
print(type(s.loc["CA", 2000]))

s.loc[["CA"], 2000]

<class 'numpy.int64'>


CA  2000    25
dtype: int64

## Boolean slicing

In [47]:
data = {"Population": [8.5, 3.9, 2.3, 1.6, 2.7, 1.5, 1.4, 1.6]}
custom_index = ["New York", "Los Angeles", "Houston", "Phoenix", "Chicago", "San Antonio", "San Diego", "Philadelphia"]

s = pd.Series(data=data["Population"], index=custom_index)

s

New York        8.5
Los Angeles     3.9
Houston         2.3
Phoenix         1.6
Chicago         2.7
San Antonio     1.5
San Diego       1.4
Philadelphia    1.6
dtype: float64

In [50]:
# create new boolean Serie based on selection
sel = (s > 2) & (s < 5)

print(type(sel))

sel

<class 'pandas.core.series.Series'>


New York        False
Los Angeles      True
Houston          True
Phoenix         False
Chicago          True
San Antonio     False
San Diego       False
Philadelphia    False
dtype: bool

In [51]:
# slice using boolean Serie to select result Serie
result = s[sel]

result

Los Angeles    3.9
Houston        2.3
Chicago        2.7
dtype: float64

## Slicing

In [52]:
data = {"Population": [8.5, 3.9, 2.3, 1.6, 2.7, 1.5, 1.4, 1.6]}
custom_index = ["New York", "Los Angeles", "Houston", "Phoenix", "Chicago", "San Antonio", "San Diego", "Philadelphia"]

s = pd.Series(data=data["Population"], index=custom_index)

s

New York        8.5
Los Angeles     3.9
Houston         2.3
Phoenix         1.6
Chicago         2.7
San Antonio     1.5
San Diego       1.4
Philadelphia    1.6
dtype: float64

In [59]:
# Positional index slicing

sub = s[1:5]
print(sub)

# same as
sub1 = s.iloc[1:5]
print(sub1)


Los Angeles    3.9
Houston        2.3
Phoenix        1.6
Chicago        2.7
dtype: float64
Los Angeles    3.9
Houston        2.3
Phoenix        1.6
Chicago        2.7
dtype: float64


In [60]:
# Label index slicing

sub = s["Los Angeles":"Chicago"]
print(sub)

# same as
sub1 = s.loc["Los Angeles":"Chicago"]
print(sub1)

Los Angeles    3.9
Houston        2.3
Phoenix        1.6
Chicago        2.7
dtype: float64
Los Angeles    3.9
Houston        2.3
Phoenix        1.6
Chicago        2.7
dtype: float64


In [61]:
# Label index slicing using step

sub = s["New York":"Philadelphia":2]
print(sub)

# same as
sub1 = s.loc["New York":"Philadelphia":2]
print(sub1)

New York     8.5
Houston      2.3
Chicago      2.7
San Diego    1.4
dtype: float64
New York     8.5
Houston      2.3
Chicago      2.7
San Diego    1.4
dtype: float64


# Pandas DataFrame

## One level indexing

In [110]:
data = {
    "Customer": ["John", "Jane", "Bob", "Alice"],
    "Product": ["Laptop", "Keyboard", "Mouse", "Printer"],
    "Sales Amount": [1100, 100, 40, 700]
}

custom_index = [101, 102, 103, 104]

df = pd.DataFrame(data,index=custom_index)

df

Unnamed: 0,Customer,Product,Sales Amount
101,John,Laptop,1100
102,Jane,Keyboard,100
103,Bob,Mouse,40
104,Alice,Printer,700


In [111]:
# select 1 row
s = df.loc[101, :]

print(type(s))
print(r)

# same as
s = df.iloc[0, :]

print(type(s))
print(s)


<class 'pandas.core.series.Series'>
      Product  Sales Amount
102  Keyboard           100
103     Mouse            40
<class 'pandas.core.series.Series'>
Customer          John
Product         Laptop
Sales Amount      1100
Name: 101, dtype: object


In [112]:
# select multiple rows
s = df.loc[101:103, :]

print(type(s))
print(s)

# same as
s = df.iloc[0:3, :]

print(type(s))
print(s)


<class 'pandas.core.frame.DataFrame'>
    Customer   Product  Sales Amount
101     John    Laptop          1100
102     Jane  Keyboard           100
103      Bob     Mouse            40
<class 'pandas.core.frame.DataFrame'>
    Customer   Product  Sales Amount
101     John    Laptop          1100
102     Jane  Keyboard           100
103      Bob     Mouse            40


In [113]:
# select 1 column
s = df.loc[:, "Product"]

print(type(s))
print(s)

# same as
s = df.iloc[:, 1]

print(type(s))
print(s)

<class 'pandas.core.series.Series'>
101      Laptop
102    Keyboard
103       Mouse
104     Printer
Name: Product, dtype: object
<class 'pandas.core.series.Series'>
101      Laptop
102    Keyboard
103       Mouse
104     Printer
Name: Product, dtype: object


In [114]:
# select multiple columns
r = df.loc[:, ["Product", "Sales Amount"]]

print(type(s))
print(s)

# same as
s = df.iloc[:, 1:3]

print(type(s))
print(s)

<class 'pandas.core.series.Series'>
101      Laptop
102    Keyboard
103       Mouse
104     Printer
Name: Product, dtype: object
<class 'pandas.core.frame.DataFrame'>
      Product  Sales Amount
101    Laptop          1100
102  Keyboard           100
103     Mouse            40
104   Printer           700


In [115]:
# slice table by rows and columns
s = df.loc[[102, 103], ["Product", "Sales Amount"]]

print(type(s))
print(s)

# same as
s = df.iloc[1:3, 1:3]

print(type(s))
print(s)

<class 'pandas.core.frame.DataFrame'>
      Product  Sales Amount
102  Keyboard           100
103     Mouse            40
<class 'pandas.core.frame.DataFrame'>
      Product  Sales Amount
102  Keyboard           100
103     Mouse            40


## Multilevel indexing

In [116]:
data = {
    "Customer": ["John", "Jane", "Bob", "Alice", "Charlie", "Amy", "Carl", "Kate"],
    "Product": ["Laptop", "Keyboard", "Mouse", "Printer", "Monitor", "Printer", "Mouse", "Monitor"],
    "Sales Amount": [1100, 100, 40, 700, 400, 200, 20, 650]
}

regions = ["North", "South"]
sales_ids = [101, 102, 103, 104]

custom_index = pd.MultiIndex.from_product([regions, sales_ids], names = ["Region", "Sales ID"])

df = pd.DataFrame(data,index=custom_index)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,Customer,Product,Sales Amount
Region,Sales ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
North,101,John,Laptop,1100
North,102,Jane,Keyboard,100
North,103,Bob,Mouse,40
North,104,Alice,Printer,700
South,101,Charlie,Monitor,400
South,102,Amy,Printer,200
South,103,Carl,Mouse,20
South,104,Kate,Monitor,650


In [145]:
# using iloc

s = df.iloc[[0,1,2,3], :]
# s = df.iloc[range(4), :]
# s = df.iloc[slice(0,4), :]
print(type(s))
print(s)


<class 'pandas.core.frame.DataFrame'>
                Customer   Product  Sales Amount
Region Sales ID                                 
North  101          John    Laptop          1100
       102          Jane  Keyboard           100
       103           Bob     Mouse            40
       104         Alice   Printer           700


In [146]:
# select by first level in multiindex

s = df.loc["North", :]
print(type(s))
print(s)

<class 'pandas.core.frame.DataFrame'>
         Customer   Product  Sales Amount
Sales ID                                 
101          John    Laptop          1100
102          Jane  Keyboard           100
103           Bob     Mouse            40
104         Alice   Printer           700


In [147]:
# select by all levels in multiindex
# from - to like
s = df.loc[("North",102):("South",104), :]
print(type(s))
print(s)

# no result
s = df.loc[("South",102):("North",104), :]
print(type(s))
print(s)


<class 'pandas.core.frame.DataFrame'>
                Customer   Product  Sales Amount
Region Sales ID                                 
North  102          Jane  Keyboard           100
       103           Bob     Mouse            40
       104         Alice   Printer           700
South  101       Charlie   Monitor           400
       102           Amy   Printer           200
       103          Carl     Mouse            20
       104          Kate   Monitor           650
<class 'pandas.core.frame.DataFrame'>
Empty DataFrame
Columns: [Customer, Product, Sales Amount]
Index: []


In [148]:
# select by all levels in multiindex
# value list for filter (cartesian product)

s = df.loc[(["North", "South"], slice(102,104)), :]
print(type(s))
print(s)

# different order in the result
s = df.loc[(["South", "North"], slice(102,104)), :]
print(type(s))
print(s)


<class 'pandas.core.frame.DataFrame'>
                Customer   Product  Sales Amount
Region Sales ID                                 
North  102          Jane  Keyboard           100
       103           Bob     Mouse            40
       104         Alice   Printer           700
South  102           Amy   Printer           200
       103          Carl     Mouse            20
       104          Kate   Monitor           650
<class 'pandas.core.frame.DataFrame'>
                Customer   Product  Sales Amount
Region Sales ID                                 
South  102           Amy   Printer           200
       103          Carl     Mouse            20
       104          Kate   Monitor           650
North  102          Jane  Keyboard           100
       103           Bob     Mouse            40
       104         Alice   Printer           700


In [156]:
# select by last levels in multiindex
s = df.loc[(slice(None), 101), :]
print(type(s))
print(s)


<class 'pandas.core.frame.DataFrame'>
                Customer  Product  Sales Amount
Region Sales ID                                
North  101          John   Laptop          1100
South  101       Charlie  Monitor           400


## Setting and resetting index

In [157]:
data = {
    "ID": [101, 102, 103],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, 30, 35],
}

df = pd.DataFrame(data)

df

Unnamed: 0,ID,Name,Age
0,101,Alice,25
1,102,Bob,30
2,103,Charlie,35


In [158]:
df_indexed = df.set_index(keys="ID", drop=True, append=False, inplace=False, verify_integrity=False)

df_indexed

Unnamed: 0_level_0,Name,Age
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
101,Alice,25
102,Bob,30
103,Charlie,35


In [159]:
df_indexed = df.set_index(keys=["ID","Name"], drop=True, append=False, inplace=False, verify_integrity=False)

df_indexed

Unnamed: 0_level_0,Unnamed: 1_level_0,Age
ID,Name,Unnamed: 2_level_1
101,Alice,25
102,Bob,30
103,Charlie,35


In [160]:
df_reset = df_indexed.reset_index(level="Name")

df_reset

Unnamed: 0_level_0,Name,Age
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
101,Alice,25
102,Bob,30
103,Charlie,35


In [161]:
df_reset.reset_index(inplace=True)

df_reset

Unnamed: 0,ID,Name,Age
0,101,Alice,25
1,102,Bob,30
2,103,Charlie,35


## Reshaping DataFrame

In [209]:
# stacking
# from single level columns
# return Series

data = {
    "ID": [101, 102, 103],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, 30, 35],
}

df = pd.DataFrame(data)
print(type(df))

print(df)

df_stacked = df.stack()

print(type(df_stacked))

print(df_stacked)


<class 'pandas.core.frame.DataFrame'>
    ID     Name  Age
0  101    Alice   25
1  102      Bob   30
2  103  Charlie   35
<class 'pandas.core.series.Series'>
0  ID          101
   Name      Alice
   Age          25
1  ID          102
   Name        Bob
   Age          30
2  ID          103
   Name    Charlie
   Age          35
dtype: object


In [211]:
# unstacking
df_unstacked = df_stacked.unstack(future_stack=True)

df_unstacked

Unnamed: 0,ID,Name,Age
0,101,Alice,25
1,102,Bob,30
2,103,Charlie,35


In [214]:
# stacking
# from multi level columns
# reurn DataFrame

data = [
    [101, "Alice", 25],
    [102, "Bob", 30],
    [103, "Charlie", 35],
]

multicol = pd.MultiIndex.from_tuples([('Info', 'ID'),('Info', 'Name'),('Info', 'Age')])
print(multicol)

df = pd.DataFrame(data, columns=multicol)
print(type(df))

print(df)

df_stacked = df.stack(future_stack=True)

print(type(df_stacked))
print(df_stacked)


MultiIndex([('Info',   'ID'),
            ('Info', 'Name'),
            ('Info',  'Age')],
           )
<class 'pandas.core.frame.DataFrame'>
  Info             
    ID     Name Age
0  101    Alice  25
1  102      Bob  30
2  103  Charlie  35
<class 'pandas.core.frame.DataFrame'>
           Info
0 ID        101
  Name    Alice
  Age        25
1 ID        102
  Name      Bob
  Age        30
2 ID        103
  Name  Charlie
  Age        35


In [215]:
# unstacking
df_unstacked = df_stacked.unstack()

df_unstacked

Unnamed: 0_level_0,Info,Info,Info
Unnamed: 0_level_1,ID,Name,Age
0,101,Alice,25
1,102,Bob,30
2,103,Charlie,35


## Reindexing

In [217]:
data = {
    "ID": [101, 102, 103],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [25, 30, 35],
}

df = pd.DataFrame(data)

df.set_index("ID", inplace=True)

df


Unnamed: 0_level_0,Name,Age
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
101,Alice,25
102,Bob,30
103,Charlie,35


In [219]:
# reindex

new_index = [100, 101, 103, 104, 105]

df_reindex = df.reindex(new_index)

df_reindex

Unnamed: 0_level_0,Name,Age
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
100,,
101,Alice,25.0
103,Charlie,35.0
104,,
105,,


## Sorting DataFrame by values

In [15]:
data = {
    "Sales ID": [101, 102, 103, 104, 105, 106, 107, 108],
    "Customer": ["John", "Jane", "Bob", "Alice", "Charlie", "Amy", "Carl", "Kate"],
    "Sales Person": ["Emma", "Frank", "Emma", "Alice", "Frank", "Chris", "Kate", "Alice"],
    "Product": ["Laptop", "Keyboard", "Mouse", "Printer", "Monitor", "Printer", "Mouse", "Monitor"],
    "Sales Amount": [1100, 100, 40, 700, 400, 100, 20, 700]
}

df = pd.DataFrame(data)

df = df.set_index("Sales ID")

df

Unnamed: 0_level_0,Customer,Sales Person,Product,Sales Amount
Sales ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
101,John,Emma,Laptop,1100
102,Jane,Frank,Keyboard,100
103,Bob,Emma,Mouse,40
104,Alice,Alice,Printer,700
105,Charlie,Frank,Monitor,400
106,Amy,Chris,Printer,100
107,Carl,Kate,Mouse,20
108,Kate,Alice,Monitor,700


In [16]:
# sort by single column
df_sorted = df.sort_values(by="Sales Amount")

df_sorted

Unnamed: 0_level_0,Customer,Sales Person,Product,Sales Amount
Sales ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
107,Carl,Kate,Mouse,20
103,Bob,Emma,Mouse,40
102,Jane,Frank,Keyboard,100
106,Amy,Chris,Printer,100
105,Charlie,Frank,Monitor,400
104,Alice,Alice,Printer,700
108,Kate,Alice,Monitor,700
101,John,Emma,Laptop,1100


In [17]:
# sort by single column
df_sorted = df.sort_values(by=["Sales Amount", "Customer"])

df_sorted

Unnamed: 0_level_0,Customer,Sales Person,Product,Sales Amount
Sales ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
107,Carl,Kate,Mouse,20
103,Bob,Emma,Mouse,40
106,Amy,Chris,Printer,100
102,Jane,Frank,Keyboard,100
105,Charlie,Frank,Monitor,400
104,Alice,Alice,Printer,700
108,Kate,Alice,Monitor,700
101,John,Emma,Laptop,1100


## Sorting DataFrame by indexes

In [23]:
data = {
    "Sales ID": [105, 108, 104, 101, 103, 106, 107, 102],
    "Location": ["CA", "NY", "CA", "NJ", "NY", "PA", "NJ", "NY"],
    "Customer": ["John", "Jane", "Bob", "Alice", "Charlie", "Amy", "Carl", "Kate"],
    "Sales Person": ["Emma", "Frank", "Emma", "Alice", "Frank", "Chris", "Kate", "Alice"],
    "Product": ["Laptop", "Keyboard", "Mouse", "Printer", "Monitor", "Printer", "Mouse", "Monitor"],
    "Sales Amount": [1100, 100, 40, 700, 400, 100, 20, 700]
}

df = pd.DataFrame(data)

df = df.set_index(["Location", "Sales ID"])

df

Unnamed: 0_level_0,Unnamed: 1_level_0,Customer,Sales Person,Product,Sales Amount
Location,Sales ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CA,105,John,Emma,Laptop,1100
NY,108,Jane,Frank,Keyboard,100
CA,104,Bob,Emma,Mouse,40
NJ,101,Alice,Alice,Printer,700
NY,103,Charlie,Frank,Monitor,400
PA,106,Amy,Chris,Printer,100
NJ,107,Carl,Kate,Mouse,20
NY,102,Kate,Alice,Monitor,700


In [24]:
# sort by single index level
df_sorted = df.sort_index(level = "Location")

df_sorted

Unnamed: 0_level_0,Unnamed: 1_level_0,Customer,Sales Person,Product,Sales Amount
Location,Sales ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CA,104,Bob,Emma,Mouse,40
CA,105,John,Emma,Laptop,1100
NJ,101,Alice,Alice,Printer,700
NJ,107,Carl,Kate,Mouse,20
NY,102,Kate,Alice,Monitor,700
NY,103,Charlie,Frank,Monitor,400
NY,108,Jane,Frank,Keyboard,100
PA,106,Amy,Chris,Printer,100


In [26]:
# sort by all index level
df_sorted = df.sort_index(ascending=[True, False])

df_sorted

Unnamed: 0_level_0,Unnamed: 1_level_0,Customer,Sales Person,Product,Sales Amount
Location,Sales ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CA,105,John,Emma,Laptop,1100
CA,104,Bob,Emma,Mouse,40
NJ,107,Carl,Kate,Mouse,20
NJ,101,Alice,Alice,Printer,700
NY,108,Jane,Frank,Keyboard,100
NY,103,Charlie,Frank,Monitor,400
NY,102,Kate,Alice,Monitor,700
PA,106,Amy,Chris,Printer,100


## Combining DataFrames

In [52]:
emp_data = {
    "Emp ID": [1, 2, 3, 4, 5],
    "Name": ["John", "Amy", "Bob", "Charlie", "David"],
    "Department": ["HR", "IT", "Finance", "IT", "Marketing"],
    "Role": ["Manager", "Developer", "Analyst", "Developer", "Coordinator"],
}

emp_project = {
    "Emp ID": [1, 2, 3, 1, 4, 3],
    "Project ID": ["P101", "P101", "P101", "P102", "P102", "P102"],
    "Role": ["Project Manager", "Developer", "Analyst", "Project Manager", "Developer", "Analyst"]
}


In [53]:
# option 1
# df_emp = pd.DataFrame(data={k: v for k, v in emp_data.items() if k != "ID"} , index=emp_data["ID"])
# option 2
df_emp = pd.DataFrame(emp_data)
df_emp = df_emp.set_index("Emp ID")

df_emp

Unnamed: 0_level_0,Name,Department,Role
Emp ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,John,HR,Manager
2,Amy,IT,Developer
3,Bob,Finance,Analyst
4,Charlie,IT,Developer
5,David,Marketing,Coordinator


In [54]:
df_emp_project = pd.DataFrame(emp_project)

df_emp_project

Unnamed: 0,Emp ID,Project ID,Role
0,1,P101,Project Manager
1,2,P101,Developer
2,3,P101,Analyst
3,1,P102,Project Manager
4,4,P102,Developer
5,3,P102,Analyst


In [55]:
# combine using merge
merged_df = df_emp_project.merge(right=df_emp, on="Emp ID", how="left")

merged_df

Unnamed: 0,Emp ID,Project ID,Role_x,Name,Department,Role_y
0,1,P101,Project Manager,John,HR,Manager
1,2,P101,Developer,Amy,IT,Developer
2,3,P101,Analyst,Bob,Finance,Analyst
3,1,P102,Project Manager,John,HR,Manager
4,4,P102,Developer,Charlie,IT,Developer
5,3,P102,Analyst,Bob,Finance,Analyst


In [56]:
# combine using join
joined_df = df_emp_project.join(other=df_emp, on="Emp ID", how="left", lsuffix="_project", rsuffix="_emp")

joined_df

Unnamed: 0,Emp ID,Project ID,Role_project,Name,Department,Role_emp
0,1,P101,Project Manager,John,HR,Manager
1,2,P101,Developer,Amy,IT,Developer
2,3,P101,Analyst,Bob,Finance,Analyst
3,1,P102,Project Manager,John,HR,Manager
4,4,P102,Developer,Charlie,IT,Developer
5,3,P102,Analyst,Bob,Finance,Analyst


## Time Series

### Create timestamp

In [63]:
data = {
    "temperature": [23.3, 24.4, 22.1, 22, 31.3],
    "humidity": [43, 44, 50, 44, 46],
}

df = pd.DataFrame(data)

df


Unnamed: 0,temperature,humidity
0,23.3,43
1,24.4,44
2,22.1,50
3,22.0,44
4,31.3,46


In [66]:
# function to_datetime()

date_strings = ["2023-01-15 12:00:00", "2023-01-16 12:00:00", "2023-01-17 12:00:00", "2023-01-18 12:00:00", "2023-01-19 12:00:00"]

timestamps = pd.to_datetime(date_strings)

df_ts = df.copy()
df_ts["Timestamp"] = timestamps

df_ts.info()
df_ts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   temperature  5 non-null      float64       
 1   humidity     5 non-null      int64         
 2   Timestamp    5 non-null      datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 252.0 bytes


Unnamed: 0,temperature,humidity,Timestamp
0,23.3,43,2023-01-15 12:00:00
1,24.4,44,2023-01-16 12:00:00
2,22.1,50,2023-01-17 12:00:00
3,22.0,44,2023-01-18 12:00:00
4,31.3,46,2023-01-19 12:00:00


In [68]:
# class Timestamp

timestamps = []

for i in range(15, 20):
    timestamp = pd.Timestamp(year=2023, month=1, day=i, hour=12)
    timestamps.append(timestamp)

df_ts = df.copy()
df_ts["Timestamp"] = timestamps

df_ts.info()
df_ts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   temperature  5 non-null      float64       
 1   humidity     5 non-null      int64         
 2   Timestamp    5 non-null      datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 252.0 bytes


Unnamed: 0,temperature,humidity,Timestamp
0,23.3,43,2023-01-15 12:00:00
1,24.4,44,2023-01-16 12:00:00
2,22.1,50,2023-01-17 12:00:00
3,22.0,44,2023-01-18 12:00:00
4,31.3,46,2023-01-19 12:00:00


In [70]:
# function date_range()
timestamps = pd.date_range(start="2023-01-15 12:00:00", end="2023-01-19 12:00:00", freq="D")

df_ts = df.copy()
df_ts["Timestamp"] = timestamps

df_ts.info()
df_ts

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   temperature  5 non-null      float64       
 1   humidity     5 non-null      int64         
 2   Timestamp    5 non-null      datetime64[ns]
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 252.0 bytes


Unnamed: 0,temperature,humidity,Timestamp
0,23.3,43,2023-01-15 12:00:00
1,24.4,44,2023-01-16 12:00:00
2,22.1,50,2023-01-17 12:00:00
3,22.0,44,2023-01-18 12:00:00
4,31.3,46,2023-01-19 12:00:00


### Select by timestamp

In [79]:
data = {
    "temperature": [23.3, 24.4, 22.1, 22, 31.3],
    "humidity": [43, 44, 50, 44, 46],
    "timestamp" : [pd.Timestamp(year=2023, month=i, day=1, hour=12) for i in range(1,6)]
}

df = pd.DataFrame(data)
df.set_index("timestamp", inplace=True)

df

Unnamed: 0_level_0,temperature,humidity
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-01 12:00:00,23.3,43
2023-02-01 12:00:00,24.4,44
2023-03-01 12:00:00,22.1,50
2023-04-01 12:00:00,22.0,44
2023-05-01 12:00:00,31.3,46


In [80]:
# select by date

df_sel = df.loc["2023-02-01"]

df_sel

Unnamed: 0_level_0,temperature,humidity
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-02-01 12:00:00,24.4,44


In [81]:
# select by partial date
df_sel = df.loc["2023-03"]

df_sel

Unnamed: 0_level_0,temperature,humidity
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-03-01 12:00:00,22.1,50


In [83]:
# select by slicing
df_sel = df.loc["2023-01-02":"2023-04-12"]

df_sel

Unnamed: 0_level_0,temperature,humidity
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-02-01 12:00:00,24.4,44
2023-03-01 12:00:00,22.1,50
2023-04-01 12:00:00,22.0,44


In [89]:
# select by boolean array
print(type(df.index))
print(type(df.index > "2023-03-12"))
print(df.index > "2023-03-12")

df_sel = df[df.index > "2023-03-12"]

df_sel

<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
<class 'numpy.ndarray'>
[False False False  True  True]


Unnamed: 0_level_0,temperature,humidity
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-04-01 12:00:00,22.0,44
2023-05-01 12:00:00,31.3,46


### Manipulate time serie

In [101]:
data = {
    "temperature": [23.3, 24.4, 22.1, 22, 31.3],
    "humidity": [43, 44, 50, 44, 46],
    "timestamp" : [pd.Timestamp(year=2023, month=1, day=i, hour=12) for i in range(1,6)]
}

df = pd.DataFrame(data)
df.set_index("timestamp", inplace=True)

df

Unnamed: 0_level_0,temperature,humidity
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-01 12:00:00,23.3,43
2023-01-02 12:00:00,24.4,44
2023-01-03 12:00:00,22.1,50
2023-01-04 12:00:00,22.0,44
2023-01-05 12:00:00,31.3,46


In [102]:
# resample
df_resampled = df.resample("12h").bfill()

df_resampled

Unnamed: 0_level_0,temperature,humidity
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1
2023-01-01 12:00:00,23.3,43
2023-01-02 00:00:00,24.4,44
2023-01-02 12:00:00,24.4,44
2023-01-03 00:00:00,22.1,50
2023-01-03 12:00:00,22.1,50
2023-01-04 00:00:00,22.0,44
2023-01-04 12:00:00,22.0,44
2023-01-05 00:00:00,31.3,46
2023-01-05 12:00:00,31.3,46


In [110]:
# extract hours
df_hour = df.copy()

df_hour["Hour"] = df_hour.index.hour

df_hour

Unnamed: 0_level_0,temperature,humidity,Hour
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-01-01 12:00:00,23.3,43,12
2023-01-02 12:00:00,24.4,44,12
2023-01-03 12:00:00,22.1,50,12
2023-01-04 12:00:00,22.0,44,12
2023-01-05 12:00:00,31.3,46,12


In [112]:
# timestamp to string

df_ts_str = df.copy()

df_ts_str["ts_formatted"] = df_ts_str.index.strftime("%Y-%m-%d")

df_ts_str.info()
df_ts_str

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5 entries, 2023-01-01 12:00:00 to 2023-01-05 12:00:00
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   temperature   5 non-null      float64
 1   humidity      5 non-null      int64  
 2   ts_formatted  5 non-null      object 
dtypes: float64(1), int64(1), object(1)
memory usage: 160.0+ bytes


Unnamed: 0_level_0,temperature,humidity,ts_formatted
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-01-01 12:00:00,23.3,43,2023-01-01
2023-01-02 12:00:00,24.4,44,2023-01-02
2023-01-03 12:00:00,22.1,50,2023-01-03
2023-01-04 12:00:00,22.0,44,2023-01-04
2023-01-05 12:00:00,31.3,46,2023-01-05


In [113]:
# sort by date string values

df_sorted = df_ts_str.sort_values(by="ts_formatted", ascending=False)

df_sorted

Unnamed: 0_level_0,temperature,humidity,ts_formatted
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-01-05 12:00:00,31.3,46,2023-01-05
2023-01-04 12:00:00,22.0,44,2023-01-04
2023-01-03 12:00:00,22.1,50,2023-01-03
2023-01-02 12:00:00,24.4,44,2023-01-02
2023-01-01 12:00:00,23.3,43,2023-01-01
