In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Merge

In [14]:
# Two small frames that share the "ID" and "Name" columns; df2 additionally
# carries "Age". IDs 2 and 3 occur in both, but the names only agree for ID 3.
df1 = pd.DataFrame(
    {
        "ID": [1, 2, 3],
        "Name": ["Alice", "Bob", "Charlie"],
    }
)
df2 = pd.DataFrame(
    {
        "ID": [2, 3, 4],
        "Name": ["NotBob", "Charlie", "Tom"],
        "Age": [25, 30, 35],
    }
)

# Plain-text dump of both frames with a rule between them.
print(df1, "-------------------", df2, sep="\n")


   ID     Name
0   1    Alice
1   2      Bob
2   3  Charlie
-------------------
   ID     Name  Age
0   2   NotBob   25
1   3  Charlie   30
2   4      Tom   35


In [15]:
# Inner merge keyed on "ID" only. Just the key and "Age" are taken from df2,
# so df2's "Name" column never meets df1's "Name".
merged_df = df1.merge(df2[["ID", "Age"]], on="ID", how="inner")

merged_df

Unnamed: 0,ID,Name,Age
0,2,Bob,25
1,3,Charlie,30


In [16]:
# Merge on multiple columns: a row is kept only when both "ID" and "Name"
# agree between the two frames.
merged_df = df1.merge(df2, on=["ID", "Name"], how="inner")

merged_df

Unnamed: 0,ID,Name,Age
0,3,Charlie,30


In [18]:
# Merge on the row indexes instead of on a column. The overlapping column
# names ("ID", "Name") are told apart with the given suffixes.
merged_df = df1.merge(
    df2,
    how="inner",
    left_index=True,
    right_index=True,
    suffixes=("_left", "_right"),
)

merged_df

Unnamed: 0,ID_left,Name_left,ID_right,Name_right,Age
0,1,Alice,2,NotBob,25
1,2,Bob,3,Charlie,30
2,3,Charlie,4,Tom,35


# Join

In [20]:
# Frames for the join examples. Both get an explicit index; df2 repeats the
# index label 3 (and the matching row) on purpose.
df1 = pd.DataFrame(
    {
        "ID": [1, 2, 3],
        "Name": ["Alice", "Bob", "Charlie"],
    },
    index=[1, 2, 3],
)
df2 = pd.DataFrame(
    {
        "ID": [2, 3, 3, 4],
        "Name": ["NotBob", "Charlie", "Charlie", "Tom"],
        "Age": [25, 30, 30, 35],
    },
    index=[2, 3, 3, 4],
)

# Plain-text dump of both frames with a rule between them.
print(df1, "-------------------", df2, sep="\n")

   ID     Name
1   1    Alice
2   2      Bob
3   3  Charlie
-------------------
   ID     Name  Age
2   2   NotBob   25
3   3  Charlie   30
3   3  Charlie   30
4   4      Tom   35


In [22]:
# Inner join on the indexes: only labels present in both frames survive.
# The shared column names are disambiguated with the left/right suffixes.
df_joined = df1.join(
    df2,
    lsuffix="_left",
    rsuffix="_right",
    how="inner",
)

df_joined

Unnamed: 0,ID_left,Name_left,ID_right,Name_right,Age
2,2,Bob,2,NotBob,25
3,3,Charlie,3,Charlie,30
3,3,Charlie,3,Charlie,30


In [23]:
# Outer join on the indexes: every label from either frame is kept, and the
# side that has no matching row is filled with missing values.
df_joined = df1.join(
    df2,
    lsuffix="_left",
    rsuffix="_right",
    how="outer",
)

df_joined

Unnamed: 0,ID_left,Name_left,ID_right,Name_right,Age
1,1.0,Alice,,,
2,2.0,Bob,2.0,NotBob,25.0
3,3.0,Charlie,3.0,Charlie,30.0
3,3.0,Charlie,3.0,Charlie,30.0
4,,,4.0,Tom,35.0


In [24]:
# Left join on the indexes: every row of df1 is kept; df2's columns are
# missing where df2 has no row with that label.
df_joined = df1.join(
    df2,
    lsuffix="_left",
    rsuffix="_right",
    how="left",
)

df_joined

Unnamed: 0,ID_left,Name_left,ID_right,Name_right,Age
1,1,Alice,,,
2,2,Bob,2.0,NotBob,25.0
3,3,Charlie,3.0,Charlie,30.0
3,3,Charlie,3.0,Charlie,30.0


In [25]:
# Right join on the indexes: every row of df2 is kept; df1's columns are
# missing where df1 has no row with that label.
df_joined = df1.join(
    df2,
    lsuffix="_left",
    rsuffix="_right",
    how="right",
)

df_joined

Unnamed: 0,ID_left,Name_left,ID_right,Name_right,Age
2,2.0,Bob,2,NotBob,25
3,3.0,Charlie,3,Charlie,30
3,3.0,Charlie,3,Charlie,30
4,,,4,Tom,35


In [31]:
# Inner join on the indexes again, this time asking pandas to validate that
# the join is one-to-one. The validation fails here, so the MergeError is
# caught and reported rather than left to abort the cell.

# Placeholder so `df_joined` is defined even when the join is rejected.
df_joined = pd.DataFrame()

try:
    df_joined = df1.join(
        df2,
        how="inner",
        lsuffix="_left",
        rsuffix="_right",
        validate="one_to_one",
    )
except pd.errors.MergeError:
    print("Not a 1:1 join")

df_joined

Not a 1:1 join


# Concat

In [43]:
# Frames for the concat examples: df1 has a "City" column that df2 lacks,
# df2 has an "Age" column that df1 lacks, and their indexes overlap on 1-3.
df1 = pd.DataFrame(
    {
        "ID": [1, 2, 3],
        "Name": ["Alice", "Bob", "Charlie"],
        "City": ["Zurich", "Basel", "Zurich"],
    },
    index=[1, 2, 3],
)
df2 = pd.DataFrame(
    {
        "ID": [2, 3, 3, 4],
        "Name": ["NotBob", "Charlie", "Charlie", "Tom"],
        "Age": [25, 30, 30, 35],
    },
    index=[1, 2, 3, 4],
)

# Plain-text dump of both frames with a rule between them.
print(df1, "-------------------", df2, sep="\n")

   ID     Name    City
1   1    Alice  Zurich
2   2      Bob   Basel
3   3  Charlie  Zurich
-------------------
   ID     Name  Age
1   2   NotBob   25
2   3  Charlie   30
3   3  Charlie   30
4   4      Tom   35


In [35]:
# Stack df2 underneath df1 (row-wise is the default axis). Columns are
# unioned, so a column that one frame lacks shows up as missing for its rows.
df_concat = pd.concat(
    [df1, df2],
    axis=0,
)

df_concat

Unnamed: 0,ID,Name,City,Age
1,1,Alice,Zurich,
2,2,Bob,Basel,
3,3,Charlie,Zurich,
2,2,NotBob,,25.0
3,3,Charlie,,30.0
3,3,Charlie,,30.0
4,4,Tom,,35.0


In [56]:
# Row-wise concat that keeps only the columns both frames have in common
# ("ID" and "Name"); "City" and "Age" are dropped.
df_concat = pd.concat(
    [df1, df2],
    join="inner",
)

df_concat

Unnamed: 0,ID,Name
1,1,Alice
2,2,Bob
3,3,Charlie
1,2,NotBob
2,3,Charlie
3,3,Charlie
4,4,Tom


In [57]:
# Row-wise concat that discards the original index labels and renumbers the
# result 0..n-1, which avoids the duplicated labels of the plain concat.
df_concat = pd.concat(
    [df1, df2],
    ignore_index=True,
)

df_concat

Unnamed: 0,ID,Name,City,Age
0,1,Alice,Zurich,
1,2,Bob,Basel,
2,3,Charlie,Zurich,
3,2,NotBob,,25.0
4,3,Charlie,,30.0
5,3,Charlie,,30.0
6,4,Tom,,35.0


In [59]:
# Row-wise concat that tags each row with the frame it came from: the keys
# become an outer index level above the original labels.
df_concat = pd.concat(
    [df1, df2],
    keys=["df1", "df2"],
)

df_concat

Unnamed: 0,Unnamed: 1,ID,Name,City,Age
df1,1,1,Alice,Zurich,
df1,2,2,Bob,Basel,
df1,3,3,Charlie,Zurich,
df2,1,2,NotBob,,25.0
df2,2,3,Charlie,,30.0
df2,3,3,Charlie,,30.0
df2,4,4,Tom,,35.0


In [60]:
# Column-wise concat: the frames are placed side by side and rows are aligned
# on the index, so a label only one frame has (4) gets missing values for the
# other frame's columns.
df_concat = pd.concat(
    [df1, df2],
    axis=1,
)

df_concat

Unnamed: 0,0,1,2,3,4,5
1,1.0,Alice,Zurich,2,NotBob,25
2,2.0,Bob,Basel,3,Charlie,30
3,3.0,Charlie,Zurich,3,Charlie,30
4,,,,4,Tom,35


# Cut function

In [66]:
# Sample ages; several lie below the first bin edge or above the last one.
ages = [
    10, 15, 20, 24, 26,
    30, 31, 35, 40, 46,
    50, 55, 60, 70, 100,
]

# Four edges delimit three consecutive bins: 20-40, 40-60 and 60-80.
bin_edges = [20, 40, 60, 80]

In [68]:
# Bin the ages with the default settings. As the result shows, each bin is
# closed on the right: 40 is labelled "20-40" while 20 itself gets no label.
age_groups = pd.cut(
    ages,
    bins=bin_edges,
    labels=["20-40", "40-60", "60-80"],
)

# Put the raw ages next to their assigned group.
df = pd.DataFrame(
    {
        "Age": ages,
        "Age_group": age_groups,
    }
)

df

Unnamed: 0,Age,Age_group
0,10,
1,15,
2,20,
3,24,20-40
4,26,20-40
5,30,20-40
6,31,20-40
7,35,20-40
8,40,20-40
9,46,40-60


In [69]:
# Same binning, but with include_lowest=True the first bin also takes its
# lower edge, so 20 is now labelled "20-40".
age_groups = pd.cut(
    ages,
    bins=bin_edges,
    labels=["20-40", "40-60", "60-80"],
    include_lowest=True,
)

# Put the raw ages next to their assigned group.
df = pd.DataFrame(
    {
        "Age": ages,
        "Age_group": age_groups,
    }
)

df

Unnamed: 0,Age,Age_group
0,10,
1,15,
2,20,20-40
3,24,20-40
4,26,20-40
5,30,20-40
6,31,20-40
7,35,20-40
8,40,20-40
9,46,40-60


In [70]:
# Same binning with right=False: as the result shows, each bin now includes
# its left edge and excludes its right one, so 20 is labelled "20-40" and 40
# moves to "40-60".
age_groups = pd.cut(
    ages,
    bins=bin_edges,
    labels=["20-40", "40-60", "60-80"],
    right=False,
    include_lowest=True,
)

# Put the raw ages next to their assigned group.
df = pd.DataFrame(
    {
        "Age": ages,
        "Age_group": age_groups,
    }
)

df

Unnamed: 0,Age,Age_group
0,10,
1,15,
2,20,20-40
3,24,20-40
4,26,20-40
5,30,20-40
6,31,20-40
7,35,20-40
8,40,40-60
9,46,40-60


# QCut function

In [74]:
# Scores that will be split into three performance bands.
scores = [65, 72, 78, 80, 85, 90, 95, 98, 99]

# Quantile cut points handed to pd.qcut: up to the 33rd percentile, from there
# to the 66th percentile, and the remainder.
quantiles = [0, 0.33, 0.66, 1]
# The original (misspelled) name is kept as an alias because the qcut cell
# further down still refers to the cut points by that name.
quaniles = quantiles

# One label per band, ordered from the lowest band to the highest.
labels = ["Low", "Medium", "High"]

In [75]:
# Assign each score to a band delimited by the quantile cut points defined in
# the previous cell; unlike pd.cut, the edges come from the data's quantiles.
performance_categories = pd.qcut(
    scores,
    q=quaniles,
    labels=labels,
)

# Put the raw scores next to their assigned band.
df = pd.DataFrame(
    {
        "Scores": scores,
        "Performance_category": performance_categories,
    }
)

df

Unnamed: 0,Scores,Performance_category
0,65,Low
1,72,Low
2,78,Low
3,80,Medium
4,85,Medium
5,90,Medium
6,95,High
7,98,High
8,99,High


# Pivot

In [3]:
# Long-format sales records: one row per (Date, Product) pair.
data = {
    "Date": [
        "2024-01-01",
        "2024-01-01",
        "2024-01-02",
        "2024-01-02",
    ],
    "Product": ["A", "B", "A", "B"],
    "Amount": [100, 150, 120, 180],
}

df = pd.DataFrame(data)

df

Unnamed: 0,Date,Product,Amount
0,2024-01-01,A,100
1,2024-01-01,B,150
2,2024-01-02,A,120
3,2024-01-02,B,180


In [16]:
# Reshape long -> wide: one row per Date, one column per Product, with the
# Amount as the cell value.
df_pivot = pd.pivot(
    df,
    index="Date",
    columns="Product",
    values="Amount",
)

df_pivot

Product,A,B
Date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-01-01,100,150
2024-01-02,120,180


In [21]:
# Column labels of the pivoted frame: the distinct "Product" values, with the
# axis itself named "Product".
df_pivot.columns

Index(['A', 'B'], dtype='object', name='Product')

In [22]:
# Row labels of the pivoted frame: the distinct "Date" strings, with the axis
# itself named "Date".
df_pivot.index

Index(['2024-01-01', '2024-01-02'], dtype='object', name='Date')

# Melt

In [23]:
# Wide-format survey answers: one row per participant, one column per question.
data = {
    "Participant": [
        "Alice",
        "Bob",
        "Charlie",
    ],
    "Q1": [4, 5, 3],
    "Q2": [5, 3, 4],
    "Q3": [3, 4, 5],
}

df = pd.DataFrame(data)

df

Unnamed: 0,Participant,Q1,Q2,Q3
0,Alice,4,5,3
1,Bob,5,3,4
2,Charlie,3,4,5


In [24]:
# Reshape wide -> long: each (participant, question) pair becomes its own row,
# with the question name in "Question" and the answer in "Response".
df_melt = df.melt(
    id_vars="Participant",
    value_vars=["Q1", "Q2", "Q3"],
    var_name="Question",
    value_name="Response",
)

df_melt

Unnamed: 0,Participant,Question,Response
0,Alice,Q1,4
1,Bob,Q1,5
2,Charlie,Q1,3
3,Alice,Q2,5
4,Bob,Q2,3
5,Charlie,Q2,4
6,Alice,Q3,3
7,Bob,Q3,4
8,Charlie,Q3,5


# Crosstab

In [25]:
# Survey rows: gender, education level and a satisfaction score per respondent.
data = {
    "Gender": ["M", "F", "M", "F", "F"],
    "Education": [
        "High school",
        "College",
        "High school",
        "College",
        "Graduate",
    ],
    "Satisfaction": [3, 4, 5, 4, 5],
}

df = pd.DataFrame(data)

df

Unnamed: 0,Gender,Education,Satisfaction
0,M,High school,3
1,F,College,4
2,M,High school,5
3,F,College,4
4,F,Graduate,5


In [27]:
# Count respondents per (Gender, Education) combination, broken down by
# Satisfaction score.
df_crosstab = pd.crosstab(
    index=[df["Gender"], df["Education"]],
    columns=df["Satisfaction"],
)

df_crosstab

Unnamed: 0_level_0,Satisfaction,3,4,5
Gender,Education,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
F,College,0,2,0
F,Graduate,0,0,1
M,High school,1,0,1


In [28]:
# Same frequency table, with margins=True adding a totals row and a totals
# column (both labelled "All").
df_crosstab = pd.crosstab(
    index=[df["Gender"], df["Education"]],
    columns=df["Satisfaction"],
    margins=True,
)

df_crosstab

Unnamed: 0_level_0,Satisfaction,3,4,5,All
Gender,Education,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,College,0,2,0,2
F,Graduate,0,0,1,1
M,High school,1,0,1,2
All,,1,2,2,5


In [29]:
# Same frequency table with totals, but the totals row and column are
# labelled "Total" instead of the default name.
df_crosstab = pd.crosstab(
    index=[df["Gender"], df["Education"]],
    columns=df["Satisfaction"],
    margins=True,
    margins_name="Total",
)

df_crosstab

Unnamed: 0_level_0,Satisfaction,3,4,5,Total
Gender,Education,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
F,College,0,2,0,2
F,Graduate,0,0,1,1
M,High school,1,0,1,2
Total,,1,2,2,5
