In [1]:
import pandas as pd

df = pd.DataFrame({
    "festival": ["Carnival", "Oktoberfest", "La Tomatina", "Rio New Year", "Harbin Ice Festival", "Lantern Festival", "Cherry Blossom Festival", "Dia de los Muertos"],
    "country": ["Brazil", "Germany", "Spain", "Brazil", "China", "China", "Japan", "Mexico"],
    "month": ["February", "September", "August", "January", "January", "February", "April", "November"],
    "theme": ["Parade", "Beer Celebration", "Tomato Fight", "Fireworks", "Ice Sculptures", "Lanterns", "Nature", "Tradition"],
    "popularity_score": [98, 95, 88, 92, 90, 85, 89, 91]
})


**1️⃣ Load and Inspect the Dataset**

Load dataset

In [3]:
df

Unnamed: 0,festival,country,month,theme,popularity_score
0,Carnival,Brazil,February,Parade,98
1,Oktoberfest,Germany,September,Beer Celebration,95
2,La Tomatina,Spain,August,Tomato Fight,88
3,Rio New Year,Brazil,January,Fireworks,92
4,Harbin Ice Festival,China,January,Ice Sculptures,90
5,Lantern Festival,China,February,Lanterns,85
6,Cherry Blossom Festival,Japan,April,Nature,89
7,Dia de los Muertos,Mexico,November,Tradition,91


Check the first 5 rows 

In [4]:
df.head(5)

Unnamed: 0,festival,country,month,theme,popularity_score
0,Carnival,Brazil,February,Parade,98
1,Oktoberfest,Germany,September,Beer Celebration,95
2,La Tomatina,Spain,August,Tomato Fight,88
3,Rio New Year,Brazil,January,Fireworks,92
4,Harbin Ice Festival,China,January,Ice Sculptures,90


Check the last 3 rows

In [5]:
df.tail(3)

Unnamed: 0,festival,country,month,theme,popularity_score
5,Lantern Festival,China,February,Lanterns,85
6,Cherry Blossom Festival,Japan,April,Nature,89
7,Dia de los Muertos,Mexico,November,Tradition,91


Inspect columns, data types, and shape

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8 entries, 0 to 7
Data columns (total 5 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   festival          8 non-null      object
 1   country           8 non-null      object
 2   month             8 non-null      object
 3   theme             8 non-null      object
 4   popularity_score  8 non-null      int64 
dtypes: int64(1), object(4)
memory usage: 448.0+ bytes


Get summary statistics

In [None]:
df.describe()# this wont work as there are no numerical columns

**2️⃣ Basic Selection**

Select only the festival column

In [8]:
df.festival

0                   Carnival
1                Oktoberfest
2                La Tomatina
3               Rio New Year
4        Harbin Ice Festival
5           Lantern Festival
6    Cherry Blossom Festival
7         Dia de los Muertos
Name: festival, dtype: object

Select festival and country columns

In [9]:
df[['festival','country']]

Unnamed: 0,festival,country
0,Carnival,Brazil
1,Oktoberfest,Germany
2,La Tomatina,Spain
3,Rio New Year,Brazil
4,Harbin Ice Festival,China
5,Lantern Festival,China
6,Cherry Blossom Festival,Japan
7,Dia de los Muertos,Mexico


Select first 3 rows using iloc

In [10]:
df.iloc[:3]

Unnamed: 0,festival,country,month,theme,popularity_score
0,Carnival,Brazil,February,Parade,98
1,Oktoberfest,Germany,September,Beer Celebration,95
2,La Tomatina,Spain,August,Tomato Fight,88


Select rows 2–5 using loc

In [11]:
df.loc[2:5]

Unnamed: 0,festival,country,month,theme,popularity_score
2,La Tomatina,Spain,August,Tomato Fight,88
3,Rio New Year,Brazil,January,Fireworks,92
4,Harbin Ice Festival,China,January,Ice Sculptures,90
5,Lantern Festival,China,February,Lanterns,85


**3️⃣ Filtering Rows**

Festivals celebrated in China

In [13]:
df[df.country=="China"]

Unnamed: 0,festival,country,month,theme,popularity_score
4,Harbin Ice Festival,China,January,Ice Sculptures,90
5,Lantern Festival,China,February,Lanterns,85


Festivals celebrated in Brazil or Japan

In [14]:
df[df.country.isin(["Brazil","Japan"])]

Unnamed: 0,festival,country,month,theme,popularity_score
0,Carnival,Brazil,February,Parade,98
3,Rio New Year,Brazil,January,Fireworks,92
6,Cherry Blossom Festival,Japan,April,Nature,89


Festivals with popularity_score > 90

In [15]:
df[df.popularity_score>90]

Unnamed: 0,festival,country,month,theme,popularity_score
0,Carnival,Brazil,February,Parade,98
1,Oktoberfest,Germany,September,Beer Celebration,95
3,Rio New Year,Brazil,January,Fireworks,92
7,Dia de los Muertos,Mexico,November,Tradition,91


Festivals in January

In [16]:
df[df.month=="January"]

Unnamed: 0,festival,country,month,theme,popularity_score
3,Rio New Year,Brazil,January,Fireworks,92
4,Harbin Ice Festival,China,January,Ice Sculptures,90


Festivals where theme contains "Lantern" or "Ice" 

In [21]:
df[df.theme.str.contains("Lantern|Ice")] # use | instead of ',', because we would by mistake forget its a string not a list

Unnamed: 0,festival,country,month,theme,popularity_score
4,Harbin Ice Festival,China,January,Ice Sculptures,90
5,Lantern Festival,China,February,Lanterns,85


Festivals in China and popularity > 85

In [22]:
df[(df["country"]=="China") & (df["popularity_score"]>85)]

Unnamed: 0,festival,country,month,theme,popularity_score
4,Harbin Ice Festival,China,January,Ice Sculptures,90


Festivals in Japan or popularity > 90

In [None]:
df[(df["country"]=="Japan") & (df["popularity_score"]>90)] # this will return empty dataframe as no such record exists

Unnamed: 0,festival,country,month,theme,popularity_score


**4️⃣ Sorting**

Sort festivals by popularity_score descending

In [25]:
df.sort_values("popularity_score", ascending=False)

Unnamed: 0,festival,country,month,theme,popularity_score
0,Carnival,Brazil,February,Parade,98
1,Oktoberfest,Germany,September,Beer Celebration,95
3,Rio New Year,Brazil,January,Fireworks,92
7,Dia de los Muertos,Mexico,November,Tradition,91
4,Harbin Ice Festival,China,January,Ice Sculptures,90
6,Cherry Blossom Festival,Japan,April,Nature,89
2,La Tomatina,Spain,August,Tomato Fight,88
5,Lantern Festival,China,February,Lanterns,85


Sort festivals by month alphabetically

In [26]:
df.sort_values("month")

Unnamed: 0,festival,country,month,theme,popularity_score
6,Cherry Blossom Festival,Japan,April,Nature,89
2,La Tomatina,Spain,August,Tomato Fight,88
0,Carnival,Brazil,February,Parade,98
5,Lantern Festival,China,February,Lanterns,85
3,Rio New Year,Brazil,January,Fireworks,92
4,Harbin Ice Festival,China,January,Ice Sculptures,90
7,Dia de los Muertos,Mexico,November,Tradition,91
1,Oktoberfest,Germany,September,Beer Celebration,95


Sort by country, then by popularity_score descending

In [28]:
df.sort_values(["country","popularity_score"],ascending=[True,False])

Unnamed: 0,festival,country,month,theme,popularity_score
0,Carnival,Brazil,February,Parade,98
3,Rio New Year,Brazil,January,Fireworks,92
4,Harbin Ice Festival,China,January,Ice Sculptures,90
5,Lantern Festival,China,February,Lanterns,85
1,Oktoberfest,Germany,September,Beer Celebration,95
6,Cherry Blossom Festival,Japan,April,Nature,89
7,Dia de los Muertos,Mexico,November,Tradition,91
2,La Tomatina,Spain,August,Tomato Fight,88


**5️⃣ Highlighting Key Insights**

Most popular festival (idxmax())

In [31]:
df.loc[df.popularity_score.idxmax(),"festival"]

'Carnival'

Least popular festival (idxmin())

In [35]:
df.loc[df.popularity_score.idxmin(),"festival"]

'Lantern Festival'

Top 3 festivals by popularity

In [36]:
df.sort_values("popularity_score",ascending=False).head(3)

Unnamed: 0,festival,country,month,theme,popularity_score
0,Carnival,Brazil,February,Parade,98
1,Oktoberfest,Germany,September,Beer Celebration,95
3,Rio New Year,Brazil,January,Fireworks,92


Bottom 3 festivals by popularity

In [37]:
df.sort_values("popularity_score",ascending=False).tail(3)

Unnamed: 0,festival,country,month,theme,popularity_score
6,Cherry Blossom Festival,Japan,April,Nature,89
2,La Tomatina,Spain,August,Tomato Fight,88
5,Lantern Festival,China,February,Lanterns,85


**6 Adding / Modifying Columns**

Add is_popular → True if popularity_score > 90

In [40]:
def t(popularity_score):
    if popularity_score>90:
        return True
    else:
        return False
    
df["is_popular"]=df.popularity_score.apply(t)  
df  

Unnamed: 0,festival,country,month,theme,popularity_score,is_popular
0,Carnival,Brazil,February,Parade,98,True
1,Oktoberfest,Germany,September,Beer Celebration,95,True
2,La Tomatina,Spain,August,Tomato Fight,88,False
3,Rio New Year,Brazil,January,Fireworks,92,True
4,Harbin Ice Festival,China,January,Ice Sculptures,90,False
5,Lantern Festival,China,February,Lanterns,85,False
6,Cherry Blossom Festival,Japan,April,Nature,89,False
7,Dia de los Muertos,Mexico,November,Tradition,91,True


Add festival_length → length of festival name

In [45]:

df["festival_length"]=df.festival.str.len()
df

Unnamed: 0,festival,country,month,theme,popularity_score,is_popular,festival_length
0,Carnival,Brazil,February,Parade,98,True,8
1,Oktoberfest,Germany,September,Beer Celebration,95,True,11
2,La Tomatina,Spain,August,Tomato Fight,88,False,11
3,Rio New Year,Brazil,January,Fireworks,92,True,12
4,Harbin Ice Festival,China,January,Ice Sculptures,90,False,19
5,Lantern Festival,China,February,Lanterns,85,False,16
6,Cherry Blossom Festival,Japan,April,Nature,89,False,23
7,Dia de los Muertos,Mexico,November,Tradition,91,True,18


Capitalize all festival names (.str.upper())

In [50]:
df["festival"]=df.festival.str.upper()
df

Unnamed: 0,festival,country,month,theme,popularity_score,is_popular,festival_length,popularity_level
0,CARNIVAL,Brazil,February,Parade,98,True,8,Very popular
1,OKTOBERFEST,Germany,September,Beer Celebration,95,True,11,Very popular
2,LA TOMATINA,Spain,August,Tomato Fight,88,False,11,Popular
3,RIO NEW YEAR,Brazil,January,Fireworks,92,True,12,Very popular
4,HARBIN ICE FESTIVAL,China,January,Ice Sculptures,90,False,19,Popular
5,LANTERN FESTIVAL,China,February,Lanterns,85,False,16,Popular
6,CHERRY BLOSSOM FESTIVAL,Japan,April,Nature,89,False,23,Popular
7,DIA DE LOS MUERTOS,Mexico,November,Tradition,91,True,18,Very popular


**7 Combining Conditions**

Festivals in Japan or popularity > 90

In [46]:
df[(df.country=="Japan") | (df.popularity_score>90)]

Unnamed: 0,festival,country,month,theme,popularity_score,is_popular,festival_length
0,Carnival,Brazil,February,Parade,98,True,8
1,Oktoberfest,Germany,September,Beer Celebration,95,True,11
3,Rio New Year,Brazil,January,Fireworks,92,True,12
6,Cherry Blossom Festival,Japan,April,Nature,89,False,23
7,Dia de los Muertos,Mexico,November,Tradition,91,True,18


Festivals in January and theme is "Ice Sculptures"

In [47]:
df[(df.month=="January") & (df.theme.isin(["Ice Sculptures"]))]

Unnamed: 0,festival,country,month,theme,popularity_score,is_popular,festival_length
4,Harbin Ice Festival,China,January,Ice Sculptures,90,False,19


Create a popularity category column:

- 90 → "Very Popular"

- 80–90 → "Popular"

- <80 → "Less Popular"

In [48]:
def popularity(popularity_score):
    if popularity_score>90:
        return " Very popular"
    elif 80 <= popularity_score <= 90:
        return "Popular"
    elif popularity_score < 80:
        return "Less popular"
df["popularity_level"]=df.popularity_score.apply(popularity)
df
    

Unnamed: 0,festival,country,month,theme,popularity_score,is_popular,festival_length,popularity_level
0,Carnival,Brazil,February,Parade,98,True,8,Very popular
1,Oktoberfest,Germany,September,Beer Celebration,95,True,11,Very popular
2,La Tomatina,Spain,August,Tomato Fight,88,False,11,Popular
3,Rio New Year,Brazil,January,Fireworks,92,True,12,Very popular
4,Harbin Ice Festival,China,January,Ice Sculptures,90,False,19,Popular
5,Lantern Festival,China,February,Lanterns,85,False,16,Popular
6,Cherry Blossom Festival,Japan,April,Nature,89,False,23,Popular
7,Dia de los Muertos,Mexico,November,Tradition,91,True,18,Very popular


Checking unique values

In [49]:
df.month.unique()

array(['February', 'September', 'August', 'January', 'April', 'November'],
      dtype=object)