# Practice with Pandas



## Import Data from Kaggle

In [19]:
import kagglehub

# Authenticate with Kaggle before downloading the dataset.
# Interactive alternative (prompts for credentials): kagglehub.login()
# Other supported methods (credential file, environment variables) are described at
# https://github.com/Kaggle/kagglehub?tab=readme-ov-file#authenticate

from kaggle.api.kaggle_api_extended import KaggleApi

# NOTE(review): this authenticates through the separate `kaggle` client package; presumably it
# picks up the credential file / environment variables instead of prompting — confirm.
# `api` is not referenced again in the visible cells.
api = KaggleApi()
api.authenticate()

In [37]:
import kagglehub

# Kaggle dataset handle in "<owner>/<dataset-slug>" form
DATASET_HANDLE = "ibrarhussain123/world-largest-cities-by-population-2024"

# Fetch the dataset (no version is pinned) and keep the directory
# that kagglehub reports the files were placed in
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/ibrarhussain123/world-largest-cities-by-population-2024?dataset_version_number=1...


100%|██████████| 16.9k/16.9k [00:00<00:00, 1.04MB/s]

Extracting files...
Path to dataset files: C:\Users\skyfree\.cache\kagglehub\datasets\ibrarhussain123\world-largest-cities-by-population-2024\versions\1





In [51]:
# Place the downloaded CSV in the project's data folder
import os, shutil

dest_path = "../Data/Datasets/big_cities.csv"

# os.listdir() returns entries in arbitrary order, so select the CSV explicitly
# instead of trusting whatever happens to be first in the listing.
files = sorted(name for name in os.listdir(path) if name.lower().endswith(".csv"))
if not files:
    raise FileNotFoundError(f"No CSV file found in {path}")

# The destination folder may not exist yet on a fresh checkout.
os.makedirs(os.path.dirname(dest_path), exist_ok=True)

# Copy rather than move: the source lives in the kagglehub download directory, and
# emptying it means a later re-run of this cell finds nothing to transfer.
# Like shutil.move, shutil.copy returns the destination path, so the cell output is unchanged.
shutil.copy(os.path.join(path, files[0]), dest_path)

'../Data/Datasets/big_cities.csv'

## Load Data into Pandas

In [56]:
import pandas as pd

# Relative location of the cities CSV inside the project's data folder
file_loc = "../Data/Datasets/big_cities.csv"

# Read the CSV into a DataFrame, then preview its first five rows
df = pd.read_csv(filepath_or_buffer=file_loc)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,City,Country,Population (2024),Population (2023),Growth Rate
0,0,Tokyo,Japan,37115035,37194105,-0.0021
1,1,Delhi,India,33807403,32941309,0.0263
2,2,Shanghai,China,29867918,29210808,0.0225
3,3,Dhaka,Bangladesh,23935652,23209616,0.0313
4,4,Sao Paulo,Brazil,22806704,22619736,0.0083


In [59]:
# Drop the leftover index columns ("Unnamed: 0.1" and "Unnamed: 0") that come with the CSV.
# Selecting them by name removes both in one pass and keeps the cell safe to re-run:
# dropping "whatever column is first" in place needed two runs to clear both columns,
# and one more accidental run would have removed "City".
unnamed_cols = [col for col in df.columns if str(col).startswith("Unnamed")]
df = df.drop(columns=unnamed_cols)

## Explore

In [61]:
# info() writes its report itself and returns None, so wrapping it in print()
# only appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 801 entries, 0 to 800
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   City               801 non-null    object 
 1   Country            801 non-null    object 
 2   Population (2024)  801 non-null    int64  
 3   Population (2023)  801 non-null    int64  
 4   Growth Rate        801 non-null    float64
dtypes: float64(1), int64(2), object(2)
memory usage: 31.4+ KB
None


In [62]:
# Summary statistics of the numeric columns; leaving the frame as the cell's last
# expression lets the notebook render it as a table instead of plain printed text.
df.describe()

       Population (2024)  Population (2023)  Growth Rate
count       8.010000e+02       8.010000e+02   801.000000
mean        2.654327e+06       2.604461e+06     0.020051
std         3.723253e+06       3.661201e+06     0.012180
min         7.500360e+05       7.228360e+05    -0.024900
25%         9.909310e+05       9.698040e+05     0.012200
50%         1.379368e+06       1.363510e+06     0.019700
75%         2.570980e+06       2.514077e+06     0.026600
max         3.711504e+07       3.719410e+07     0.058200


### Warmup Exercises

This part is a follow-along exercise based on *Learn Data Analytics with Pandas: Master Data Filtering in Python* by CodeWithJosh.

In [64]:
## Find cities with 2024 population greater than 20 million

# Boolean mask: True for rows whose 2024 population exceeds 20 million
condition = df["Population (2024)"] > 20_000_000

cities_more_than_20min_2024 = df.loc[condition]

# head() and describe() return frames, so they are printed explicitly to show all three
# results from one cell. info() prints its own report and returns None, so it is called
# bare; print(...info()) would add a stray "None" line.
print(cities_more_than_20min_2024.head())
cities_more_than_20min_2024.info()
print(cities_more_than_20min_2024.describe())

        City     Country  Population (2024)  Population (2023)  Growth Rate
0      Tokyo       Japan           37115035           37194105      -0.0021
1      Delhi       India           33807403           32941309       0.0263
2   Shanghai       China           29867918           29210808       0.0225
3      Dhaka  Bangladesh           23935652           23209616       0.0313
4  Sao Paulo      Brazil           22806704           22619736       0.0083
<class 'pandas.core.frame.DataFrame'>
Index: 9 entries, 0 to 8
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   City               9 non-null      object 
 1   Country            9 non-null      object 
 2   Population (2024)  9 non-null      int64  
 3   Population (2023)  9 non-null      int64  
 4   Growth Rate        9 non-null      float64
dtypes: float64(1), int64(2), object(2)
memory usage: 432.0+ bytes
None
       Population (2024)  Population (202

In [68]:
## Find largest cities in Vietnam

# Boolean mask: True for rows whose country is exactly "Vietnam"
condition = df["Country"] == "Vietnam"

cities_Vietnam = df.loc[condition]

# head() and describe() return frames, so they are printed explicitly to show all three
# results from one cell. info() prints its own report and returns None, so it is called
# bare; print(...info()) would add a stray "None" line.
print(cities_Vietnam.head())
cities_Vietnam.info()
print(cities_Vietnam.describe())

                 City  Country  Population (2024)  Population (2023)  \
38   Ho Chi Minh City  Vietnam            9567656            9320866   
79              Hanoi  Vietnam            5431801            5253385   
272           Can Tho  Vietnam            1938915            1865172   
375         Hai Phong  Vietnam            1463650            1422974   
460           Da Nang  Vietnam            1253228            1220634   

     Growth Rate  
38        0.0265  
79        0.0340  
272       0.0395  
375       0.0286  
460       0.0267  
<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, 38 to 508
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   City               6 non-null      object 
 1   Country            6 non-null      object 
 2   Population (2024)  6 non-null      int64  
 3   Population (2023)  6 non-null      int64  
 4   Growth Rate        6 non-null      float64
dtypes: float64(1),

In [71]:
## Find the Growing Cities in China

# Two masks combined with &: rows located in China AND with a positive growth rate
condition1 = df["Country"] == "China"
condition2 = df["Growth Rate"] > 0

growing_cities_in_China = df.loc[condition1 & condition2]

# head() and describe() return frames, so they are printed explicitly to show all three
# results from one cell. info() prints its own report and returns None, so it is called
# bare; print(...info()) would add a stray "None" line.
print(growing_cities_in_China.head())
growing_cities_in_China.info()
print(growing_cities_in_China.describe())

         City Country  Population (2024)  Population (2023)  Growth Rate
2    Shanghai   China           29867918           29210808       0.0225
7     Beijing   China           22189082           21766214       0.0194
10  Chongqing   China           17773923           17340704       0.0250
18  Guangzhou   China           14590096           14284353       0.0214
19    Tianjin   China           14470873           14238643       0.0163
<class 'pandas.core.frame.DataFrame'>
Index: 225 entries, 2 to 800
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   City               225 non-null    object 
 1   Country            225 non-null    object 
 2   Population (2024)  225 non-null    int64  
 3   Population (2023)  225 non-null    int64  
 4   Growth Rate        225 non-null    float64
dtypes: float64(1), int64(2), object(2)
memory usage: 10.5+ KB
None
       Population (2024)  Population (2023)  Growth Rate
co

In [72]:
## Total 2024 population of largest cities in India
condition = df["Country"].eq("India")

# Pick the 2024 population values of the Indian rows in a single .loc step and add them up
total_Pop_India = df.loc[condition, "Population (2024)"].sum()

print(total_Pop_India)

262676454


In [73]:
## Find cities with a growth rate decline and calculate their growth median
# Mask of rows whose growth rate is below zero
condition = df["Growth Rate"].lt(0)

# Median growth rate across only those declining rows (shown as the cell's result)
df.loc[condition, "Growth Rate"].median()

-0.0019

In [75]:
## Find the minimum 2023 population from the growing cities in China
# Uses the `growing_cities_in_China` frame built in the earlier China cell

growing_cities_in_China.loc[:, "Population (2023)"].min()

733810