## Library Imports

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Data loading & Analysis

In [None]:
# Load the population dataset from the Kaggle input directory into a DataFrame.
# The path is hard-coded, so this cell only runs where that file is present.
raw_df = pd.read_csv("/kaggle/input/population-2022/population.csv")

In [None]:
# Preview the first rows of the raw data (pandas shows 5 rows by default).
raw_df.head()

In [None]:
# Preview the last rows of the raw data (pandas shows 5 rows by default).
raw_df.tail()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); with default
# arguments pandas reports these for the numeric columns.
raw_df.describe()

In [None]:
# Print the column dtypes, non-null counts and memory usage of the frame.
raw_df.info()

In [None]:
# Count the missing (NaN) values in each column.
raw_df.isna().sum()

In [None]:
# List the column labels; the cells below select columns by these exact names.
raw_df.columns

In [None]:
# Build the working frame for the population plots: keep only the year,
# country and population columns and give them lower-case names.
# Selecting and renaming (instead of assembling a mapping in a variable called
# `dict`) avoids shadowing the builtin `dict` for the rest of the notebook
# session.
sub_df = raw_df[['Year', 'Country', 'Population']].rename(
    columns={'Year': 'year', 'Country': 'country', 'Population': 'population'}
)
sub_df.head()

## Exploratory Data Analysis

### Year-over-year population growth by country (100 largest and 100 smallest population data points)

In [None]:
# Draw two line charts of population against year, one line per country:
# first for the 100 rows with the largest population values, then for the
# 100 rows with the smallest.
for sort_ascending in (False, True):
    plt.figure(figsize=(12, 5))
    data = sub_df.sort_values(by=['population'], ascending=sort_ascending).head(100)
    sns.lineplot(
        data=data,
        x='year',
        y='population',
        hue='country',
        legend=True,
    )
    plt.show()

### Top 20 most populated and 20 least populated countries as of 2020

In [None]:
# Bar charts of the 20 most and the 20 least populated countries in 2020.
#
# The figure is created *before* seaborn draws. sns.barplot is an axes-level
# function that draws onto the current axes, so creating the figure afterwards
# left the chart at the default size and added an extra, empty 12x5 figure.
population_2020 = sub_df[sub_df['year'] == 2020]

# 20 largest populations.
plt.figure(figsize=(12, 5))
data = population_2020.sort_values(by=['population'], ascending=False)[:20]
plot = sns.barplot(data=data, x="country", y="population")
# Country names are long; rotate them so the tick labels do not overlap.
plt.setp(plot.get_xticklabels(), rotation=90)
plt.show()

# 20 smallest populations.
plt.figure(figsize=(12, 5))
data = population_2020.sort_values(by=['population'], ascending=True)[:20]
plot = sns.barplot(data=data, x="country", y="population")
plt.setp(plot.get_xticklabels(), rotation=90)
plt.show()

### Top 20 and bottom 20 countries by global population rank as of 2020

In [None]:
# Bar charts of the 20 best-ranked (smallest rank number) and the 20
# worst-ranked countries by global population rank in 2020.
#
# The working frame is built by selecting and renaming columns rather than
# through a variable called `dict`, which shadowed the builtin.
sub_df = raw_df[['Year', 'Country', 'GlobalRank']].rename(
    columns={'Year': 'year', 'Country': 'country', 'GlobalRank': 'rank'}
)
rank_2020 = sub_df[sub_df['year'] == 2020]

# The figure must exist before sns.barplot is called: barplot draws onto the
# current axes, so creating the figure afterwards left the chart at the
# default size and showed an extra empty figure.
plt.figure(figsize=(12, 5))
data = rank_2020.sort_values(by=['rank'], ascending=True)[:20]
plot = sns.barplot(data=data, x="country", y="rank")
# Rotate the country names so the tick labels do not overlap.
plt.setp(plot.get_xticklabels(), rotation=90)
plt.show()

plt.figure(figsize=(12, 5))
data = rank_2020.sort_values(by=['rank'], ascending=False)[:20]
plot = sns.barplot(data=data, x="country", y="rank")
plt.setp(plot.get_xticklabels(), rotation=90)
plt.show()

### Top 20 and bottom 20 countries by yearly % change as of 2020

In [None]:
# Bar charts of the 20 countries with the highest and the 20 with the lowest
# yearly % change in 2020.
#
# The working frame is built by selecting and renaming columns rather than
# through a variable called `dict`, which shadowed the builtin.
sub_df = raw_df[['Year', 'Country', 'Yearly%Change']].rename(
    columns={'Year': 'year', 'Country': 'country', 'Yearly%Change': 'yearly%change'}
)
change_2020 = sub_df[sub_df['year'] == 2020]

# The figure must exist before sns.barplot is called: barplot draws onto the
# current axes, so creating the figure afterwards left the chart at the
# default size and showed an extra empty figure.
plt.figure(figsize=(12, 5))
data = change_2020.sort_values(by=['yearly%change'], ascending=False)[:20]
plot = sns.barplot(data=data, x="country", y="yearly%change")
# Rotate the country names so the tick labels do not overlap.
plt.setp(plot.get_xticklabels(), rotation=90)
plt.show()

plt.figure(figsize=(12, 5))
data = change_2020.sort_values(by=['yearly%change'], ascending=True)[:20]
plot = sns.barplot(data=data, x="country", y="yearly%change")
plt.setp(plot.get_xticklabels(), rotation=90)
plt.show()

### Median age distribution plot for the top 30 countries

In [None]:
# KDE of median age, coloured by country, for the 30 rows with the lowest
# median age after dropping rows whose median age is exactly 0.0.
# NOTE(review): 0.0 presumably marks a missing value in the dataset - confirm.
#
# The working frame is built by selecting and renaming columns rather than
# through a variable called `dict`, which shadowed the builtin.
sub_df = raw_df[['Year', 'Country', 'MedianAge']].rename(
    columns={'Year': 'year', 'Country': 'country', 'MedianAge': 'age'}
)

# Build the mask from the *sorted* frame so its index lines up with the frame
# it filters. Masking the sorted frame with a Series taken from the unsorted
# one selects the same rows, but only through pandas' implicit reindexing
# (which raises a UserWarning).
sorted_by_age = sub_df.sort_values(by=['age'], ascending=True)
data = sorted_by_age[sorted_by_age['age'] != 0.0][:30]

# sns.displot is figure-level and creates its own figure, so a preceding
# plt.figure(...) only produced a stray empty figure. The 12x5 size is
# requested through height/aspect instead (width = height * aspect).
sns.displot(data=data, x="age", hue="country", kind="kde", height=5, aspect=12 / 5)
plt.show()

### Bivariate analysis of top 30 countries: median age vs. fertility rate

In [None]:
# Violin plot of fertility rate against median age for the 30 rows with the
# lowest median age after dropping rows whose median age is exactly 0.0.
# NOTE(review): 0.0 presumably marks a missing value in the dataset - confirm.
#
# The working frame is built by selecting and renaming columns rather than
# through a variable called `dict`, which shadowed the builtin.
sub_df = raw_df[['Year', 'Country', 'MedianAge', 'FertilityRate']].rename(
    columns={
        'Year': 'year',
        'Country': 'country',
        'MedianAge': 'age',
        'FertilityRate': 'fertility',
    }
)

# Build the mask from the *sorted* frame so its index lines up with the frame
# it filters. Masking the sorted frame with a Series taken from the unsorted
# one selects the same rows, but only through pandas' implicit reindexing
# (which raises a UserWarning).
sorted_by_age = sub_df.sort_values(by=['age'], ascending=True)
data = sorted_by_age[sorted_by_age['age'] != 0.0][:30]

# violinplot is axes-level, so the figure is sized before drawing.
plt.figure(figsize=(12, 5))
sns.violinplot(data=data, x="age", y="fertility")
plt.show()