In [1]:
import numpy as np # type: ignore
import pandas as pd # type: ignore

In [2]:
# A Series whose index labels are plain (branch, year) tuples.

index_val = [(branch, year) for branch in ('cse', 'eee') for year in range(2019, 2023)]
a = pd.Series(data=[1, 2, 3, 4, 5, 6, 7, 8], index=index_val)
a[('cse', 2022)]  # a complete (branch, year) key selects one value
# a['cse']  # looking up by the branch alone raises an error with this index

np.int64(4)

In [3]:
# Building a genuine pd.MultiIndex object

# 01: from an explicit list of (branch, year) tuples
index_val = [(branch, year) for branch in ('cse', 'eee') for year in range(2019, 2023)]
multiindex = pd.MultiIndex.from_tuples(index_val)
multiindex.levels     # the unique labels of each level, kept separately
multiindex.levels[0]  # labels of the first level
multiindex.levels[1]  # labels of the second level

# 02: from the cartesian product of the per-level labels
pd.MultiIndex.from_product(
    [
        ['cse', 'eee'],
        [2019, 2020, 2021, 2022],
    ]
)

MultiIndex([('cse', 2019),
            ('cse', 2020),
            ('cse', 2021),
            ('cse', 2022),
            ('eee', 2019),
            ('eee', 2020),
            ('eee', 2021),
            ('eee', 2022)],
           )

In [4]:
# Series built from the MultiIndex object

s = pd.Series(
    data=[1, 2, 3, 4, 5, 6, 7, 8],
    index=multiindex,
)
s
s['cse']  # selecting by the first level alone now works

2019    1
2020    2
2021    3
2022    4
dtype: int64

In [5]:
# unstack: turn the MultiIndex Series into a DataFrame by moving the
# innermost index level (level=-1, the default) into the columns

temp = s.unstack(level=-1)
temp

Unnamed: 0,2019,2020,2021,2022
cse,1,2,3,4
eee,5,6,7,8


In [6]:
# stack -> dataframe to multiindex series
# future_stack=True selects pandas' new stack implementation (pandas >= 2.1),
# which avoids the deprecation warning raised by the legacy implementation.

temp.stack(future_stack=True)

cse  2019    1
     2020    2
     2021    3
     2022    4
eee  2019    5
     2020    6
     2021    7
     2022    8
dtype: int64

In [7]:
# Why a MultiIndex? It lets higher-dimensional data (3d/4d) live in a 1d/2d object.
# 3d example: locating one value takes 3 keys (branch, year, column).

branch_df1 = pd.DataFrame(
    data=[[first, first + 1] for first in range(1, 16, 2)],  # [1, 2], [3, 4], ..., [15, 16]
    columns=['avg_package', 'students'],
    index=multiindex,
)

branch_df1
branch_df1.shape
branch_df1.loc['cse']            # all rows of one branch
branch_df1.loc[('cse', 2019)]    # one (branch, year) row
branch_df1['avg_package']        # one column across every row

cse  2019     1
     2020     3
     2021     5
     2022     7
eee  2019     9
     2020    11
     2021    13
     2022    15
Name: avg_package, dtype: int64

In [8]:
# MultiIndex on the column axis: every column is labelled by a (city, metric) pair

branch_df2 = pd.DataFrame(
    data=[[first, first + 1, 0, 0] for first in range(1, 8, 2)],  # [1, 2, 0, 0] ... [7, 8, 0, 0]
    columns=pd.MultiIndex.from_product(
        [
            ['delhi', 'mumbai'],
            ['avg_package', 'students'],
        ]
    ),
    index=[2019, 2020, 2021, 2022],
)

branch_df2
branch_df2['delhi']['avg_package']  # outer column label first, then the inner one
branch_df2.loc[2019]                # one row, indexed by the (city, metric) pairs

delhi   avg_package    1
        students       2
mumbai  avg_package    0
        students       0
Name: 2019, dtype: int64

In [9]:
# MultiIndex on both axes: (branch, year) rows and (city, metric) columns

branch_df3 = pd.DataFrame(
    data=[[first, first + 1, 0, 0] for first in range(1, 16, 2)],  # [1, 2, 0, 0] ... [15, 16, 0, 0]
    columns=pd.MultiIndex.from_product(
        [
            ['delhi', 'mumbai'],
            ['avg_package', 'students'],
        ]
    ),
    index=multiindex,
)

branch_df3  # locating one value takes 4 keys, so this is effectively 4d data

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,delhi,mumbai,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_package,students,avg_package,students
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,0,0
eee,2019,9,10,0,0
eee,2020,11,12,0,0
eee,2021,13,14,0,0
eee,2022,15,16,0,0


In [10]:
# stack   -> moves the innermost column level into the row index
# unstack -> moves a row-index level (innermost by default) into the columns
#
# future_stack=True selects pandas' new stack implementation (pandas >= 2.1);
# the legacy implementation is deprecated and emits a FutureWarning.

branch_df1
branch_df1.unstack()
branch_df1.unstack(level = 1) # level numbering starts at 0
branch_df1.unstack().stack(future_stack=True)
branch_df1.unstack().stack(future_stack=True).stack(future_stack=True) # series

  branch_df1.unstack().stack()
  branch_df1.unstack().stack().stack() # series


cse  2019  avg_package     1
           students        2
     2020  avg_package     3
           students        4
     2021  avg_package     5
           students        6
     2022  avg_package     7
           students        8
eee  2019  avg_package     9
           students       10
     2020  avg_package    11
           students       12
     2021  avg_package    13
           students       14
     2022  avg_package    15
           students       16
dtype: int64

In [11]:
# Same stack/unstack round trip on the frame whose columns are a MultiIndex.
# future_stack=True selects pandas' new stack implementation (pandas >= 2.1);
# the legacy implementation is deprecated and emits a FutureWarning.
branch_df2 # dataframe
branch_df2.unstack() # series
branch_df2.stack(future_stack=True) # dataframe
branch_df2.stack(future_stack=True).stack(future_stack=True) # series

  branch_df2.stack() # dataframe
  branch_df2.stack().stack() # series


2019  avg_package  delhi     1
                   mumbai    0
      students     delhi     2
                   mumbai    0
2020  avg_package  delhi     3
                   mumbai    0
      students     delhi     4
                   mumbai    0
2021  avg_package  delhi     5
                   mumbai    0
      students     delhi     6
                   mumbai    0
2022  avg_package  delhi     7
                   mumbai    0
      students     delhi     8
                   mumbai    0
dtype: int64

In [12]:
# stack/unstack on the frame that has a MultiIndex on both axes.
# future_stack=True selects pandas' new stack implementation (pandas >= 2.1);
# the legacy implementation is deprecated and emits a FutureWarning.
branch_df3
branch_df3.unstack()
branch_df3.unstack().unstack()

branch_df3.stack(future_stack=True)
branch_df3.stack(future_stack=True).stack(future_stack=True)

  branch_df3.stack()
  branch_df3.stack().stack()


cse  2019  avg_package  delhi      1
                        mumbai     0
           students     delhi      2
                        mumbai     0
     2020  avg_package  delhi      3
                        mumbai     0
           students     delhi      4
                        mumbai     0
     2021  avg_package  delhi      5
                        mumbai     0
           students     delhi      6
                        mumbai     0
     2022  avg_package  delhi      7
                        mumbai     0
           students     delhi      8
                        mumbai     0
eee  2019  avg_package  delhi      9
                        mumbai     0
           students     delhi     10
                        mumbai     0
     2020  avg_package  delhi     11
                        mumbai     0
           students     delhi     12
                        mumbai     0
     2021  avg_package  delhi     13
                        mumbai     0
           students     delhi     14
 

In [13]:
# The usual inspection helpers also work on a MultiIndex DataFrame.
# Only .info() prints; of the remaining expressions just the last one is displayed.
branch_df3.head(n=5)
branch_df3.tail(n=5)
branch_df3.shape
branch_df3.info()
branch_df3.unstack().info()
branch_df3.describe()
branch_df3.duplicated()
branch_df3.isna()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 8 entries, ('cse', np.int64(2019)) to ('eee', np.int64(2022))
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   (delhi, avg_package)   8 non-null      int64
 1   (delhi, students)      8 non-null      int64
 2   (mumbai, avg_package)  8 non-null      int64
 3   (mumbai, students)     8 non-null      int64
dtypes: int64(4)
memory usage: 932.0+ bytes
<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, cse to eee
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   (delhi, avg_package, 2019)   2 non-null      int64
 1   (delhi, avg_package, 2020)   2 non-null      int64
 2   (delhi, avg_package, 2021)   2 non-null      int64
 3   (delhi, avg_package, 2022)   2 non-null      int64
 4   (delhi, students, 2019)      2 non-null      int64
 5   (delhi, student

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,delhi,mumbai,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_package,students,avg_package,students
cse,2019,False,False,False,False
cse,2020,False,False,False,False
cse,2021,False,False,False,False
cse,2022,False,False,False,False
eee,2019,False,False,False,False
eee,2020,False,False,False,False
eee,2021,False,False,False,False
eee,2022,False,False,False,False


In [14]:
# Selecting a single row by its full (branch, year) label

branch_df3
branch_df3.loc[('cse', 2022)]

# Selecting several rows by position

branch_df3.iloc[[0, 2, 4]]  # explicit list of positions
branch_df3.iloc[:5:2]       # the same rows as a slice: start at 0, stop before 5, step 2

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,delhi,mumbai,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_package,students,avg_package,students
cse,2019,1,2,0,0
cse,2021,5,6,0,0
eee,2019,9,10,0,0


In [15]:
# extracting col(single)
# Index with the outer column label first, then the inner one.

branch_df3['delhi']
branch_df3['delhi']['students'] 

# extracting col(multiple)
# Positional selection: ':' keeps every row, the second indexer picks the columns.

branch_df3
branch_df3.iloc[:, 1:3]  # columns at positions 1 and 2
branch_df3.iloc[:, [0, 3]]  # columns at positions 0 and 3

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_package,students
cse,2019,1,0
cse,2020,3,0
cse,2021,5,0
cse,2022,7,0
eee,2019,9,0
eee,2020,11,0
eee,2021,13,0
eee,2022,15,0


In [16]:
# extracting both
# Rows at positions 0 and 4, columns at positions 1 and 2.

branch_df3.iloc[[0, 4], [1, 2]]

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,students,avg_package
cse,2019,2,0
eee,2019,10,0


In [17]:
# sort_index
# Only the last expression below is displayed as the cell output.

branch_df3
branch_df3.sort_index(ascending = False)  # every index level descending
branch_df3.sort_index(ascending = [False, True]) # level 0 descending, level 1 ascending
branch_df3.sort_index(level = 1, ascending=[False]) # level 1 (year) descending is the primary sort key; in the output each year lists cse before eee

Unnamed: 0_level_0,Unnamed: 1_level_0,delhi,delhi,mumbai,mumbai
Unnamed: 0_level_1,Unnamed: 1_level_1,avg_package,students,avg_package,students
cse,2022,7,8,0,0
eee,2022,15,16,0,0
cse,2021,5,6,0,0
eee,2021,13,14,0,0
cse,2020,3,4,0,0
eee,2020,11,12,0,0
cse,2019,1,2,0,0
eee,2019,9,10,0,0


In [18]:
# Transpose: the row MultiIndex and the column MultiIndex trade places

branch_df3.T

Unnamed: 0_level_0,Unnamed: 1_level_0,cse,cse,cse,cse,eee,eee,eee,eee
Unnamed: 0_level_1,Unnamed: 1_level_1,2019,2020,2021,2022,2019,2020,2021,2022
delhi,avg_package,1,3,5,7,9,11,13,15
delhi,students,2,4,6,8,10,12,14,16
mumbai,avg_package,0,0,0,0,0,0,0,0
mumbai,students,0,0,0,0,0,0,0,0


In [19]:
# swaplevel: exchange two levels of a MultiIndex (the two innermost by default)

branch_df3
branch_df3.swaplevel(i=-2, j=-1, axis='index')    # swap the inner and outer row levels
branch_df3.swaplevel(i=-2, j=-1, axis='columns')  # swap the inner and outer column levels

Unnamed: 0_level_0,Unnamed: 1_level_0,avg_package,students,avg_package,students
Unnamed: 0_level_1,Unnamed: 1_level_1,delhi,delhi,mumbai,mumbai
cse,2019,1,2,0,0
cse,2020,3,4,0,0
cse,2021,5,6,0,0
cse,2022,7,8,0,0
eee,2019,9,10,0,0
eee,2020,11,12,0,0
eee,2021,13,14,0,0
eee,2022,15,16,0,0


In [20]:
# Wide format -> single row for each data point

# Name - Height - Weight
# A    -   160  -  66
# B    -   182  -  87


# Long format -> each data point spans several rows (one row per attribute)

# Name - Attribute - Value
# A    - Height    - 160
# A    - Weight    - 66
# B    - Height    - 182
# B    - Weight    - 87

# Both formats contain the same data.

In [21]:
# melt: reshape a wide frame into long format.
# Only the last expression of the cell is displayed.

# Smallest case: one column becomes one (variable, value) row.
pd.melt(pd.DataFrame({'cse': [120]}))

# Custom names for the generated 'variable' / 'value' columns.
pd.melt(
    pd.DataFrame({'cse': [120], 'ece': [100], 'mech': [50]}),
    var_name='branch',
    value_name='num_students',
)

# id_vars lists the columns that stay fixed; every other column is melted.
pd.melt(
    pd.DataFrame(
        {
            'branch': ['cse', 'ece', 'mech'],
            '2020': [100, 150, 60],
            '2021': [120, 130, 80],
            '2022': [150, 140, 70],
        }
    ),
    id_vars=['branch'],
)

# Same frame again, now naming the melted columns.
pd.melt(
    pd.DataFrame(
        {
            'branch': ['cse', 'ece', 'mech'],
            '2020': [100, 150, 60],
            '2021': [120, 130, 80],
            '2022': [150, 140, 70],
        }
    ),
    id_vars=['branch'],
    var_name='year',
    value_name='students',
)

Unnamed: 0,branch,year,students
0,cse,2020,100
1,ece,2020,150
2,mech,2020,60
3,cse,2021,120
4,ece,2021,130
5,mech,2021,80
6,cse,2022,150
7,ece,2022,140
8,mech,2022,70


In [22]:
# Load the global COVID-19 time-series CSVs (deaths and confirmed cases).
# Both paths are relative, so the files must be in the current working directory.
death = pd.read_csv("./time_series_covid19_deaths_global.csv")
confirm = pd.read_csv("./time_series_covid19_confirmed_global.csv")

In [23]:
# Both frames are in wide format: one row per location, one column per date.
death.shape
death.head(n=5)    # wide format
confirm.head(n=5)  # wide format

Unnamed: 0,Province/State,Country/Region,Lat,Long,1/22/20,1/23/20,1/24/20,1/25/20,1/26/20,1/27/20,...,12/24/22,12/25/22,12/26/22,12/27/22,12/28/22,12/29/22,12/30/22,12/31/22,1/1/23,1/2/23
0,,Afghanistan,33.93911,67.709953,0,0,0,0,0,0,...,207310,207399,207438,207460,207493,207511,207550,207559,207616,207627
1,,Albania,41.1533,20.1683,0,0,0,0,0,0,...,333749,333749,333751,333751,333776,333776,333806,333806,333811,333812
2,,Algeria,28.0339,1.6596,0,0,0,0,0,0,...,271194,271198,271198,271202,271208,271217,271223,271228,271229,271229
3,,Andorra,42.5063,1.5218,0,0,0,0,0,0,...,47686,47686,47686,47686,47751,47751,47751,47751,47751,47751
4,,Angola,-11.2027,17.8739,0,0,0,0,0,0,...,104973,104973,104973,105095,105095,105095,105095,105095,105095,105095


In [24]:
# convert wide -> long format
#
# The identifier columns stay fixed; every remaining (date) column is melted
# into a (date, count) pair of columns.
id_cols = ['Province/State', 'Country/Region', 'Lat', 'Long']

# Without var_name / value_name, melt() names the new columns 'variable' and
# 'value'. Each frame is melted once, directly with descriptive names, rather
# than melting twice and overwriting the first result.
death1 = death.melt(id_vars=id_cols, var_name='date', value_name='num_deaths')
confirm1 = confirm.melt(id_vars=id_cols, var_name='date', value_name='num_cases')
confirm1

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,num_cases
0,,Afghanistan,33.939110,67.709953,1/22/20,0
1,,Albania,41.153300,20.168300,1/22/20,0
2,,Algeria,28.033900,1.659600,1/22/20,0
3,,Andorra,42.506300,1.521800,1/22/20,0
4,,Angola,-11.202700,17.873900,1/22/20,0
...,...,...,...,...,...,...
311248,,West Bank and Gaza,31.952200,35.233200,1/2/23,703228
311249,,Winter Olympics 2022,39.904200,116.407400,1/2/23,535
311250,,Yemen,15.552727,48.516388,1/2/23,11945
311251,,Zambia,-13.133897,27.849332,1/2/23,334661


In [25]:
# Preview the long-format frames (only the last expression is displayed).
death1.head(n=5)
confirm1.head(n=5)

Unnamed: 0,Province/State,Country/Region,Lat,Long,date,num_cases
0,,Afghanistan,33.93911,67.709953,1/22/20,0
1,,Albania,41.1533,20.1683,1/22/20,0
2,,Algeria,28.0339,1.6596,1/22/20,0
3,,Andorra,42.5063,1.5218,1/22/20,0
4,,Angola,-11.2027,17.8739,1/22/20,0


In [26]:
# Combine cases and deaths: join on the location identifiers plus the date.
# The merge is computed once and reused; a bare merge expression that is not
# the last line of the cell would be evaluated and then thrown away.
cases_deaths = confirm1.merge(
    death1,
    on=['Province/State', 'Country/Region', 'Lat', 'Long', 'date'],
)

# Keep the columns of interest and show the 100 rows with the highest death count.
(
    cases_deaths[['Country/Region', 'date', 'num_cases', 'num_deaths']]
    .sort_values('num_deaths', ascending=False)
    .head(100)
)

Unnamed: 0,Country/Region,date,num_cases,num_deaths
311224,US,1/2/23,100759251,1092679
310935,US,1/1/23,100752629,1092674
310646,US,12/31/22,100749731,1092674
310357,US,12/30/22,100743442,1092661
310068,US,12/29/22,100706571,1092456
...,...,...,...,...
283769,US,9/29/22,96347560,1059500
283480,US,9/28/22,96263097,1058807
283191,US,9/27/22,96176208,1057502
282902,US,9/26/22,96115160,1057054
