# Pandas

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

Print Pandas version

In [2]:
print(pd.__version__)

2.2.3


In [3]:
# Labelled Series: the dict keys become the index, the dict values become the data.
A = pd.Series({'a': 0.25, 'b': 0.5, 'c': 0.75, 'd': 1.0})

Call the values

In [4]:
A.values

array([0.25, 0.5 , 0.75, 1.  ])

Call the index

In [5]:
A.index

Index(['a', 'b', 'c', 'd'], dtype='object')

Pandas stores the Series values in a NumPy array

In [6]:
type(A.values)

numpy.ndarray

Access a value using the index

In [7]:
A['c']

np.float64(0.75)

### Slicing

In [8]:
A['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [9]:
# Letter grade -> grade points.
# Each letter is listed once: a dict literal silently keeps only the LAST value
# for a repeated key, so the earlier 'B': 3 entry was dead and 'B' resolved to 2.
# NOTE(review): the second 'B' may have been intended as another letter
# (e.g. 'C') - confirm the intended scale before relying on these values.
grades_dict = {'A': 4, 'A-': 3.5, 'B': 2, 'B-': 2.5}
grades = pd.Series(grades_dict)

In [10]:
grades

A     4.0
A-    3.5
B     2.0
B-    2.5
dtype: float64

In [11]:
# Letter grade -> percentage mark.
# Each letter is listed once: a dict literal silently keeps only the LAST value
# for a repeated key, so the earlier 'B': 75 entry was dead and 'B' resolved to 65.
# NOTE(review): the second 'B' may have been intended as another letter
# (e.g. 'C') - confirm the intended scale before relying on these values.
marks_dict = {'A': 85, 'A-': 80, 'B': 65, 'B-': 70}
marks = pd.Series(marks_dict)

In [12]:
marks

A     85
A-    80
B     65
B-    70
dtype: int64

# DataFrame

In [13]:
marks

A     85
A-    80
B     65
B-    70
dtype: int64

In [14]:
grades

A     4.0
A-    3.5
B     2.0
B-    2.5
dtype: float64

In [15]:
# Combine the two Series into one table: each keyword becomes a column name,
# and the rows are labelled by the Series index.
D = pd.DataFrame(dict(marks=marks, grades=grades))
D

Unnamed: 0,marks,grades
A,85,4.0
A-,80,3.5
B,65,2.0
B-,70,2.5


Transpose the DataFrame

In [16]:
D.T

Unnamed: 0,A,A-,B,B-
marks,85.0,80.0,65.0,70.0
grades,4.0,3.5,2.0,2.5


In [17]:
D

Unnamed: 0,marks,grades
A,85,4.0
A-,80,3.5
B,65,2.0
B-,70,2.5


In [18]:
D.values

array([[85. ,  4. ],
       [80. ,  3.5],
       [65. ,  2. ],
       [70. ,  2.5]])

Accessing specific values of the data frame.

In [19]:
D.values[2,0]

np.float64(65.0)

In [20]:
D.columns

Index(['marks', 'grades'], dtype='object')

In [21]:
D.index

Index(['A', 'A-', 'B', 'B-'], dtype='object')

Adding another column

In [22]:
D

Unnamed: 0,marks,grades
A,85,4.0
A-,80,3.5
B,65,2.0
B-,70,2.5


In [23]:
# Derive a new column from an existing one; assigning to a new key appends it.
D['ScaledMarks'] = D['marks'].div(10)
D

Unnamed: 0,marks,grades,ScaledMarks
A,85,4.0,8.5
A-,80,3.5,8.0
B,65,2.0,6.5
B-,70,2.5,7.0


Deleting a column

In [24]:
# Remove the derived column again.
# drop(..., errors='ignore') is used instead of `del D[...]` so that re-running
# this cell once the column is already gone is a no-op rather than a KeyError.
D = D.drop(columns='ScaledMarks', errors='ignore')
D

Unnamed: 0,marks,grades
A,85,4.0
A-,80,3.5
B,65,2.0
B-,70,2.5


### Pandas (Indexing)

In [25]:
# Boolean-mask selection: keep only the rows whose mark is above 70.
G = D.loc[D['marks'].gt(70)]
G

Unnamed: 0,marks,grades
A,85,4.0
A-,80,3.5


### Pandas (NaN)

In [26]:
# One dict per row; keys a row does not provide show up as NaN in that row.
# Note: this rebinds `A`, which held a Series earlier in the notebook.
A = pd.DataFrame(
    data=[
        dict(a=1, b=2),
        dict(b=3, C=4),
        dict(a=5, b=6, C=7),
    ]
)
A

Unnamed: 0,a,b,C
0,1.0,2,
1,,3,4.0
2,5.0,6,7.0


Fill Missing Values

In [27]:
A.fillna(0)

Unnamed: 0,a,b,C
0,1.0,2,0.0
1,0.0,3,4.0
2,5.0,6,7.0


Drop rows with missing values

In [28]:
A.dropna()

Unnamed: 0,a,b,C
2,5.0,6,7.0


#### Indexing confusion

In [29]:
# Series with an integer index that does NOT start at 0, so label-based and
# position-based lookups give different answers in the cells that follow.
data = pd.Series({1: 'a', 3: 'b', 5: 'c'})
data

1    a
3    b
5    c
dtype: object

In [30]:
data[1]

'a'

In [31]:
data[1:3]

3    b
5    c
dtype: object

Explicit index (loc): slices by label, and the end label is included

In [32]:
data.loc[1:3]

1    a
3    b
dtype: object

Implicit index (iloc): slices by position, and the end position is excluded

In [33]:
data.iloc[1:3]

3    b
5    c
dtype: object

In [34]:
D

Unnamed: 0,marks,grades
A,85,4.0
A-,80,3.5
B,65,2.0
B-,70,2.5


In [35]:
D.iloc[2,:]

marks     65.0
grades     2.0
Name: B, dtype: float64

# csv files

In [36]:
# Install scikit-learn if not already installed
# %pip install scikit-learn

# Import SimpleImputer from sklearn
from sklearn.impute import SimpleImputer

In [37]:
df = pd.read_csv('covid_19_data.csv')

In [38]:
df.head(10)

Unnamed: 0,SNo,ObservationDate,Province/State,Country/Region,Last Update,Confirmed,Deaths,Recovered
0,1,01/22/2020,Anhui,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
1,2,01/22/2020,Beijing,Mainland China,1/22/2020 17:00,14.0,0.0,0.0
2,3,01/22/2020,Chongqing,Mainland China,1/22/2020 17:00,6.0,0.0,0.0
3,4,01/22/2020,Fujian,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
4,5,01/22/2020,Gansu,Mainland China,1/22/2020 17:00,0.0,0.0,0.0
5,6,01/22/2020,Guangdong,Mainland China,1/22/2020 17:00,26.0,0.0,0.0
6,7,01/22/2020,Guangxi,Mainland China,1/22/2020 17:00,2.0,0.0,0.0
7,8,01/22/2020,Guizhou,Mainland China,1/22/2020 17:00,1.0,0.0,0.0
8,9,01/22/2020,Hainan,Mainland China,1/22/2020 17:00,4.0,0.0,0.0
9,10,01/22/2020,Hebei,Mainland China,1/22/2020 17:00,1.0,0.0,0.0


Remove columns
- The inplace parameter is used to modify the original DataFrame.
- The axis parameter specifies whether to drop rows (0) or columns (1). In this case, we are dropping columns.

In [39]:
# Drop the serial-number and last-update columns.
# The result is reassigned (no inplace=True) and errors="ignore" is passed so the
# cell stays re-runnable: executing it a second time is a no-op instead of a
# KeyError for columns that are already gone.
df = df.drop(columns=["SNo", "Last Update"], errors="ignore")
df.head(10)

Unnamed: 0,ObservationDate,Province/State,Country/Region,Confirmed,Deaths,Recovered
0,01/22/2020,Anhui,Mainland China,1.0,0.0,0.0
1,01/22/2020,Beijing,Mainland China,14.0,0.0,0.0
2,01/22/2020,Chongqing,Mainland China,6.0,0.0,0.0
3,01/22/2020,Fujian,Mainland China,1.0,0.0,0.0
4,01/22/2020,Gansu,Mainland China,0.0,0.0,0.0
5,01/22/2020,Guangdong,Mainland China,26.0,0.0,0.0
6,01/22/2020,Guangxi,Mainland China,2.0,0.0,0.0
7,01/22/2020,Guizhou,Mainland China,1.0,0.0,0.0
8,01/22/2020,Hainan,Mainland China,4.0,0.0,0.0
9,01/22/2020,Hebei,Mainland China,1.0,0.0,0.0


Rename Columns

In [40]:
# Shorten the column names; the mapping is old name -> new name.
df.rename(
    {
        "ObservationDate": "Date",
        "Province/State": "Province",
        "Country/Region": "Country",
    },
    axis="columns",
    inplace=True,
)
df.head(10)

Unnamed: 0,Date,Province,Country,Confirmed,Deaths,Recovered
0,01/22/2020,Anhui,Mainland China,1.0,0.0,0.0
1,01/22/2020,Beijing,Mainland China,14.0,0.0,0.0
2,01/22/2020,Chongqing,Mainland China,6.0,0.0,0.0
3,01/22/2020,Fujian,Mainland China,1.0,0.0,0.0
4,01/22/2020,Gansu,Mainland China,0.0,0.0,0.0
5,01/22/2020,Guangdong,Mainland China,26.0,0.0,0.0
6,01/22/2020,Guangxi,Mainland China,2.0,0.0,0.0
7,01/22/2020,Guizhou,Mainland China,1.0,0.0,0.0
8,01/22/2020,Hainan,Mainland China,4.0,0.0,0.0
9,01/22/2020,Hebei,Mainland China,1.0,0.0,0.0


Change Date format

In [41]:
# Convert the date strings to datetime64.
# The format is stated explicitly: the values shown by df.head() are
# month/day/year (e.g. 01/22/2020), which is ambiguous if left to inference.
# NOTE(review): assumes every row uses this layout - only the first rows are
# visible in the notebook output; confirm against the full file.
df['Date'] = pd.to_datetime(df['Date'], format='%m/%d/%Y')
df.head(10)

Unnamed: 0,Date,Province,Country,Confirmed,Deaths,Recovered
0,2020-01-22,Anhui,Mainland China,1.0,0.0,0.0
1,2020-01-22,Beijing,Mainland China,14.0,0.0,0.0
2,2020-01-22,Chongqing,Mainland China,6.0,0.0,0.0
3,2020-01-22,Fujian,Mainland China,1.0,0.0,0.0
4,2020-01-22,Gansu,Mainland China,0.0,0.0,0.0
5,2020-01-22,Guangdong,Mainland China,26.0,0.0,0.0
6,2020-01-22,Guangxi,Mainland China,2.0,0.0,0.0
7,2020-01-22,Guizhou,Mainland China,1.0,0.0,0.0
8,2020-01-22,Hainan,Mainland China,4.0,0.0,0.0
9,2020-01-22,Hebei,Mainland China,1.0,0.0,0.0


#### Describe

In [42]:
df.describe()

Unnamed: 0,Date,Confirmed,Deaths,Recovered
count,306429,306429.0,306429.0,306429.0
mean,2020-11-06 01:54:54.146441728,85670.91,2036.403268,50420.29
min,2020-01-22 00:00:00,-302844.0,-178.0,-854405.0
25%,2020-07-30 00:00:00,1042.0,13.0,11.0
50%,2020-11-10 00:00:00,10375.0,192.0,1751.0
75%,2021-02-18 00:00:00,50752.0,1322.0,20270.0
max,2021-05-29 00:00:00,5863138.0,112385.0,6399531.0
std,,277551.6,6410.938048,201512.4


#### Info

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306429 entries, 0 to 306428
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   Date       306429 non-null  datetime64[ns]
 1   Province   228326 non-null  object        
 2   Country    306429 non-null  object        
 3   Confirmed  306429 non-null  float64       
 4   Deaths     306429 non-null  float64       
 5   Recovered  306429 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(2)
memory usage: 14.0+ MB


Replace null (missing) values

In [44]:
# Replace missing provinces with the placeholder string "NA".
# The fill is limited to the Province column - the only column that the
# preceding df.info() output reports as having nulls - so the string
# placeholder can never be written into the numeric or date columns.
df = df.fillna({"Province": "NA"})

In [45]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 306429 entries, 0 to 306428
Data columns (total 6 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   Date       306429 non-null  datetime64[ns]
 1   Province   306429 non-null  object        
 2   Country    306429 non-null  object        
 3   Confirmed  306429 non-null  float64       
 4   Deaths     306429 non-null  float64       
 5   Recovered  306429 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(2)
memory usage: 14.0+ MB


Access specific columns

In [46]:
df[["Deaths","Recovered"]].tail(10)

Unnamed: 0,Deaths,Recovered
306419,2.0,82.0
306420,2.0,331.0
306421,669.0,41650.0
306422,2797.0,0.0
306423,1586.0,58882.0
306424,2335.0,95289.0
306425,245.0,0.0
306426,1.0,1324.0
306427,1738.0,83790.0
306428,4252.0,0.0


Access one column

In [47]:
df.Deaths.tail(10)

306419       2.0
306420       2.0
306421     669.0
306422    2797.0
306423    1586.0
306424    2335.0
306425     245.0
306426       1.0
306427    1738.0
306428    4252.0
Name: Deaths, dtype: float64

Get Max value

In [48]:
df.Deaths.max()

np.float64(112385.0)

In [49]:
df.Deaths.min()

np.float64(-178.0)

## Grouping

By country

In [56]:
# Total the three count columns per country; as_index=False keeps Country as
# an ordinary column on a fresh 0..n-1 index.
# NOTE(review): the per-date figures shown further down rise day after day and
# so look cumulative; summing them over every date may overstate each
# country's total - confirm whether the latest date per country is wanted.
df2 = df.groupby('Country', as_index=False)[['Confirmed', 'Deaths', 'Recovered']].sum()
df2

Unnamed: 0,Country,Confirmed,Deaths,Recovered
0,Azerbaijan,1.0,0.0,0.0
1,"('St. Martin',)",2.0,0.0,0.0
2,Afghanistan,17026442.0,669075.0,13464399.0
3,Albania,19768869.0,375955.0,13945256.0
4,Algeria,27684358.0,834464.0,18959299.0
...,...,...,...,...
224,West Bank and Gaza,41819444.0,440378.0,37003116.0
225,Yemen,962066.0,237613.0,506523.0
226,Zambia,13493953.0,205990.0,12625626.0
227,Zimbabwe,6484581.0,237234.0,5594887.0


Group by Country and Date

In [61]:
# One row per (Country, Date): the province-level rows of a country are added
# together for each day. This rebinds df2 from the per-country table above.
df2 = (
    df.groupby(['Country', 'Date'], as_index=False)
      [['Confirmed', 'Deaths', 'Recovered']]
      .sum()
)
df2

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered
0,Azerbaijan,2020-02-28,1.0,0.0,0.0
1,"('St. Martin',)",2020-03-10,2.0,0.0,0.0
2,Afghanistan,2020-02-24,1.0,0.0,0.0
3,Afghanistan,2020-02-25,1.0,0.0,0.0
4,Afghanistan,2020-02-26,1.0,0.0,0.0
...,...,...,...,...,...
87276,occupied Palestinian territory,2020-03-12,0.0,0.0,0.0
87277,occupied Palestinian territory,2020-03-14,0.0,0.0,0.0
87278,occupied Palestinian territory,2020-03-15,0.0,0.0,0.0
87279,occupied Palestinian territory,2020-03-16,0.0,0.0,0.0


Search for all records with more than 100 confirmed cases

In [63]:
# Keep only the country/date rows with more than 100 confirmed cases.
df3 = df2.loc[df2['Confirmed'].gt(100)]
df3

Unnamed: 0,Country,Date,Confirmed,Deaths,Recovered
35,Afghanistan,2020-03-28,107.0,4.0,2.0
36,Afghanistan,2020-03-29,118.0,4.0,2.0
37,Afghanistan,2020-03-30,146.0,4.0,2.0
38,Afghanistan,2020-03-31,175.0,4.0,5.0
39,Afghanistan,2020-04-01,197.0,4.0,5.0
...,...,...,...,...,...
87269,Zimbabwe,2021-05-25,38706.0,1587.0,36517.0
87270,Zimbabwe,2021-05-26,38819.0,1589.0,36531.0
87271,Zimbabwe,2021-05-27,38854.0,1592.0,36541.0
87272,Zimbabwe,2021-05-28,38918.0,1592.0,36563.0


All records for Colombia and Deaths > 0

In [69]:
# Two conditions combined element-wise: the row is for Colombia AND reports
# at least one death.
is_colombia = df['Country'].eq('Colombia')
has_deaths = df['Deaths'].gt(0)
df4 = df.loc[is_colombia & has_deaths]
df4

Unnamed: 0,Date,Province,Country,Confirmed,Deaths,Recovered
7767,2020-03-22,,Colombia,231.0,2.0,3.0
8071,2020-03-23,,Colombia,277.0,3.0,3.0
8370,2020-03-24,,Colombia,378.0,3.0,6.0
8673,2020-03-25,,Colombia,470.0,4.0,8.0
8980,2020-03-26,,Colombia,491.0,6.0,8.0
...,...,...,...,...,...,...
306323,2021-05-29,Sucre,Colombia,34809.0,1147.0,32259.0
306346,2021-05-29,Tolima,Colombia,75296.0,2432.0,71624.0
306380,2021-05-29,Valle del Cauca,Colombia,266414.0,8461.0,251388.0
306387,2021-05-29,Vaupes,Colombia,1528.0,14.0,1481.0
