In [1]:
import pandas as pd
from urllib.request import urlretrieve

In [2]:
# Raw CSV published as a GitHub gist; it is saved next to the notebook.
italy_covid_url = 'https://gist.githubusercontent.com/aakashns/f6a004fa20c84fec53262f9a8bfee775/raw/f309558b1cf5103424cef58e2ecb8704dcd4d74c/italy-covid-daywise.csv'

# urlretrieve returns a (local_filename, headers) tuple, shown as the cell output.
urlretrieve(url=italy_covid_url, filename='italy-covid-daywise.csv')

('italy-covid-daywise.csv', <http.client.HTTPMessage at 0x1eceb30cac8>)

In [3]:
# Load the downloaded CSV. Numeric columns are read as float32; dates stay plain strings.
data = pd.read_csv(
    'italy-covid-daywise.csv',
    dtype=dict(
        date='object',
        new_cases='float32',
        new_deaths='float32',
        new_tests='float32',
    ),
)

#Specifying dtype would reduce the memory usage

#### Below you can observe the significant drop in memory usage when a narrower dtype is specified (`float16` here, versus the default `float64`)

In [4]:
# Per-column memory footprint when the numeric columns are read as float16.
# NOTE: float16 cannot represent values above 65504, and `new_tests` contains
# larger counts (e.g. 95273), so this cell only illustrates memory savings -
# it is not a dtype to analyse this data with.
pd.read_csv(
    'italy-covid-daywise.csv',
    dtype=dict(
        date='object',
        new_cases='float16',
        new_deaths='float16',
        new_tests='float16',
    ),
).memory_usage(deep=True)

Index           128
date          16616
new_cases       496
new_deaths      496
new_tests       496
dtype: int64

In [5]:
# Baseline for comparison: the same file read with pandas' inferred dtypes.
pd.read_csv('italy-covid-daywise.csv').memory_usage(deep=True)

Index           128
date          16616
new_cases      1984
new_deaths     1984
new_tests      1984
dtype: int64

In [6]:
data.head()

Unnamed: 0,date,new_cases,new_deaths,new_tests
0,2019-12-31,0.0,0.0,
1,2020-01-01,0.0,0.0,
2,2020-01-02,0.0,0.0,
3,2020-01-03,0.0,0.0,
4,2020-01-04,0.0,0.0,


In [7]:
data.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248 entries, 0 to 247
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   date        248 non-null    object 
 1   new_cases   248 non-null    float32
 2   new_deaths  248 non-null    float32
 3   new_tests   135 non-null    float32
dtypes: float32(3), object(1)
memory usage: 5.0+ KB


In [8]:
# Limit how many columns pandas displays before truncating the output
# (the equivalent setting for rows is `display.max_rows`).

pd.set_option('display.max_columns', 10)

In [9]:
data.memory_usage(deep=True)

Index           128
date          16616
new_cases       992
new_deaths      992
new_tests       992
dtype: int64

In [10]:
# Returns descriptive statistics of the data

data.describe(include='all')

Unnamed: 0,date,new_cases,new_deaths,new_tests
count,248,248.0,248.0,135.0
unique,248,,,
top,2019-12-31,,,
freq,1,,,
mean,,1094.818604,143.133072,31699.673828
std,,1554.508057,227.105545,11622.209961
min,,-148.0,-31.0,7841.0
25%,,123.0,3.0,25259.0
50%,,342.0,17.0,29545.0
75%,,1371.75,175.25,37711.0


In [11]:
# Show 5 randomly chosen records.
# No random_state is passed, so every run picks different rows.

data.sample(n=5)

Unnamed: 0,date,new_cases,new_deaths,new_tests
93,2020-04-02,4782.0,727.0,
177,2020-06-25,577.0,-31.0,29421.0
65,2020-03-05,587.0,27.0,
178,2020-06-26,296.0,34.0,28331.0
52,2020-02-21,0.0,0.0,


### Indexing data using `loc`, `iloc` and `at`

In [12]:
# `at` looks up a single value by row label and column label.
data.at[147, 'new_cases']

300.0

In [13]:
# `loc` selects by label: the row whose index label is 147, all columns.
data.loc[147]

date          2020-05-26
new_cases          300.0
new_deaths          92.0
new_tests        33944.0
Name: 147, dtype: object

In [14]:
# `iloc` selects by integer position: the row at position 147, all columns.
data.iloc[147]

date          2020-05-26
new_cases          300.0
new_deaths          92.0
new_tests        33944.0
Name: 147, dtype: object

#### Above we were able to use 147 for `loc` as well as `iloc` because the index label and the integer position of that row are both 147

In [15]:
# Work on an independent 5-row random sample so relabelling it later does not
# touch `data`. The rows differ between runs because no random_state is set.
new_data = data.sample(n=5).copy()
new_data

Unnamed: 0,date,new_cases,new_deaths,new_tests
226,2020-08-13,476.0,10.0,25629.0
219,2020-08-06,384.0,10.0,32169.0
54,2020-02-23,62.0,2.0,
193,2020-07-11,276.0,12.0,25449.0
243,2020-08-30,1444.0,1.0,53541.0


In [16]:
new_data.index

Int64Index([226, 219, 54, 193, 243], dtype='int64')

In [17]:
# Replace the integer labels with letters, so that label-based (`loc`) and
# position-based (`iloc`) lookups now need different values.

new_data.index = list('abcde')

In [18]:
new_data

Unnamed: 0,date,new_cases,new_deaths,new_tests
a,2020-08-13,476.0,10.0,25629.0
b,2020-08-06,384.0,10.0,32169.0
c,2020-02-23,62.0,2.0,
d,2020-07-11,276.0,12.0,25449.0
e,2020-08-30,1444.0,1.0,53541.0


#### `dataframe.loc['row_label', 'column_label']` 

In [19]:
# Row labelled 'a', all columns.
new_data.loc['a']

date          2020-08-13
new_cases          476.0
new_deaths          10.0
new_tests        25629.0
Name: a, dtype: object

#### `dataframe.iloc[integer_position_of_row, integer_position_of_column]` 

In [20]:
new_data.iloc[0, 0]

'2020-08-13'

#### `dataframe.at['row_label', 'column_label']` 

In [21]:
new_data.at['a', 'new_cases']

476.0

In [22]:
new_data.head()

Unnamed: 0,date,new_cases,new_deaths,new_tests
a,2020-08-13,476.0,10.0,25629.0
b,2020-08-06,384.0,10.0,32169.0
c,2020-02-23,62.0,2.0,
d,2020-07-11,276.0,12.0,25449.0
e,2020-08-30,1444.0,1.0,53541.0


In [27]:
# Filter rows and pick a few columns without using `dataframe.loc`:
# a boolean mask first, then a column list, as two separate indexing steps.

data[data['new_cases'] > 250][['new_cases', 'new_deaths']]

Unnamed: 0,new_cases,new_deaths
62,561.0,6.0
63,347.0,17.0
64,466.0,28.0
65,587.0,27.0
66,769.0,41.0
...,...,...
243,1444.0,1.0
244,1365.0,4.0
245,996.0,6.0
246,975.0,8.0


In [29]:
# Filter rows and pick a few columns in a single `dataframe.loc` call:
# row mask and column list are passed together.

data.loc[data['new_cases'].gt(250), ['new_cases', 'new_deaths']]

Unnamed: 0,new_cases,new_deaths
62,561.0,6.0
63,347.0,17.0
64,466.0,28.0
65,587.0,27.0
66,769.0,41.0
...,...,...
243,1444.0,1.0
244,1365.0,4.0
245,996.0,6.0
246,975.0,8.0


In [30]:
# Number of missing values in each column

data.isna().sum()

date            0
new_cases       0
new_deaths      0
new_tests     113
dtype: int64

In [34]:
# Count of distinct values in each column

data.nunique(axis=0)

date          248
new_cases     188
new_deaths    129
new_tests     135
dtype: int64

In [37]:
# List every distinct value found in the `new_tests` column

data['new_tests'].unique()

array([   nan,  7841., 28095., 44248., 37083., 95273., 38676., 24113.,
       26678., 37554., 38589., 41441., 43732., 31231., 27047., 22999.,
       32211., 37771., 13665., 45428., 36091., 31384., 25823., 39620.,
       37049., 41131., 39027., 40657., 33505., 26101., 40226., 38617.,
       40644., 42987., 42579., 34206., 20676., 33944., 37299., 39838.,
       38233., 36051., 28948., 18053., 25628., 20035., 27451., 40470.,
       34036., 27894., 16301., 32200., 37865., 32991., 37651., 32880.,
       29545., 17463., 27762., 33957., 32921., 28570., 29875., 24581.,
       16152., 23225., 30237., 29421., 28331., 29721., 21183., 15484.,
       28471., 29325., 29147., 25680., 28946., 21166., 13771., 22490.,
       28679., 29947., 27251., 25449., 23061., 14006., 24222., 28392.,
       28089., 28661., 27569., 20621., 14121., 23915., 29288., 33018.,
       28970., 28059., 25177., 19374., 25341., 30875., 33396., 29686.,
       31905., 24496., 13467., 23491., 29739., 32169., 30392., 26631.,
      

In [40]:
# Count how many times each value occurs in the `new_tests` column

data['new_tests'].value_counts()

7841.0     1
24222.0    1
25341.0    1
19374.0    1
25177.0    1
          ..
38233.0    1
39838.0    1
37299.0    1
33944.0    1
54395.0    1
Name: new_tests, Length: 135, dtype: int64

In [64]:
# Small hand-written dataset (10 rows) used for the duplicate-city examples below.
d = dict(
    A=[1, 2, 3, 4, 5, 6, 1, 2, 3, 4],
    City=['Pune', 'Mumbai', 'Mumbai', 'Delhi', 'Bangalore', 'Pune', 'Mumbai', 'Mumbai', 'Delhi', 'Goa'],
    B=['A', 'B', 'C', 'W', 'P', 'A', 'O', 'Q', 'E', 'V'],
)

In [65]:
# Build the example frame: one column per key of `d`.
dummy_data = pd.DataFrame(data=d)

In [66]:
dummy_data

Unnamed: 0,A,City,B
0,1,Pune,A
1,2,Mumbai,B
2,3,Mumbai,C
3,4,Delhi,W
4,5,Bangalore,P
5,6,Pune,A
6,1,Mumbai,O
7,2,Mumbai,Q
8,3,Delhi,E
9,4,Goa,V


### Find the records whose city occurs more than once - `'Mumbai', 'Pune', 'Delhi'`

In [78]:
# Occurrences per city, then keep only the names of cities seen more than once.
a = dummy_data['City'].value_counts()
city_constraint_list = a.loc[a.gt(1)].index
city_constraint_list

Index(['Mumbai', 'Pune', 'Delhi'], dtype='object')

In [77]:
# Keep the rows whose city is one of the repeated cities.
dummy_data.loc[dummy_data['City'].isin(city_constraint_list)]

Unnamed: 0,A,City,B
0,1,Pune,A
1,2,Mumbai,B
2,3,Mumbai,C
3,4,Delhi,W
5,6,Pune,A
6,1,Mumbai,O
7,2,Mumbai,Q
8,3,Delhi,E


In [98]:
# Same filter in one step: `transform('count')` gives every row the size of its
# city group, so the comparison produces a row-aligned boolean mask.
dummy_data.loc[dummy_data.groupby('City')['A'].transform('count').gt(1)]

Unnamed: 0,A,City,B
0,1,Pune,A
1,2,Mumbai,B
2,3,Mumbai,C
3,4,Delhi,W
5,6,Pune,A
6,1,Mumbai,O
7,2,Mumbai,Q
8,3,Delhi,E


In [103]:
# Comparing the `City` column itself with a number is not a frequency filter:
# the column holds strings, so `> 2` raises TypeError. The error is caught and
# printed so the notebook still runs from top to bottom. To filter by how often
# a city occurs, compare per-city counts instead, e.g.
# `dummy_data.groupby('City')['A'].transform('count') > 1`.
try:
    dummy_data[dummy_data['City'] > 2]
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: '>' not supported between instances of 'str' and 'int'