# Pandas

In [1]:
import pandas as pd

## Pandas Series

#### >>> Cuman 1 kolom data ( 2 kolom dengan index-nya )

In [2]:
data = pd.Series([0.24,0.5,0.75,1.0],index = ['a','b','c','d'])

In [3]:
print(data)

a    0.24
b    0.50
c    0.75
d    1.00
dtype: float64


In [4]:
data.values

array([0.24, 0.5 , 0.75, 1.  ])

In [5]:
data.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [7]:
type(data)

pandas.core.series.Series

In [8]:
type(data.values)

numpy.ndarray

In [9]:
data['c'] = -0.75

In [10]:
print(data)

a    0.24
b    0.50
c   -0.75
d    1.00
dtype: float64


In [11]:
data['a':'c']

a    0.24
b    0.50
c   -0.75
dtype: float64

#### Untuk index yang kita buat sendiri , maka bagian end-nya bersifat inklusif

#### Untuk index yang bukan kita buat sendiri / index default , bagian end-nya bersifat eksklusif

In [12]:
data2 = pd.Series([1,2,3,4,5])

In [13]:
print(data2)

0    1
1    2
2    3
3    4
4    5
dtype: int64


In [14]:
data2[0:3]

0    1
1    2
2    3
dtype: int64

In [15]:
grades_dict = {'A':4,'-A':3.5,'B':3,'B-':2.5,'C':2}

In [16]:
data = pd.Series(grades_dict)

In [17]:
print(data)

A     4.0
-A    3.5
B     3.0
B-    2.5
C     2.0
dtype: float64


In [18]:
data.values

array([4. , 3.5, 3. , 2.5, 2. ])

In [19]:
marks_dict = {'A':85,'B':75,'C':65,'D':55}
marks = pd.Series(marks_dict)

In [20]:
print(marks)

A    85
B    75
C    65
D    55
dtype: int64


In [21]:
marks

A    85
B    75
C    65
D    55
dtype: int64

In [44]:
%timeit marks['A']

1.78 µs ± 16.6 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


In [45]:
%timeit marks[0:2]

11.9 µs ± 137 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


## Pandas Data Frame

#### >>>> Bisa lebih dari 1 kolom data

In [26]:
grades_dict = {'A':4,'B':3.5,'C':3.0,'D':2.5}

In [27]:
dataframe = pd.DataFrame({'Grades':grades_dict,'Marks':marks_dict})

In [28]:
dataframe

Unnamed: 0,Grades,Marks
A,4.0,85
B,3.5,75
C,3.0,65
D,2.5,55


In [29]:
dataframe.T

Unnamed: 0,A,B,C,D
Grades,4.0,3.5,3.0,2.5
Marks,85.0,75.0,65.0,55.0


In [30]:
dataframe.values

array([[ 4. , 85. ],
       [ 3.5, 75. ],
       [ 3. , 65. ],
       [ 2.5, 55. ]])

In [31]:
dataframe.values[2,0]

3.0

In [32]:
dataframe.columns

Index(['Grades', 'Marks'], dtype='object')

In [33]:
dataframe

Unnamed: 0,Grades,Marks
A,4.0,85
B,3.5,75
C,3.0,65
D,2.5,55


In [67]:
dataframe['Scaled Marks'] = (dataframe['Marks'] / 100) * 90

In [68]:
dataframe

Unnamed: 0,Grades,Marks,Scaled Marks
A,4.0,85,76.5
B,3.51,75,67.5
C,3.0,65,58.5
D,2.5,55,49.5


In [69]:
del dataframe['Scaled Marks']

In [70]:
dataframe

Unnamed: 0,Grades,Marks
A,4.0,85
B,3.51,75
C,3.0,65
D,2.5,55


In [39]:
selected_df = dataframe[dataframe['Marks']>70]

In [40]:
selected_df

Unnamed: 0,Grades,Marks
A,4.0,85
B,3.5,75


In [87]:
dataframe['Grades']['A']

4.0

In [99]:
dataframe['A']['Grades'] # tidak bisa karena data framenya berbentuk dictionary
                         # pertama dia akan cari key 'A' tapi tidak ada karena yang ada cuman 'Grades' dan 'Marks'
                         # pakai loc untuk indexing seperti NumPy

KeyError: 'A'

In [None]:
dataframe

In [42]:
dataframe['Grades']['B'] = 3.51

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['Grades']['B'] = 3.51


In [43]:
dataframe

Unnamed: 0,Grades,Marks
A,4.0,85
B,3.51,75
C,3.0,65
D,2.5,55


#### kayaknya tidak best practice

## Pandas NaN

In [48]:
data = pd.DataFrame([{'a':1,'b':2},{'b':3,'c':4}])

In [49]:
data

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [50]:
 data.fillna(0)

Unnamed: 0,a,b,c
0,1.0,2,0.0
1,0.0,3,4.0


In [60]:
data = pd.DataFrame([{'a':1,'b':2,'c':9},{'b':3,'c':4}])

In [61]:
data

Unnamed: 0,a,b,c
0,1.0,2,9
1,,3,4


In [59]:
help(data.dropna)

Help on method dropna in module pandas.core.frame:

dropna(axis: 'Axis' = 0, how: 'str' = 'any', thresh=None, subset: 'IndexLabel' = None, inplace: 'bool' = False) method of pandas.core.frame.DataFrame instance
    Remove missing values.
    
    See the :ref:`User Guide <missing_data>` for more on which values are
    considered missing, and how to work with missing data.
    
    Parameters
    ----------
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Determine if rows or columns which contain missing values are
        removed.
    
        * 0, or 'index' : Drop rows which contain missing values.
        * 1, or 'columns' : Drop columns which contain missing value.
    
        .. versionchanged:: 1.0.0
    
           Pass tuple or list to drop on multiple axes.
           Only a single axis is allowed.
    
    how : {'any', 'all'}, default 'any'
        Determine if row or column is removed from DataFrame, when we have
        at least one NA or all NA.
    
      

In [62]:
data.dropna()

Unnamed: 0,a,b,c
0,1.0,2,9


## Pandas Indexing

### Pemakaian loc dan iloc mirip seperti indexing di NumPy

In [71]:
data = pd.Series(['a','b','c'],index = [1,3,5])

In [78]:
data[1] # explicit index, use loc

'a'

In [79]:
data[1:3] # implicit index, use iloc

3    b
5    c
dtype: object

In [81]:
data.loc[1:3] # explicit

1    a
3    b
dtype: object

In [82]:
data.iloc[1:3]

3    b
5    c
dtype: object

In [83]:
dataframe

Unnamed: 0,Grades,Marks
A,4.0,85
B,3.51,75
C,3.0,65
D,2.5,55


In [84]:
dataframe.iloc[2,:]

Grades     3.0
Marks     65.0
Name: C, dtype: float64

In [88]:
dataframe.iloc[::-1,:]

Unnamed: 0,Grades,Marks
D,2.5,55
C,3.0,65
B,3.51,75
A,4.0,85


In [93]:
dataframe.loc[1,:] #tidak bisa karena tidak ada explicit index yang dicari

KeyError: 1

In [98]:
dataframe.loc['Grades','A'] # tidak sesuai dengan indexing di NumPy

KeyError: 'A'

In [92]:
dataframe.loc['A','Grades']

4.0

In [94]:
dataframe_baru = dataframe.T

In [95]:
dataframe_baru

Unnamed: 0,A,B,C,D
Grades,4.0,3.51,3.0,2.5
Marks,85.0,75.0,65.0,55.0


In [96]:
dataframe_baru.loc['Grades','A']

4.0

## Read CSV

In [1]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("D:\\Pemrograman\\Jupyter Notebook\\Covid 19 Data set\\full_grouped.csv")

In [4]:
df

Unnamed: 0,Date,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered,WHO Region
0,2020-01-22,Afghanistan,0,0,0,0,0,0,0,Eastern Mediterranean
1,2020-01-22,Albania,0,0,0,0,0,0,0,Europe
2,2020-01-22,Algeria,0,0,0,0,0,0,0,Africa
3,2020-01-22,Andorra,0,0,0,0,0,0,0,Europe
4,2020-01-22,Angola,0,0,0,0,0,0,0,Africa
...,...,...,...,...,...,...,...,...,...,...
35151,2020-07-27,West Bank and Gaza,10621,78,3752,6791,152,2,0,Eastern Mediterranean
35152,2020-07-27,Western Sahara,10,1,8,1,0,0,0,Africa
35153,2020-07-27,Yemen,1691,483,833,375,10,4,36,Eastern Mediterranean
35154,2020-07-27,Zambia,4552,140,2815,1597,71,1,465,Africa


In [5]:
del df['WHO Region']

In [6]:
df

Unnamed: 0,Date,Country/Region,Confirmed,Deaths,Recovered,Active,New cases,New deaths,New recovered
0,2020-01-22,Afghanistan,0,0,0,0,0,0,0
1,2020-01-22,Albania,0,0,0,0,0,0,0
2,2020-01-22,Algeria,0,0,0,0,0,0,0
3,2020-01-22,Andorra,0,0,0,0,0,0,0
4,2020-01-22,Angola,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
35151,2020-07-27,West Bank and Gaza,10621,78,3752,6791,152,2,0
35152,2020-07-27,Western Sahara,10,1,8,1,0,0,0
35153,2020-07-27,Yemen,1691,483,833,375,10,4,36
35154,2020-07-27,Zambia,4552,140,2815,1597,71,1,465


read this for df.drop (more clear than use help(func)) --> https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html

In [9]:
df.drop(columns = ['New deaths','New recovered','New cases'],inplace = True)

In [10]:
df

Unnamed: 0,Date,Country/Region,Confirmed,Deaths,Recovered,Active
0,2020-01-22,Afghanistan,0,0,0,0
1,2020-01-22,Albania,0,0,0,0
2,2020-01-22,Algeria,0,0,0,0
3,2020-01-22,Andorra,0,0,0,0
4,2020-01-22,Angola,0,0,0,0
...,...,...,...,...,...,...
35151,2020-07-27,West Bank and Gaza,10621,78,3752,6791
35152,2020-07-27,Western Sahara,10,1,8,1
35153,2020-07-27,Yemen,1691,483,833,375
35154,2020-07-27,Zambia,4552,140,2815,1597


In [11]:
df.rename(columns = {'Country/Region':'Country'},inplace = True)

In [12]:
df

Unnamed: 0,Date,Country,Confirmed,Deaths,Recovered,Active
0,2020-01-22,Afghanistan,0,0,0,0
1,2020-01-22,Albania,0,0,0,0
2,2020-01-22,Algeria,0,0,0,0
3,2020-01-22,Andorra,0,0,0,0
4,2020-01-22,Angola,0,0,0,0
...,...,...,...,...,...,...
35151,2020-07-27,West Bank and Gaza,10621,78,3752,6791
35152,2020-07-27,Western Sahara,10,1,8,1
35153,2020-07-27,Yemen,1691,483,833,375
35154,2020-07-27,Zambia,4552,140,2815,1597


In [13]:
df['Date'] = pd.to_datetime(df['Date'])

In [14]:
df

Unnamed: 0,Date,Country,Confirmed,Deaths,Recovered,Active
0,2020-01-22,Afghanistan,0,0,0,0
1,2020-01-22,Albania,0,0,0,0
2,2020-01-22,Algeria,0,0,0,0
3,2020-01-22,Andorra,0,0,0,0
4,2020-01-22,Angola,0,0,0,0
...,...,...,...,...,...,...
35151,2020-07-27,West Bank and Gaza,10621,78,3752,6791
35152,2020-07-27,Western Sahara,10,1,8,1
35153,2020-07-27,Yemen,1691,483,833,375
35154,2020-07-27,Zambia,4552,140,2815,1597


In [15]:
df.head(20)

Unnamed: 0,Date,Country,Confirmed,Deaths,Recovered,Active
0,2020-01-22,Afghanistan,0,0,0,0
1,2020-01-22,Albania,0,0,0,0
2,2020-01-22,Algeria,0,0,0,0
3,2020-01-22,Andorra,0,0,0,0
4,2020-01-22,Angola,0,0,0,0
5,2020-01-22,Antigua and Barbuda,0,0,0,0
6,2020-01-22,Argentina,0,0,0,0
7,2020-01-22,Armenia,0,0,0,0
8,2020-01-22,Australia,0,0,0,0
9,2020-01-22,Austria,0,0,0,0


In [17]:
df.describe()

Unnamed: 0,Confirmed,Deaths,Recovered,Active
count,35156.0,35156.0,35156.0,35156.0
mean,23566.63,1234.068239,11048.13,11284.43
std,149981.8,7437.238354,64546.4,89971.49
min,0.0,0.0,0.0,-2.0
25%,1.0,0.0,0.0,0.0
50%,250.0,4.0,33.0,85.0
75%,3640.25,78.25,1286.25,1454.0
max,4290259.0,148011.0,1846641.0,2816444.0


In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35156 entries, 0 to 35155
Data columns (total 6 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Date       35156 non-null  datetime64[ns]
 1   Country    35156 non-null  object        
 2   Confirmed  35156 non-null  int64         
 3   Deaths     35156 non-null  int64         
 4   Recovered  35156 non-null  int64         
 5   Active     35156 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(1)
memory usage: 1.6+ MB


In [19]:
df

Unnamed: 0,Date,Country,Confirmed,Deaths,Recovered,Active
0,2020-01-22,Afghanistan,0,0,0,0
1,2020-01-22,Albania,0,0,0,0
2,2020-01-22,Algeria,0,0,0,0
3,2020-01-22,Andorra,0,0,0,0
4,2020-01-22,Angola,0,0,0,0
...,...,...,...,...,...,...
35151,2020-07-27,West Bank and Gaza,10621,78,3752,6791
35152,2020-07-27,Western Sahara,10,1,8,1
35153,2020-07-27,Yemen,1691,483,833,375
35154,2020-07-27,Zambia,4552,140,2815,1597


In [20]:
df.head(10)

Unnamed: 0,Date,Country,Confirmed,Deaths,Recovered,Active
0,2020-01-22,Afghanistan,0,0,0,0
1,2020-01-22,Albania,0,0,0,0
2,2020-01-22,Algeria,0,0,0,0
3,2020-01-22,Andorra,0,0,0,0
4,2020-01-22,Angola,0,0,0,0
5,2020-01-22,Antigua and Barbuda,0,0,0,0
6,2020-01-22,Argentina,0,0,0,0
7,2020-01-22,Armenia,0,0,0,0
8,2020-01-22,Australia,0,0,0,0
9,2020-01-22,Austria,0,0,0,0


In [35]:
df_country = df.groupby(['Country'])[['Country','Confirmed','Deaths','Recovered']].sum()

In [36]:
df_country

Unnamed: 0_level_0,Confirmed,Deaths,Recovered
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,1936390,49098,798240
Albania,196702,5708,118877
Algeria,1179755,77972,755897
Andorra,94404,5423,69074
Angola,22662,1078,6573
...,...,...,...
West Bank and Gaza,233461,1370,61124
Western Sahara,901,63,648
Yemen,67180,17707,23779
Zambia,129421,2643,83611


In [29]:
df_country.head(15)

Unnamed: 0_level_0,Confirmed,Deaths,Recovered
Country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,1936390,49098,798240
Albania,196702,5708,118877
Algeria,1179755,77972,755897
Andorra,94404,5423,69074
Angola,22662,1078,6573
Antigua and Barbuda,4487,326,2600
Argentina,4450658,97749,1680024
Armenia,1587173,27089,857482
Australia,960247,11387,711928
Austria,2034986,71390,1638380


In [30]:
df_country_date = df.groupby(['Date','Country']).sum()

In [31]:
df_country_date

Unnamed: 0_level_0,Unnamed: 1_level_0,Confirmed,Deaths,Recovered,Active
Date,Country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-22,Afghanistan,0,0,0,0
2020-01-22,Albania,0,0,0,0
2020-01-22,Algeria,0,0,0,0
2020-01-22,Andorra,0,0,0,0
2020-01-22,Angola,0,0,0,0
...,...,...,...,...,...
2020-07-27,West Bank and Gaza,10621,78,3752,6791
2020-07-27,Western Sahara,10,1,8,1
2020-07-27,Yemen,1691,483,833,375
2020-07-27,Zambia,4552,140,2815,1597


In [38]:
df_country = df_country.reset_index()

In [39]:
df_country

Unnamed: 0,Country,Confirmed,Deaths,Recovered
0,Afghanistan,1936390,49098,798240
1,Albania,196702,5708,118877
2,Algeria,1179755,77972,755897
3,Andorra,94404,5423,69074
4,Angola,22662,1078,6573
...,...,...,...,...
182,West Bank and Gaza,233461,1370,61124
183,Western Sahara,901,63,648
184,Yemen,67180,17707,23779
185,Zambia,129421,2643,83611


In [40]:
df_country.head(15)

Unnamed: 0,Country,Confirmed,Deaths,Recovered
0,Afghanistan,1936390,49098,798240
1,Albania,196702,5708,118877
2,Algeria,1179755,77972,755897
3,Andorra,94404,5423,69074
4,Angola,22662,1078,6573
5,Antigua and Barbuda,4487,326,2600
6,Argentina,4450658,97749,1680024
7,Armenia,1587173,27089,857482
8,Australia,960247,11387,711928
9,Austria,2034986,71390,1638380


In [41]:
df_country_date.head(15)

Unnamed: 0_level_0,Unnamed: 1_level_0,Confirmed,Deaths,Recovered,Active
Date,Country,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-01-22,Afghanistan,0,0,0,0
2020-01-22,Albania,0,0,0,0
2020-01-22,Algeria,0,0,0,0
2020-01-22,Andorra,0,0,0,0
2020-01-22,Angola,0,0,0,0
2020-01-22,Antigua and Barbuda,0,0,0,0
2020-01-22,Argentina,0,0,0,0
2020-01-22,Armenia,0,0,0,0
2020-01-22,Australia,0,0,0,0
2020-01-22,Austria,0,0,0,0


In [46]:
df3 = df_country[df_country['Deaths'] < 1000]

In [47]:
df3

Unnamed: 0,Country,Confirmed,Deaths,Recovered
5,Antigua and Barbuda,4487,326,2600
14,Barbados,10652,738,7444
17,Belize,2636,222,1618
19,Bhutan,4971,0,2838
22,Botswana,15306,120,2291
24,Brunei,18168,226,15262
27,Burma,25188,639,15476
28,Burundi,11351,106,7361
29,Cabo Verde,82732,854,38253
30,Cambodia,17079,0,13917
