<a href="https://colab.research.google.com/github/pchroscicki/data-science-bootcamp/blob/main/02_data_analysis/01_pandas_intro.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Pandas**

Pandas website: https://pandas.pydata.org/

Documentation: https://pandas.pydata.org/docs/

In [None]:
# installation
pip install pandas

In [None]:
import numpy as np
import pandas as pd
# show the version of the pandas package that was imported
pd.__version__

'1.3.5'

# Basic data structure: pd.Series

In [None]:
# Build a Series from a plain list; with no index given, the labels are 0..n-1.
s = pd.Series(
    [3, 2, 4, 6],
    name='sample',
)
s

0    3
1    2
2    4
3    6
Name: sample, dtype: int64

In [None]:
# Custom labels: pass one index entry per value.
s = pd.Series(
    [3, 2, 4, 6],
    index=['a', 'b', 'c', 'd'],
    name='sample',
)
s

a    3
b    2
c    4
d    6
Name: sample, dtype: int64

In [None]:
# to provide missing data: np.nan marks the missing entry
# NaN is a float, so the integer values are stored as float64 as well
s = pd.Series(data=[3, np.nan, 4, 6], index=['a', 'b', 'c', 'd'], name='sample')
s

a    3.0
b    NaN
c    4.0
d    6.0
Name: sample, dtype: float64

In [None]:
# A Series built only from booleans gets the bool dtype.
s = pd.Series(data=[True, False, True])
s

0     True
1    False
2     True
dtype: bool

In [18]:
# Dates as the index: the values 10..19 labelled with ten consecutive days.
s = pd.Series(
    data=np.arange(10, 20),
    index=pd.date_range(start='20221018', periods=10),
)
s

2022-10-18    10
2022-10-19    11
2022-10-20    12
2022-10-21    13
2022-10-22    14
2022-10-23    15
2022-10-24    16
2022-10-25    17
2022-10-26    18
2022-10-27    19
Freq: D, dtype: int64

In [19]:
# the index object holding the row labels (dates, for the Series built with pd.date_range)
s.index

DatetimeIndex(['2022-10-18', '2022-10-19', '2022-10-20', '2022-10-21',
               '2022-10-22', '2022-10-23', '2022-10-24', '2022-10-25',
               '2022-10-26', '2022-10-27'],
              dtype='datetime64[ns]', freq='D')

In [20]:
# iterating over the index yields its individual labels, collected here into a plain list
list(s.index)

[Timestamp('2022-10-18 00:00:00', freq='D'),
 Timestamp('2022-10-19 00:00:00', freq='D'),
 Timestamp('2022-10-20 00:00:00', freq='D'),
 Timestamp('2022-10-21 00:00:00', freq='D'),
 Timestamp('2022-10-22 00:00:00', freq='D'),
 Timestamp('2022-10-23 00:00:00', freq='D'),
 Timestamp('2022-10-24 00:00:00', freq='D'),
 Timestamp('2022-10-25 00:00:00', freq='D'),
 Timestamp('2022-10-26 00:00:00', freq='D'),
 Timestamp('2022-10-27 00:00:00', freq='D')]

In [21]:
# Text values: a Series of Python strings.
s = pd.Series(
    ['python', 'java', 'php'],
    name='languages',
)
s

0    python
1      java
2       php
Name: languages, dtype: object

Series attributes:

In [None]:
# confirm that s is a pandas Series object
type(s)

pandas.core.series.Series

In [17]:
# the row labels of s
s.index

RangeIndex(start=0, stop=3, step=1)

In [22]:
# the stored data as a NumPy array, without the index labels
s.values

array(['python', 'java', 'php'], dtype=object)

In [23]:
# data type of the stored values
s.dtype

dtype('O')

In [24]:
# tuple of dimensions; a Series is one-dimensional, so it holds only the length
s.shape

(3,)

In [25]:
# Series from a dictionary: the keys become the index labels, the values the data.
price = pd.Series(
    data={
        'PGE': 100,
        'Tauron': 80,
        'Energa': 85,
    }
)
price

PGE       100
Tauron     80
Energa     85
dtype: int64

In [28]:
# to get a value by its index label
price['PGE']

100

In [30]:
# or by position: use .iloc, because integer keys passed to [] on a
# label-indexed Series are treated as positions only through a deprecated fallback
price.iloc[0]

100

Basic Series methods:

In [35]:
# number of non-missing items
price.count()

3

In [36]:
# how many times each distinct value occurs in price
# (every value is unique here, so each count is 1)
price.value_counts()

100    1
80     1
85     1
dtype: int64

In [37]:
# total of all values
price.sum()

265

In [38]:
# smallest value
price.min()

80

In [39]:
# largest value
price.max()

100

In [40]:
# sample standard deviation (pandas divides by n - 1 by default)
price.std()

10.408329997330664

In [43]:
# the set of basic statistics: count, mean, std, min, quartiles and max
price.describe()

count      3.000000
mean      88.333333
std       10.408330
min       80.000000
25%       82.500000
50%       85.000000
75%       92.500000
max      100.000000
dtype: float64

In [48]:
# the two largest values with their labels, largest first
price.nlargest(2)

PGE       100
Energa     85
dtype: int64

In [47]:
# the two smallest values with their labels, smallest first
price.nsmallest(2)

Tauron    80
Energa    85
dtype: int64

In [49]:
# rank of each value in ascending order (1 = smallest); ranks are returned as floats
price.rank()

PGE       3.0
Tauron    1.0
Energa    2.0
dtype: float64

In [50]:
# sort by value, ascending by default; returns a new Series and leaves price unchanged
price.sort_values()

Tauron     80
Energa     85
PGE       100
dtype: int64

In [51]:
# sort by value, largest first
price.sort_values(ascending=False)

PGE       100
Energa     85
Tauron     80
dtype: int64

In [52]:
# to convert data in our object: apply() calls the function on every value
# and returns a new Series (price itself is not modified)
# NOTE: for plain arithmetic like this, the vectorized form `price * 3.8` gives the same values
price.apply(lambda x: x * 3.8)

PGE       380.0
Tauron    304.0
Energa    323.0
dtype: float64

# Basic data structure: pd.DataFrame

In [53]:
# A flat list becomes a single column; row and column labels default to integers.
df = pd.DataFrame([22, 10, 32])
df

Unnamed: 0,0
0,22
1,10
2,32


In [55]:
# Same data, now with explicit row labels and a named column.
df = pd.DataFrame(
    [22, 10, 32],
    index=['first', 'second', 'third'],
    columns=['col_1'],
)
df

Unnamed: 0,col_1
first,22
second,10
third,32


In [56]:
# From a dictionary: each key becomes a column, each list holds that column's values.
df = pd.DataFrame(
    {
        'Spain': ['Madrid', 'Barcelona'],
        'Germany': ['Berlin', 'Munich'],
    }
)
df

Unnamed: 0,Spain,Germany
0,Madrid,Berlin
1,Barcelona,Munich


In [57]:
# From a list of lists: each inner list is one row.
df = pd.DataFrame(
    [
        [11, 12, 10],
        [21, 22, 23],
    ],
    index=['first', 'second'],
    columns=['col_1', 'col_2', 'col_3'],
)
df

Unnamed: 0,col_1,col_2,col_3
first,11,12,10
second,21,22,23


In [58]:
# the column labels
df.columns

Index(['col_1', 'col_2', 'col_3'], dtype='object')

In [59]:
# the row labels
df.index

Index(['first', 'second'], dtype='object')

In [60]:
# the data as a two-dimensional NumPy array, without row or column labels
df.values

array([[11, 12, 10],
       [21, 22, 23]])

In [61]:
# printed summary: index, columns, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2 entries, first to second
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   col_1   2 non-null      int64
 1   col_2   2 non-null      int64
 2   col_3   2 non-null      int64
dtypes: int64(3)
memory usage: 172.0+ bytes


In [62]:
# basic statistics computed separately for each numeric column
df.describe()

Unnamed: 0,col_1,col_2,col_3
count,2.0,2.0,2.0
mean,16.0,17.0,16.5
std,7.071068,7.071068,9.192388
min,11.0,12.0,10.0
25%,13.5,14.5,13.25
50%,16.0,17.0,16.5
75%,18.5,19.5,19.75
max,21.0,22.0,23.0


In [63]:
# .T transposes the summary, so each column of df becomes a row
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
col_1,2.0,16.0,7.071068,11.0,13.5,16.0,18.5,21.0
col_2,2.0,17.0,7.071068,12.0,14.5,17.0,19.5,22.0
col_3,2.0,16.5,9.192388,10.0,13.25,16.5,19.75,23.0


# Selecting columns, rows and cells

In [64]:
# display the current DataFrame before selecting from it
df

Unnamed: 0,col_1,col_2,col_3
first,11,12,10
second,21,22,23


In [65]:
# single brackets with one label return the column as a Series
df['col_1']

first     11
second    21
Name: col_1, dtype: int64

In [66]:
# confirm: selecting with a single label gives a Series
type(df['col_1'])

pandas.core.series.Series

In [67]:
# a list of labels (double brackets) returns a DataFrame, even for a single column
df[['col_1']]

Unnamed: 0,col_1
first,11
second,21


In [68]:
# confirm: selecting with a list of labels gives a DataFrame
type(df[['col_1']])

pandas.core.frame.DataFrame

In [69]:
# the current column labels
df.columns

Index(['col_1', 'col_2', 'col_3'], dtype='object')

In [70]:
# to rename columns
# rename() returns a new frame and ignores labels that are absent, so this cell
# can be re-run safely even after more columns are added; assigning a fixed-length
# list to df.columns raises as soon as the number of columns changes
df = df.rename(columns={'col_1': 'A', 'col_2': 'B', 'col_3': 'C'})
df

Unnamed: 0,A,B,C
first,11,12,10
second,21,22,23


In [71]:
# attribute-style access to column 'A'; this works only when the label is a valid
# Python identifier that does not clash with a DataFrame attribute or method,
# whereas df['A'] always works
df.A

first     11
second    21
Name: A, dtype: int64

In [73]:
# to add a new column: 'D' holds the element-wise sum of columns 'A' and 'B'
df['D'] = df['A'] + df['B']
df

Unnamed: 0,A,B,C,D
first,11,12,10,23
second,21,22,23,43


In [76]:
# to select a row by its index label; the row comes back as a Series
df.loc['first']

A    11
B    12
C    10
D    23
Name: first, dtype: int64

In [77]:
# to select a row by its integer position (0 is the first row)
df.iloc[0]

A    11
B    12
C    10
D    23
Name: first, dtype: int64

In [80]:
# to select a single cell: row label first, then column label
df.loc['first', 'B']

12

In [82]:
# to select a column using the 'loc' indexer: ':' selects all rows
df.loc[:, 'B']

first     12
second    22
Name: B, dtype: int64