# Pandas tutorials
This notebook introduces pandas and how it is used in data science and beyond.

Visit **www.fao.org** for datasets.

- How to install the libraries

In [1]:
# Run these once if the libraries are not installed yet
# (%pip installs into the environment of the running kernel):
# %pip install pandas
# %pip install numpy

- Import the libraries

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Object creation: a Series built from a plain list.
# The np.nan entry is a missing value, which is why the result is float64.
s = pd.Series(
    data=[1, 3, np.nan, 5, 7, 8, 9],
)
s

0    1.0
1    3.0
2    NaN
3    5.0
4    7.0
5    8.0
6    9.0
dtype: float64

In [4]:
# Build an index of six consecutive days, starting on 1 January 2013.
dates = pd.date_range(start="20130101", periods=6)
dates


DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# Row index for the frame: six consecutive days starting on 1 January 2013.
dates = pd.date_range("20130101", periods=6)

# Draw the values from a seeded generator so that every run of the notebook
# produces exactly the same numbers.
rng = np.random.default_rng(42)

# 6 rows x 4 columns of standard-normal values; the dates become the row index
# and the letters A-D become the column labels.
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.242427,-0.397012,1.826071,-1.345454
2013-01-02,0.740402,-1.146528,-0.742158,-0.622971
2013-01-03,-0.135706,1.042847,0.350512,1.217856
2013-01-04,0.274489,1.809252,0.277293,0.451268
2013-01-05,-1.498604,-2.019219,-0.555486,-0.188831
2013-01-06,-1.812429,-0.637126,-0.330771,1.20516


In [6]:
# Build a frame from a dict: every key becomes a column name and every value
# supplies that column's data. Scalar values are repeated on each of the four rows.
df2 = pd.DataFrame(
    data={
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.array([3, 3, 3, 3], dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [7]:
# Same dict-based construction with different values: every key becomes a
# column name and every value supplies that column's data.
df2 = pd.DataFrame(
    data={
        "A": 1.0,
        "B": pd.Timestamp("20120110"),  # 10 January 2012
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),  # row labels 0, 1, 2, 3
        "D": np.array([3, 3, 3, 3], dtype="int32"),
        "E": pd.Categorical(["boy", "man", "boy", "man"]),
        "F": "males",
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2012-01-10,1.0,3,boy,males
1,1.0,2012-01-10,1.0,3,man,males
2,1.0,2012-01-10,1.0,3,boy,males
3,1.0,2012-01-10,1.0,3,man,males


In [8]:
# Data type of every column in df2, the frame built in the previous cell.
# There is no need to rebuild the frame here: the columns were given different
# kinds of values, so each one reports its own dtype.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [9]:
# Preview the first five rows of the frame.
df.head(n=5)

Unnamed: 0,A,B,C,D
2013-01-01,0.242427,-0.397012,1.826071,-1.345454
2013-01-02,0.740402,-1.146528,-0.742158,-0.622971
2013-01-03,-0.135706,1.042847,0.350512,1.217856
2013-01-04,0.274489,1.809252,0.277293,0.451268
2013-01-05,-1.498604,-2.019219,-0.555486,-0.188831


In [10]:
# Preview only the first two rows.
df.head(n=2)

Unnamed: 0,A,B,C,D
2013-01-01,0.242427,-0.397012,1.826071,-1.345454
2013-01-02,0.740402,-1.146528,-0.742158,-0.622971


In [11]:
# Preview the last five rows of the frame.
df.tail(n=5)

Unnamed: 0,A,B,C,D
2013-01-02,0.740402,-1.146528,-0.742158,-0.622971
2013-01-03,-0.135706,1.042847,0.350512,1.217856
2013-01-04,0.274489,1.809252,0.277293,0.451268
2013-01-05,-1.498604,-2.019219,-0.555486,-0.188831
2013-01-06,-1.812429,-0.637126,-0.330771,1.20516


In [12]:
# Preview only the last two rows.
df.tail(n=2)

Unnamed: 0,A,B,C,D
2013-01-05,-1.498604,-2.019219,-0.555486,-0.188831
2013-01-06,-1.812429,-0.637126,-0.330771,1.20516


In [13]:
# Row labels (index) of df; you can also open it from the variables tab.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [14]:
# Row labels (index) of df2; you can also open it from the variables tab.
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [15]:
# Convert the df data frame into a NumPy array (the index and column labels are not part of it).
df.to_numpy()

array([[ 0.2424269 , -0.3970124 ,  1.82607112, -1.34545434],
       [ 0.74040245, -1.14652806, -0.74215771, -0.62297133],
       [-0.13570599,  1.04284705,  0.35051246,  1.21785623],
       [ 0.2744885 ,  1.80925192,  0.2772927 ,  0.45126801],
       [-1.49860428, -2.01921898, -0.55548594, -0.18883148],
       [-1.81242921, -0.63712622, -0.33077074,  1.2051605 ]])

In [16]:
# Convert the df2 data frame into a NumPy array. Its columns hold different
# kinds of values, and the resulting array is reported with dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2012-01-10 00:00:00'), 1.0, 3, 'boy', 'males'],
       [1.0, Timestamp('2012-01-10 00:00:00'), 1.0, 3, 'man', 'males'],
       [1.0, Timestamp('2012-01-10 00:00:00'), 1.0, 3, 'boy', 'males'],
       [1.0, Timestamp('2012-01-10 00:00:00'), 1.0, 3, 'man', 'males']],
      dtype=object)

In [17]:
# Summary statistics for each column: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.364904,-0.224631,0.137577,0.119505
std,1.042354,1.415167,0.936962,1.028352
min,-1.812429,-2.019219,-0.742158,-1.345454
25%,-1.15788,-1.019178,-0.499307,-0.514436
50%,0.05336,-0.517069,-0.026739,0.131218
75%,0.266473,0.682882,0.332208,1.016687
max,0.740402,1.809252,1.826071,1.217856


In [18]:
# Transpose the frame: the columns A-F become rows and the row labels become columns.
df2.transpose()

Unnamed: 0,0,1,2,3
A,1.0,1.0,1.0,1.0
B,2012-01-10 00:00:00,2012-01-10 00:00:00,2012-01-10 00:00:00,2012-01-10 00:00:00
C,1.0,1.0,1.0,1.0
D,3,3,3,3
E,boy,man,boy,man
F,males,males,males,males


In [19]:
# Sort by an axis: here the rows are ordered by their index labels, ascending.
# Try axis=1 to order the columns by name instead, or ascending=False to
# reverse the order.
df.sort_index(axis="index", ascending=True)

Unnamed: 0,A,B,C,D
2013-01-01,0.242427,-0.397012,1.826071,-1.345454
2013-01-02,0.740402,-1.146528,-0.742158,-0.622971
2013-01-03,-0.135706,1.042847,0.350512,1.217856
2013-01-04,0.274489,1.809252,0.277293,0.451268
2013-01-05,-1.498604,-2.019219,-0.555486,-0.188831
2013-01-06,-1.812429,-0.637126,-0.330771,1.20516


In [20]:
# Sort the rows by the values in column B (smallest first).
df.sort_values("B")

Unnamed: 0,A,B,C,D
2013-01-05,-1.498604,-2.019219,-0.555486,-0.188831
2013-01-02,0.740402,-1.146528,-0.742158,-0.622971
2013-01-06,-1.812429,-0.637126,-0.330771,1.20516
2013-01-01,0.242427,-0.397012,1.826071,-1.345454
2013-01-03,-0.135706,1.042847,0.350512,1.217856
2013-01-04,0.274489,1.809252,0.277293,0.451268


In [21]:
# Sort the rows by column B again, this time largest first.
# Use ascending=True to get the smallest value first.
df.sort_values("B", ascending=False)

Unnamed: 0,A,B,C,D
2013-01-04,0.274489,1.809252,0.277293,0.451268
2013-01-03,-0.135706,1.042847,0.350512,1.217856
2013-01-01,0.242427,-0.397012,1.826071,-1.345454
2013-01-06,-1.812429,-0.637126,-0.330771,1.20516
2013-01-02,0.740402,-1.146528,-0.742158,-0.622971
2013-01-05,-1.498604,-2.019219,-0.555486,-0.188831


In [22]:
# Select a single column: column A comes back as a Series.
df["A"]

2013-01-01    0.242427
2013-01-02    0.740402
2013-01-03   -0.135706
2013-01-04    0.274489
2013-01-05   -1.498604
2013-01-06   -1.812429
Freq: D, Name: A, dtype: float64

In [23]:
# Select a single column: column B comes back as a Series.
df["B"]

2013-01-01   -0.397012
2013-01-02   -1.146528
2013-01-03    1.042847
2013-01-04    1.809252
2013-01-05   -2.019219
2013-01-06   -0.637126
Freq: D, Name: B, dtype: float64

In [24]:
# Row-wise selection by position: rows 0 up to (not including) 6, i.e. all six rows.
df.iloc[0:6]

Unnamed: 0,A,B,C,D
2013-01-01,0.242427,-0.397012,1.826071,-1.345454
2013-01-02,0.740402,-1.146528,-0.742158,-0.622971
2013-01-03,-0.135706,1.042847,0.350512,1.217856
2013-01-04,0.274489,1.809252,0.277293,0.451268
2013-01-05,-1.498604,-2.019219,-0.555486,-0.188831
2013-01-06,-1.812429,-0.637126,-0.330771,1.20516


In [25]:
### Selection by label
# Cross-section for a single label: the row of the first date (1 January 2013),
# with all of its columns.
df.loc[dates[0], :]

A    0.242427
B   -0.397012
C    1.826071
D   -1.345454
Name: 2013-01-01 00:00:00, dtype: float64

In [26]:
# Selecting on both axes by label: ":" keeps every row, the list picks columns A and B.
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
2013-01-01,0.242427,-0.397012
2013-01-02,0.740402,-1.146528
2013-01-03,-0.135706,1.042847
2013-01-04,0.274489,1.809252
2013-01-05,-1.498604,-2.019219
2013-01-06,-1.812429,-0.637126


In [27]:
# Label slicing: both endpoints are included, so this returns the rows for
# 2 to 4 January with columns A and B. A range is written with ":" and does
# not need a list.
df.loc["20130102":"20130104", ["A", "B"]]


Unnamed: 0,A,B
2013-01-02,0.740402,-1.146528
2013-01-03,-0.135706,1.042847
2013-01-04,0.274489,1.809252


In [28]:
# Selecting specific labels with a list (not a slice): only the two listed
# dates, 2 and 4 January, are returned, with columns A and B.
df.loc[["20130102","20130104"], ["A", "B"]]

Unnamed: 0,A,B
2013-01-02,0.740402,-1.146528
2013-01-04,0.274489,1.809252


In [29]:
# Selecting specific labels with a list (not a slice): only the two listed
# dates, 2 and 4 January, are returned, with columns A, B and C.
df.loc[["20130102","20130104"], ["A", "B", "C"]]

Unnamed: 0,A,B,C
2013-01-02,0.740402,-1.146528,-0.742158
2013-01-04,0.274489,1.809252,0.277293


In [31]:
# One date and three columns: a single row label (not in a list) returns a Series.
df.loc["20130102", ["A", "B", "C"]]

A    0.740402
B   -1.146528
C   -0.742158
Name: 2013-01-02 00:00:00, dtype: float64

In [32]:
# One date and three columns: wrapping the row label in a list keeps the result a one-row DataFrame.
df.loc[["20130102"], ["A", "B", "C"]]

Unnamed: 0,A,B,C
2013-01-02,0.740402,-1.146528,-0.742158


- Selection by position

In [33]:
# Select by integer position: position 3 is the fourth row (2013-01-04).
df.iloc[3]

A    0.274489
B    1.809252
C    0.277293
D    0.451268
Name: 2013-01-04 00:00:00, dtype: float64