# Import pandas

In [1]:
import numpy as np
import pandas as pd

### Check version

In [2]:
print(pd.__version__)

2.3.3


# Pandas Series

- 1 dimensional data can be stored

- all elements of series have same data type

- one series is treated as a single column data

- series can have a name (just like column name)

- series can have index

## Creating Series

- with list : every element in list is one element in series


- with dictionary : key goes as index of series and value goes as element

### Create Series using list / tuple

In [3]:
# here default integer index is created
s1 = pd.Series([1, 3, 5, 6, 8])# check daa type of s1 : int
s2 =pd.Series([1, 3, 5, np.nan, 6, 8])# check daa type of s2 : float bcz of np.nan

In [4]:
s1

0    1
1    3
2    5
3    6
4    8
dtype: int64

In [5]:
s2

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [6]:
#all elements in a series can have
# ONLY single data type
s1 = pd.Series((1, 'IACSD', 5, np.nan, 6, 8))

s1

0        1
1    IACSD
2        5
3      NaN
4        6
5        8
dtype: object

###Create Series using dictionary

with dictionary : key goes as index of series and value goes as element

In [7]:
d= {1:'A',2:'B',3:'C',4:'D'}
s1 = pd.Series(d)
print(s1)
print("########################")
print("Another example")
d= {'A':500,'B':600,'C':700,'D':800}
s1 = pd.Series(d)
print(s1)

1    A
2    B
3    C
4    D
dtype: object
########################
Another example
A    500
B    600
C    700
D    800
dtype: int64


# Pandas Data Frame

- used for 2D data

- any tabular data can be handled using data frame

- can store data where every column has different data types

- efficient for column wise operations

- internally python stores dataframe like a dictionary

## create data frame



### Create data frame using list of list

In [8]:
data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)
print("###########")
df = pd.DataFrame(data,columns=['c1','c2','c3','c4','c5'])
print(df)
print("###########")
df = pd.DataFrame(data,columns=list('ABCDE'))
print(df)
print("###########")
df = pd.DataFrame(data,index=[101,102,103,104], columns=list('ABCDE'))
print(df)
print("###########")
df = pd.DataFrame(data,index=range(12,16), columns=list('ABCDE'))
print(df)

    0   1   2   3    4
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
###########
   c1  c2  c3  c4   c5
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
###########
    A   B   C   D    E
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
###########
      A   B   C   D    E
101   1   2   3   4    5
102   4   5   6  10   11
103   7   8   9  23   34
104  10  11  12  99  100
###########
     A   B   C   D    E
12   1   2   3   4    5
13   4   5   6  10   11
14   7   8   9  23   34
15  10  11  12  99  100


In [9]:
# No broad casting ... special case
data = [[1,2,3,4,5,6],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)
print("###########")
data = [[1],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)


    0   1   2   3    4    5
0   1   2   3   4    5  6.0
1   4   5   6  10   11  NaN
2   7   8   9  23   34  NaN
3  10  11  12  99  100  NaN
###########
    0     1     2     3      4
0   1   NaN   NaN   NaN    NaN
1   4   5.0   6.0  10.0   11.0
2   7   8.0   9.0  23.0   34.0
3  10  11.0  12.0  99.0  100.0


### Create data frame using dictionary

In [10]:
# in this dictionary one key will represent one column, and value of that key will
# contain all elements of that column
d= {'c1':[1,2,3],
    'c2':[77,88,99]}
df=pd.DataFrame(d)
print(df)

   c1  c2
0   1  77
1   2  88
2   3  99


In [11]:
df2 = pd.DataFrame(
    {
        "A": 1.0, # value 1 will broadcast to all rows of this column
        "B": pd.Timestamp("20220102"), # value here will broadcast to all rows of this column
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo", # value "foo" will broadcast to all rows of this column
    }
)

print(df2)
print("#####################")
#The columns of the resulting DataFrame have different dtypes:
print(df2.dtypes)

     A          B    C  D      E    F
0  1.0 2022-01-02  1.0  3   test  foo
1  1.0 2022-01-02  1.0  3  train  foo
2  1.0 2022-01-02  1.0  3   test  foo
3  1.0 2022-01-02  1.0  3  train  foo
#####################
A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object


In [12]:
#Creating a DataFrame by passing a NumPy array,
#with a datetime index using date_range()
#and labeled columns:

dates = pd.date_range("20230315",
                      periods=6)

dates

DatetimeIndex(['2023-03-15', '2023-03-16', '2023-03-17', '2023-03-18',
               '2023-03-19', '2023-03-20'],
              dtype='datetime64[ns]', freq='D')

In [13]:
from numpy.random import default_rng
rng = default_rng()

In [14]:
rng.standard_normal((6, 4) )

array([[-1.53652068, -0.93163762, -2.19135745, -1.81381809],
       [-0.20912472, -1.68798339, -0.16767259,  0.39464081],
       [ 0.94125667, -1.25166636, -1.56322143, -0.51123856],
       [ 0.88406939, -0.7899627 ,  0.53738516,  0.49638916],
       [-2.49775962,  0.42754357, -0.93798782, -1.27075309],
       [ 0.31445756, -0.50126069,  1.2087144 , -0.32706335]])

In [15]:
from numpy.random import default_rng
rng = default_rng()

df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

df

Unnamed: 0,A,B,C,D
2023-03-15,0.562813,-0.503511,-1.626071,-1.028033
2023-03-16,0.588759,0.231329,1.176012,0.708152
2023-03-17,0.874426,-0.696918,-0.266177,0.763773
2023-03-18,0.118405,0.330835,-1.180088,-0.108352
2023-03-19,-0.566701,0.127526,-0.674545,0.524382
2023-03-20,1.256688,-3.069458,-0.8586,-1.782999


## Head and Tail working

- df.head() : returns first 5 rows, if n is passed then first n rows

- df.tail() : returns last 5 rows, if n is passed then first n rows

In [16]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

# head
# get first 5 rows from dataframe
#
print(df.head())
print("#####################")
#tail
# get last 5 rows
# if n is passed then last n rows
print(df.tail(3))
print("#####################")
#index
print("Index values are ....")
print(df.index)
print("#####################")
#columns
print("columns values are ....")
print(df.columns)

                   A         B         C         D
2023-03-15 -0.324200  1.040187  0.616092 -0.372546
2023-03-16 -0.784709 -0.624334  1.746916 -0.341789
2023-03-17  1.776502 -1.615096  0.025708  2.026062
2023-03-18 -0.178123  1.456430  1.635170 -0.140263
2023-03-19  0.188204 -1.697725 -0.336002 -0.806778
#####################
                   A         B         C         D
2023-03-18 -0.178123  1.456430  1.635170 -0.140263
2023-03-19  0.188204 -1.697725 -0.336002 -0.806778
2023-03-20 -0.310428  1.409119  0.428665  1.847806
#####################
Index values are ....
DatetimeIndex(['2023-03-15', '2023-03-16', '2023-03-17', '2023-03-18',
               '2023-03-19', '2023-03-20'],
              dtype='datetime64[ns]', freq='D')
#####################
columns values are ....
Index(['A', 'B', 'C', 'D'], dtype='object')


## Data frame to numpy array

DataFrame.to_numpy() gives a NumPy representation
of the underlying data.

DataFrame.to_numpy() does not include
the index or column labels in the output.

Note that this can be an expensive operation
when your DataFrame has columns with different data types,
which comes down to a fundamental difference between
pandas and NumPy

NumPy arrays have one dtype for the entire array,
while pandas DataFrames have one dtype per column.
When you call DataFrame.to_numpy(), pandas will find
the NumPy dtype that can hold all of the dtypes
in the DataFrame. This may end up being object,
which requires casting every value to a Python object.


For DataFrame of all floating-point values,
DataFrame.to_numpy() is fast  
Also it doesnâ€™t require copying data

In [17]:
data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)
print("#####################")
n_arr = df.to_numpy()
print(type(n_arr))
print(n_arr)

    0   1   2   3    4
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
#####################
<class 'numpy.ndarray'>
[[  1   2   3   4   5]
 [  4   5   6  10  11]
 [  7   8   9  23  34]
 [ 10  11  12  99 100]]


# Descriptive Statistics *describe()* function

## Series Descriptive Statistics

In [18]:
s1= pd.Series([10,20,30,20,10])
s1.describe()

count     5.0000
mean     18.0000
std       8.3666
min      10.0000
25%      10.0000
50%      20.0000
75%      20.0000
max      30.0000
dtype: float64

## Dataframe Descriptive Statistics

In [19]:
data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)
print("#####################")
# statistic summary of data
print(df.describe())
print("#####################")
print("Transpose of DataFrame")
# Here index and column names are swapped
df.T

    0   1   2   3    4
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
#####################
               0          1          2          3           4
count   4.000000   4.000000   4.000000   4.000000    4.000000
mean    5.500000   6.500000   7.500000  34.000000   37.500000
std     3.872983   3.872983   3.872983  44.052998   43.500958
min     1.000000   2.000000   3.000000   4.000000    5.000000
25%     3.250000   4.250000   5.250000   8.500000    9.500000
50%     5.500000   6.500000   7.500000  16.500000   22.500000
75%     7.750000   8.750000   9.750000  42.000000   50.500000
max    10.000000  11.000000  12.000000  99.000000  100.000000
#####################
Transpose of DataFrame


Unnamed: 0,0,1,2,3
0,1,4,7,10
1,2,5,8,11
2,3,6,9,12
3,4,10,23,99
4,5,11,34,100


# Sorting



## Sort Dataframe values using given column or columns

- sort_value()

In [20]:
data = [[1,5,3,4,34],[4,2,6,10,34],[7,8,9,23,15],[10,11,12,99,15]] #4 by 5
df = pd.DataFrame(data, index=[101,105,110,102], columns=['A','E','D','C','B'])
print(df)
print("##############")
#or list of columns
df = df.sort_values(by="B")
print(df)
print("##############")
#or list of columns
df = df.sort_values(by=["B",'E'])
print(df)

      A   E   D   C   B
101   1   5   3   4  34
105   4   2   6  10  34
110   7   8   9  23  15
102  10  11  12  99  15
##############
      A   E   D   C   B
110   7   8   9  23  15
102  10  11  12  99  15
101   1   5   3   4  34
105   4   2   6  10  34
##############
      A   E   D   C   B
110   7   8   9  23  15
102  10  11  12  99  15
105   4   2   6  10  34
101   1   5   3   4  34


## Sort indexes by an axis

- sort_index()

- axis 0 is row direction

- axis 1 is column direction

In [21]:
data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)
print("##############")
print("Sort index on axis=1")
df.sort_index(axis=1)

    0   1   2   3    4
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
##############
Sort index on axis=1


Unnamed: 0,0,1,2,3,4
0,1,2,3,4,5
1,4,5,6,10,11
2,7,8,9,23,34
3,10,11,12,99,100


## Sort column names of a dataframe

- sort_index()

In [22]:
data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data, columns=['A','E','D','C','B'])
print(df)
print("##############")
print("Sort index on axis=1")
df.sort_index(axis=1, ascending=False)

    A   E   D   C    B
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
##############
Sort index on axis=1


Unnamed: 0,E,D,C,B,A
0,2,3,4,5,1
1,5,6,10,11,4
2,8,9,23,34,7
3,11,12,99,100,10


## Sort the row indexes of a data frame

- sort_index()

In [23]:
data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data, index=[101,105,110,102], columns=['A','E','D','C','B'])
print(df)
print("##############")
print("Sort index on axis=0")
df = df.sort_index(axis=0)
print(df)
print("##############")
print("Sort index on axis=0")
df =df.sort_index(axis=0, ascending=False)
print(df)

      A   E   D   C    B
101   1   2   3   4    5
105   4   5   6  10   11
110   7   8   9  23   34
102  10  11  12  99  100
##############
Sort index on axis=0
      A   E   D   C    B
101   1   2   3   4    5
102  10  11  12  99  100
105   4   5   6  10   11
110   7   8   9  23   34
##############
Sort index on axis=0
      A   E   D   C    B
110   7   8   9  23   34
105   4   5   6  10   11
102  10  11  12  99  100
101   1   2   3   4    5


# Practice

Q1 . Create a data frame from dictionary. Names of the columns are module names (3) and row labels (index) are roll nos (5). Enter data in following order.

Rollno(index)     SQL   Python   AA


101     

109

102

125

110

In [24]:
d= {
    'SQL':[10,20,30,35,25],
    'Python':[34,23,36,25,16],
    'AA':[30,28,32,16,25],
}
df = pd.DataFrame(d, index = [101,109,102,125,110])
print(df)

     SQL  Python  AA
101   10      34  30
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25


Q2 Sort all columns by their name

Q3 Print the data frame in a way that all roll nos are sorted

Q4 Print data frame in a way that, topper of python module comes as first row and then second topper and so on.

In [25]:
#Q2 Sort all columns by their name
print(df.sort_index(axis=1))
#Q3 Print the data frame in a way that all roll nos are sorted
print(df.sort_index(axis=0))
#Q4 Print data frame in a way that, topper of python module comes as first row and then second topper and so on.
print(df.sort_values(by='Python', ascending=False))


     AA  Python  SQL
101  30      34   10
109  28      23   20
102  32      36   30
125  16      25   35
110  25      16   25
     SQL  Python  AA
101   10      34  30
102   30      36  32
109   20      23  28
110   25      16  25
125   35      25  16
     SQL  Python  AA
102   30      36  32
101   10      34  30
125   35      25  16
109   20      23  28
110   25      16  25


# Inplace parameter in Pandas functions

when inplace = True , then original data frame is changed. No copy is created / returned

when inplace = False , then copy is created of given data frame and copy is modified and returned

By default inplace = False



Q5 Sort all columns by their name for original dataframe. Dont create a copy.

Q6 Print the data frame in a way that all roll nos are sorted

Q7 Print data frame in a way that, topper of python module comes as first row and then second topper and so on.


In [26]:
df.sort_index(axis=1, inplace =True)
print(df)
df.sort_index(axis=0, inplace =True)
print(df)
df.sort_values(by='Python', ascending=False, inplace = True)
print(df)

     AA  Python  SQL
101  30      34   10
109  28      23   20
102  32      36   30
125  16      25   35
110  25      16   25
     AA  Python  SQL
101  30      34   10
102  32      36   30
109  28      23   20
110  25      16   25
125  16      25   35
     AA  Python  SQL
102  32      36   30
101  30      34   10
125  16      25   35
109  28      23   20
110  25      16   25


# Accessing and Selecting Data from Dataframe

## Select a column

In [27]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print("Direct way df['A']")
print(df['A']) # note the data type  #df['B']
print("################")
print("loc : select column by name df.loc[:,'A']")
print(df.loc[:,'A'])  # df.loc[:,'B']
print("################")
print("iloc : select column by index df.iloc[:,0]")
print(df.iloc[:,0])

Direct way df['A']
2023-03-15    0.401310
2023-03-16    0.026383
2023-03-17   -1.219895
2023-03-18    0.957910
2023-03-19    0.517857
2023-03-20    1.140395
Freq: D, Name: A, dtype: float64
################
loc : select column by name df.loc[:,'A']
2023-03-15    0.401310
2023-03-16    0.026383
2023-03-17   -1.219895
2023-03-18    0.957910
2023-03-19    0.517857
2023-03-20    1.140395
Freq: D, Name: A, dtype: float64
################
iloc : select column by index df.iloc[:,0]
2023-03-15    0.401310
2023-03-16    0.026383
2023-03-17   -1.219895
2023-03-18    0.957910
2023-03-19    0.517857
2023-03-20    1.140395
Freq: D, Name: A, dtype: float64


## select a row by index

In [28]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(df)
print("loc: select a single row using label of row df.loc['2023-03-15']")
print(df.loc["2023-03-15"]) #df.loc["2023-03-19"]
print("################")
print("iloc: select a single row using index of row df.iloc[0]")
print(df.iloc[0]) #df.iloc[4]

                   A         B         C         D
2023-03-15 -0.515695 -1.244544  0.883421 -0.605345
2023-03-16  0.930785 -0.072008 -0.393178  0.400362
2023-03-17  1.010474 -0.169413 -1.067375 -1.334982
2023-03-18 -0.877237  1.414073 -2.221529  0.812233
2023-03-19  0.391428  0.128013  0.473117 -1.101454
2023-03-20 -0.379679  0.874225 -1.417166  0.187040
loc: select a single row using label of row df.loc['2023-03-15']
A   -0.515695
B   -1.244544
C    0.883421
D   -0.605345
Name: 2023-03-15 00:00:00, dtype: float64
################
iloc: select a single row using index of row df.iloc[0]
A   -0.515695
B   -1.244544
C    0.883421
D   -0.605345
Name: 2023-03-15 00:00:00, dtype: float64


# Slicing OR Finding subset of data frame OR Selecting multiple rows / columns

## Select Multiple columns

- by name (label)

- by index



In [29]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(df)
print("Select columns B and C by name df[['B','C']]")
# print(df[['B','C']])
print(df[['C','B']])
print("##############")
print("loc : Select columns B and C by name df.loc[:,['B','C']]")
print(df.loc[:,['B','C']])
print("##############")
print("iloc : Select columns B and C by index (end is excluded) df.iloc[:,1:3]")
print(df.iloc[:,1:3])
# print(df.iloc[:,0:4:2])
print("##############")
print("iloc : Select columns B and D by index (end is excluded) df.iloc[:,[1,3]]")
print(df.iloc[:,[1,3]])

                   A         B         C         D
2023-03-15 -0.609119 -2.436159  1.697707  0.098531
2023-03-16  1.595844 -0.963260  0.206932  0.073438
2023-03-17 -0.093983  0.311500  1.577698 -0.315126
2023-03-18 -0.194129  0.987571  0.773042 -1.072536
2023-03-19  0.675147 -0.866391  1.315413 -1.248997
2023-03-20  0.511274  0.108760  0.721966 -0.004692
Select columns B and C by name df[['B','C']]
                   C         B
2023-03-15  1.697707 -2.436159
2023-03-16  0.206932 -0.963260
2023-03-17  1.577698  0.311500
2023-03-18  0.773042  0.987571
2023-03-19  1.315413 -0.866391
2023-03-20  0.721966  0.108760
##############
loc : Select columns B and C by name df.loc[:,['B','C']]
                   B         C
2023-03-15 -2.436159  1.697707
2023-03-16 -0.963260  0.206932
2023-03-17  0.311500  1.577698
2023-03-18  0.987571  0.773042
2023-03-19 -0.866391  1.315413
2023-03-20  0.108760  0.721966
##############
iloc : Select columns B and C by index (end is excluded) df.iloc[:,1:3]
     

## Select rows

- by index

- by label

In [30]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(df)
print("Selecting by label df['20230315':'20230318'] end is included")
print(df["20230315":"20230318"]) #df["20230318":"20230319"]
print("##############")
print("loc: Selecting by label df.loc['20230315':'20230318', ] end is included")
print(df.loc["20230315":"20230318",]) #df.loc["20230318":"20230319",]
print("##############")
print("Selecting by index df[0:3] end is excluded")
print(df[0:3]) #df[3:5]
print("##############")
print("iloc: Selecting by index df.iloc[0:3,] end is excluded")
print(df.iloc[0:3,]) #df.iloc[3:5,]

                   A         B         C         D
2023-03-15  0.145906 -0.070012 -0.556862 -0.611549
2023-03-16  0.176727 -1.154827 -0.011250 -0.237611
2023-03-17  1.114981 -1.043230  0.524718 -0.306524
2023-03-18 -0.460735  0.342816  0.779510  0.064327
2023-03-19 -1.459569  0.183549 -0.537587  2.468148
2023-03-20  1.553411 -0.764141 -0.866946 -1.634666
Selecting by label df['20230315':'20230318'] end is included
                   A         B         C         D
2023-03-15  0.145906 -0.070012 -0.556862 -0.611549
2023-03-16  0.176727 -1.154827 -0.011250 -0.237611
2023-03-17  1.114981 -1.043230  0.524718 -0.306524
2023-03-18 -0.460735  0.342816  0.779510  0.064327
##############
loc: Selecting by label df.loc['20230315':'20230318', ] end is included
                   A         B         C         D
2023-03-15  0.145906 -0.070012 -0.556862 -0.611549
2023-03-16  0.176727 -1.154827 -0.011250 -0.237611
2023-03-17  1.114981 -1.043230  0.524718 -0.306524
2023-03-18 -0.460735  0.342816  0.77

# loc

construct for slicing DataFrame

this construct allows to access slice / part of the dataframe based on labels of rows or columns

row labels are user defined index and column labels are column names

labels are always strings

While slicing using loc, end is included

In [31]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(df)
print("##############")
print("Select rows from date 15-03-2023 to 18-03-2023 \
and columns A & B df.loc['20230315':'20230318', ['A', 'B']]")

print(df.loc["20230315":"20230318", ["A", "B"]])
print(df.loc["20230315":"20230318", 'A':'B'])#'A':'C'
print("##############")

print("Access Single cell (return scalar value) using loc df.loc[dates[0], 'A']")
print(df.loc[dates[0], 'A'])
print(df.loc["20230315", 'A'])

                   A         B         C         D
2023-03-15  1.215111  0.629724  0.514316 -1.259869
2023-03-16 -1.275910  0.794507 -0.577526  0.505288
2023-03-17  0.194388  0.593354  0.752407 -0.713596
2023-03-18 -1.756794  0.366831  3.103251  0.626787
2023-03-19  0.432924 -0.358352  0.018024  0.175535
2023-03-20  1.479065 -0.070153  1.869761 -2.150873
##############
Select rows from date 15-03-2023 to 18-03-2023 and columns A & B df.loc['20230315':'20230318', ['A', 'B']]
                   A         B
2023-03-15  1.215111  0.629724
2023-03-16 -1.275910  0.794507
2023-03-17  0.194388  0.593354
2023-03-18 -1.756794  0.366831
                   A         B
2023-03-15  1.215111  0.629724
2023-03-16 -1.275910  0.794507
2023-03-17  0.194388  0.593354
2023-03-18 -1.756794  0.366831
##############
Access Single cell (return scalar value) using loc df.loc[dates[0], 'A']
1.2151106978100452
1.2151106978100452


# iloc

construct for slicing DataFrame

Selection by index (position)

uses rows number(always start from 0) and column number(always start from 0)

This returns a data frame / series

End is excluded

In [32]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(df)
print("##############")
print(df.iloc[3:5, 0:2])
print(df.iloc[-3:, -3:]) #last three rows and last three columns
print(df.iloc[::2, ::2]) #alternate rows and alternate columns
print(df.iloc[::-1, ::-1]) #rows and columns in reverse order

                   A         B         C         D
2023-03-15 -2.001431 -0.834543 -0.063127  0.192228
2023-03-16 -1.473441  0.501312 -0.694549  0.915440
2023-03-17  0.003300  0.550876 -0.945166  0.479098
2023-03-18 -0.490522 -0.495192 -1.733299 -1.764600
2023-03-19 -0.877485  1.668735  1.769671  0.799279
2023-03-20  1.600098  2.284393  0.329841  0.179328
##############
                   A         B
2023-03-18 -0.490522 -0.495192
2023-03-19 -0.877485  1.668735
                   B         C         D
2023-03-18 -0.495192 -1.733299 -1.764600
2023-03-19  1.668735  1.769671  0.799279
2023-03-20  2.284393  0.329841  0.179328
                   A         C
2023-03-15 -2.001431 -0.063127
2023-03-17  0.003300 -0.945166
2023-03-19 -0.877485  1.769671
                   D         C         B         A
2023-03-20  0.179328  0.329841  2.284393  1.600098
2023-03-19  0.799279  1.769671  1.668735 -0.877485
2023-03-18 -1.764600 -1.733299 -0.495192 -0.490522
2023-03-17  0.479098 -0.945166  0.550876  0

# Access single cell in a efficient way

## at

returns a single value based on label

## iat

returns a single value based on index

In [33]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(df)
print("##############")
print(df.at['2023-03-15', 'A'])
print("##############")
print(df.iat[0, 0])

                   A         B         C         D
2023-03-15 -0.691378 -0.465891 -0.486504  1.078791
2023-03-16 -0.212571 -1.168554 -0.876202 -0.843539
2023-03-17 -0.853226 -0.069243 -1.534521 -0.862674
2023-03-18  0.929985  0.049846  0.508821 -0.445217
2023-03-19  1.154250  1.218252 -0.388011  1.641213
2023-03-20  0.434539 -0.922524  0.480297 -0.394166
##############
-0.6913777439006802
##############
-0.6913777439006802


# Practice

On previously used dataframe perform slicing operations

Q1. Select rows of last 3 students  
- use loc
- use iloc

Q2. Select python marks of roll no 101 and 110
- use loc
- use iloc

Q3. Print the row of student who is 3rd highest in AA



In [34]:
d= {
    'SQL':[10,20,30,35,25],
    'Python':[34,23,36,25,16],
    'AA':[30,28,32,16,25],
}
df = pd.DataFrame(d, index = [101,109,102,125,110])
print(df)
print("#############")
df.sort_values(by='AA',ascending=False,inplace=True)
print(df)
print("#############")
print(df.iloc[2])
print("#############")
print(df.sort_values(by='AA',ascending=False).iloc[2])

     SQL  Python  AA
101   10      34  30
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
     SQL  Python  AA
102   30      36  32
101   10      34  30
109   20      23  28
110   25      16  25
125   35      25  16
#############
SQL       20
Python    23
AA        28
Name: 109, dtype: int64
#############
SQL       20
Python    23
AA        28
Name: 109, dtype: int64


# Create shallow copy of data frame


In [35]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(id(df))

df2 = df.copy()
print(id(df2))

4580514240
4580512656


# Add new column
## Direct assignment


In [52]:
from numpy.random import default_rng
rng = default_rng()

df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

print(df)
print("############")
df["E"] = ["one", "one", "two",
           "three", "four", "three",
           ]
print(df)
print("############")
df['F']=10 # Works
print(df)
print("############")
df['G']=[10] # error
print(df)

                   A         B         C         D
2023-03-15  1.248195 -0.505563  0.426610  2.419116
2023-03-16 -1.122821 -0.577742 -0.578205  2.258088
2023-03-17  0.123904 -2.198121  1.246669  0.040219
2023-03-18 -1.040830  1.542680 -0.868927 -1.609849
2023-03-19 -0.044513  0.420112 -1.375499 -1.196354
2023-03-20 -1.131733 -0.007714 -0.290900 -0.414014
############
                   A         B         C         D      E
2023-03-15  1.248195 -0.505563  0.426610  2.419116    one
2023-03-16 -1.122821 -0.577742 -0.578205  2.258088    one
2023-03-17  0.123904 -2.198121  1.246669  0.040219    two
2023-03-18 -1.040830  1.542680 -0.868927 -1.609849  three
2023-03-19 -0.044513  0.420112 -1.375499 -1.196354   four
2023-03-20 -1.131733 -0.007714 -0.290900 -0.414014  three
############
                   A         B         C         D      E   F
2023-03-15  1.248195 -0.505563  0.426610  2.419116    one  10
2023-03-16 -1.122821 -0.577742 -0.578205  2.258088    one  10
2023-03-17  0.123904 -2.1

ValueError: Length of values (1) does not match length of index (6)

In [53]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

print(df)
print("############")
#Using Series object with index
s1 = pd.Series([1, 2, 3, 4, 5, 6],
               index=pd.date_range("20230316", periods=6))
print(s1)
print("############")
df["Z"] = s1
print(df)

                   A         B         C         D
2023-03-15 -2.463927 -0.823370 -0.412956 -0.042342
2023-03-16  0.712103 -0.883963  0.602754  0.307579
2023-03-17  0.366536 -1.071570  0.448037 -1.249096
2023-03-18 -0.729402  1.648112 -0.760186 -0.720723
2023-03-19  0.003523 -0.322434  2.263383 -0.707563
2023-03-20 -0.689136 -0.429605  1.471322 -1.176379
############
2023-03-16    1
2023-03-17    2
2023-03-18    3
2023-03-19    4
2023-03-20    5
2023-03-21    6
Freq: D, dtype: int64
############
                   A         B         C         D    Z
2023-03-15 -2.463927 -0.823370 -0.412956 -0.042342  NaN
2023-03-16  0.712103 -0.883963  0.602754  0.307579  1.0
2023-03-17  0.366536 -1.071570  0.448037 -1.249096  2.0
2023-03-18 -0.729402  1.648112 -0.760186 -0.720723  3.0
2023-03-19  0.003523 -0.322434  2.263383 -0.707563  4.0
2023-03-20 -0.689136 -0.429605  1.471322 -1.176379  5.0


# Boolean Indexing (Filtering)

In [54]:
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

print(df)
print("############")
print(df[df["A"] > 0])
print("############")
print(df[[True,True,False,False,True,True]])

                   A         B         C         D
2023-03-15  1.418085  0.914961  0.546006 -0.806188
2023-03-16  1.578903  0.977268 -0.911725 -0.357493
2023-03-17 -0.635983 -0.925203  1.600282 -0.433291
2023-03-18 -1.454588  0.622632  1.033492  0.705314
2023-03-19 -0.915025  0.489329 -0.757812 -0.582653
2023-03-20  0.588862  0.096639  0.527037 -0.495472
############
                   A         B         C         D
2023-03-15  1.418085  0.914961  0.546006 -0.806188
2023-03-16  1.578903  0.977268 -0.911725 -0.357493
2023-03-20  0.588862  0.096639  0.527037 -0.495472
############
                   A         B         C         D
2023-03-15  1.418085  0.914961  0.546006 -0.806188
2023-03-16  1.578903  0.977268 -0.911725 -0.357493
2023-03-19 -0.915025  0.489329 -0.757812 -0.582653
2023-03-20  0.588862  0.096639  0.527037 -0.495472


In [55]:
print(df["A"] > 0)

2023-03-15     True
2023-03-16     True
2023-03-17    False
2023-03-18    False
2023-03-19    False
2023-03-20     True
Freq: D, Name: A, dtype: bool


# Select Cells by condition

- Cells which fulfill condition are returned as it is
- Cells which don't fulfill condition are given value NaN

In [56]:
print(df > 0)

                A      B      C      D
2023-03-15   True   True   True  False
2023-03-16   True   True  False  False
2023-03-17  False  False   True  False
2023-03-18  False   True   True   True
2023-03-19  False   True  False  False
2023-03-20   True   True   True  False


In [57]:
print(df[df > 0])

                   A         B         C         D
2023-03-15  1.418085  0.914961  0.546006       NaN
2023-03-16  1.578903  0.977268       NaN       NaN
2023-03-17       NaN       NaN  1.600282       NaN
2023-03-18       NaN  0.622632  1.033492  0.705314
2023-03-19       NaN  0.489329       NaN       NaN
2023-03-20  0.588862  0.096639  0.527037       NaN


# Practice selecting using condition ( Boolean Indexing)

In [58]:
d= {
    'SQL':[10,20,30,35,25],
    'Python':[34,23,36,25,16],
    'AA':[30,28,32,16,25],
}
df = pd.DataFrame(d, index = [101,109,102,125,110])
print(df)
print("#############")
print("Select all students who have passed in SQL")
print(df[df['SQL'] > 16])
print("#############")
print("Select all students who have passed in SQL and Python")
#print(df[(df['SQL'] > 16) and (df['Python'] > 16)]) # Error
print(df[(df['SQL'] > 16) & (df['Python'] > 16)])
print("#############")
print("Select all students who have failed in SQL or AA")
#print(df[(df['SQL'] < 16) or (df['AA'] < 16)]) # Error
print(df[(df['SQL'] < 16) | (df['AA'] < 16)])

     SQL  Python  AA
101   10      34  30
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
Select all students who have passed in SQL
     SQL  Python  AA
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
Select all students who have passed in SQL and Python
     SQL  Python  AA
109   20      23  28
102   30      36  32
125   35      25  16
#############
Select all students who have failed in SQL or AA
     SQL  Python  AA
101   10      34  30


## Using loc

In [59]:
d= {
    'SQL':[10,20,30,35,25],
    'Python':[34,23,36,25,16],
    'AA':[30,28,32,16,25],
}
df = pd.DataFrame(d, index = [101,109,102,125,110])
print(df)
print("#############")
print("Select all students who have passed in SQL")
print(df.loc[df['SQL'] > 16])
print("#############")
print("Select all students who have passed in SQL and Python")
#print(df[(df['SQL'] > 16) and (df['Python'] > 16)]) # Error
print(df.loc[(df['SQL'] > 16) & (df['Python'] > 16)])
print("#############")
print("Select all students who have failed in SQL or AA")
#print(df[(df['SQL'] < 16) or (df['AA'] < 16)]) # Error
print(df.loc[(df['SQL'] < 16) | (df['AA'] < 16)])

     SQL  Python  AA
101   10      34  30
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
Select all students who have passed in SQL
     SQL  Python  AA
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
Select all students who have passed in SQL and Python
     SQL  Python  AA
109   20      23  28
102   30      36  32
125   35      25  16
#############
Select all students who have failed in SQL or AA
     SQL  Python  AA
101   10      34  30


In [60]:
d= {
    'SQL':[10,20,30,35,25],
    'Python':[34,23,36,25,16],
    'AA':[30,28,32,16,25],
}
df = pd.DataFrame(d, index = [101,109,102,125,110])
print(df)
print("#############")
print("Select all students who have failed in SQL or AA, display ONLY SQL and AA marks")
print(df.loc[(df['SQL'] < 16) | (df['AA'] < 16), ['SQL', 'AA'] ])

     SQL  Python  AA
101   10      34  30
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
Select all students who have failed in SQL or AA, display ONLY SQL and AA marks
     SQL  AA
101   10  30


# Filetering using multiple values

In [61]:
from numpy.random import default_rng
rng = default_rng()

df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

print(df)
print("############")
df["E"] = ["one", "one", "two",
           "three", "four", "three",
           ]
print(df)
print("############")
print(df[df["E"].isin(["two", "four"])])

                   A         B         C         D
2023-03-15  0.314316 -0.558767 -1.081031  0.956368
2023-03-16  0.282102 -1.142076  1.272702  0.376895
2023-03-17 -0.509361 -0.031946 -0.565356  0.363558
2023-03-18  0.207196  0.240176  1.061023  0.245718
2023-03-19  0.215928 -0.286981 -2.031291 -2.050522
2023-03-20 -1.113611  0.064240 -0.841268 -0.511979
############
                   A         B         C         D      E
2023-03-15  0.314316 -0.558767 -1.081031  0.956368    one
2023-03-16  0.282102 -1.142076  1.272702  0.376895    one
2023-03-17 -0.509361 -0.031946 -0.565356  0.363558    two
2023-03-18  0.207196  0.240176  1.061023  0.245718  three
2023-03-19  0.215928 -0.286981 -2.031291 -2.050522   four
2023-03-20 -1.113611  0.064240 -0.841268 -0.511979  three
############
                   A         B         C         D     E
2023-03-17 -0.509361 -0.031946 -0.565356  0.363558   two
2023-03-19  0.215928 -0.286981 -2.031291 -2.050522  four


In [62]:
df["E"].isin(["two", "four"])

2023-03-15    False
2023-03-16    False
2023-03-17     True
2023-03-18    False
2023-03-19     True
2023-03-20    False
Freq: D, Name: E, dtype: bool

In [63]:
("two", "four") in df["E"]

False

# Update single cell

###using label -> date and column "A"

In [64]:
from numpy.random import default_rng
rng = default_rng()

df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

print(df)
print("############")

df.at['2023-03-15', "A"] = 0

print(df)

#using index -> first row and third column
df.iat[0, 2] = 0

print(df)

# Using Numpy array
df.loc[:, "D"] = np.array([5] * len(df))
df.loc[:, "D"] = [5] * len(df)
print(df)

# Update using condition
df2 = df.copy()
df2['E']=10
df2[df2 <= 0] = -df2

print(df2)



                   A         B         C         D
2023-03-15  1.030283  0.546669 -1.188593  0.777930
2023-03-16  0.922685  2.227579 -0.047921 -0.350980
2023-03-17 -1.293558 -1.682110 -0.562648  0.320173
2023-03-18  0.690959 -0.651137  1.150021 -0.705561
2023-03-19 -1.194543 -0.457459 -0.504465  0.782767
2023-03-20  0.210749 -0.228316  0.143067 -0.932730
############
                   A         B         C         D
2023-03-15  0.000000  0.546669 -1.188593  0.777930
2023-03-16  0.922685  2.227579 -0.047921 -0.350980
2023-03-17 -1.293558 -1.682110 -0.562648  0.320173
2023-03-18  0.690959 -0.651137  1.150021 -0.705561
2023-03-19 -1.194543 -0.457459 -0.504465  0.782767
2023-03-20  0.210749 -0.228316  0.143067 -0.932730
                   A         B         C         D
2023-03-15  0.000000  0.546669  0.000000  0.777930
2023-03-16  0.922685  2.227579 -0.047921 -0.350980
2023-03-17 -1.293558 -1.682110 -0.562648  0.320173
2023-03-18  0.690959 -0.651137  1.150021 -0.705561
2023-03-19 -1.1945

# Mean Median Mode of All columns



### When all columns are numbers ( continuous)

In [65]:
from numpy.random import default_rng
rng = default_rng()

df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

print(df)
print("############")
print("Mean")
print(df.mean()) #df['A'].mean()
print("############")
print("Median")
print(df.median()) #df['B'].median()

                   A         B         C         D
2023-03-15  1.361364 -0.218172 -0.525638 -1.330914
2023-03-16 -0.165604  0.196295  0.172014 -0.385996
2023-03-17  0.646430  0.464159  0.307380 -1.242884
2023-03-18 -1.669446 -0.804492 -1.118109  0.490479
2023-03-19  0.141775 -0.423032 -1.029265 -0.939642
2023-03-20 -0.821680 -1.155767  1.474548  0.036547
############
Mean
A   -0.084527
B   -0.323501
C   -0.119845
D   -0.562068
dtype: float64
############
Median
A   -0.011915
B   -0.320602
C   -0.176812
D   -0.662819
dtype: float64


### When there is mix of categorical (String) and continuous ( number) columns

In [66]:
df1 = pd.DataFrame()
df1['cat1'] = ['A','B','B','B','C','D']
df1['cat2'] = [1,1,2,2,3,4]
df1['rno'] = [1,2,3,4,5,6]
df1['marks']= [23,34,39,16,10,25]
print(df1)
print("############")
print("Mean")
print(df1.iloc[:,2:].mean())
print("############")
print("Median")
print(df1.iloc[:,2:].median())
print("############")
print("Mode :: May return multiple values")
print(df1.mode())

  cat1  cat2  rno  marks
0    A     1    1     23
1    B     1    2     34
2    B     2    3     39
3    B     2    4     16
4    C     3    5     10
5    D     4    6     25
############
Mean
rno       3.5
marks    24.5
dtype: float64
############
Median
rno       3.5
marks    24.0
dtype: float64
############
Mode :: May return multiple values
  cat1  cat2  rno  marks
0    B   1.0    1     10
1  NaN   2.0    2     16
2  NaN   NaN    3     23
3  NaN   NaN    4     25
4  NaN   NaN    5     34
5  NaN   NaN    6     39
