# Import pandas

In [None]:
import numpy as np
import pandas as pd

### Check version

In [None]:
# Print the version of the installed pandas package (2.2.2 in the recorded run)
print(pd.__version__)

2.2.2


# Pandas Series

- 1 dimensional data can be stored

- all elements of series have same data type

- one series is treated as a single column data

- series can have a name (just like column name)

- series can have index

## Creating Series

- with list : every element in list is one element in series


- with dictionary : key goes as index of series and value goes as element

### Create Series using list / tuple

In [None]:
# No index is passed, so pandas creates the default integer index (0, 1, 2, ...).
# s1 contains only integers, so its dtype is an integer type.
s1 = pd.Series(data=[1, 3, 5, 6, 8])
# s2 contains np.nan, which is a float, so the whole Series is stored as float.
s2 = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

In [None]:
# A bare expression on the last line of a cell is shown as the cell's output
s1

Unnamed: 0,0
0,1
1,3
2,5
3,6
4,8


In [None]:
# Display s2; the np.nan element is the missing value in the output
s2

Unnamed: 0,0
0,1.0
1,3.0
2,5.0
3,
4,6.0
5,8.0


In [None]:
# A Series has exactly ONE dtype for all of its elements.
# When the values are of mixed types (ints, a str and a float NaN), pandas
# stores them under a single common dtype that can hold all of them.
s1 = pd.Series(
    data=(1, "IACSD", 5, np.nan, 6, 8),
)

s1

Unnamed: 0,0
0,1
1,IACSD
2,5
3,
4,6
5,8


### Create Series using dictionary

with dictionary : key goes as index of series and value goes as element

In [None]:
# Dictionary keys become the index labels, dictionary values become the elements.
d = {
    1: "A",
    2: "B",
    3: "C",
    4: "D",
}
s1 = pd.Series(data=d)
print(s1)
print("########################")
print("Another example")
# This time the keys are strings, so the index holds string labels.
d = {
    "A": 500,
    "B": 600,
    "C": 700,
    "D": 800,
}
s1 = pd.Series(data=d)
print(s1)

1    A
2    B
3    C
4    D
dtype: object
########################
Another example
A    500
B    600
C    700
D    800
dtype: int64


# Pandas Data Frame

- used for 2D data

- any tabular data can be handled using data frame

- can store data where every column has different data types

- efficient for column wise operations

- conceptually, a data frame behaves like a dictionary of columns: each column name (key) maps to one Series (value)

## create data frame



### Create data frame using list of list

In [None]:
# Each inner list is one row: 4 rows x 5 columns.
data = [
    [1, 2, 3, 4, 5],
    [4, 5, 6, 10, 11],
    [7, 8, 9, 23, 34],
    [10, 11, 12, 99, 100],
]

# No labels given: rows and columns both get the default labels 0, 1, 2, ...
df = pd.DataFrame(data=data)
print(df)
print("###########")

# Explicit column names.
df = pd.DataFrame(data=data, columns=["c1", "c2", "c3", "c4", "c5"])
print(df)
print("###########")

# Single-letter column names (the same list that list('ABCDE') produces).
df = pd.DataFrame(data=data, columns=["A", "B", "C", "D", "E"])
print(df)
print("###########")

# Explicit row labels (index) together with column names.
df = pd.DataFrame(
    data=data,
    index=[101, 102, 103, 104],
    columns=["A", "B", "C", "D", "E"],
)
print(df)
print("###########")

# Row labels taken from a range: 12, 13, 14, 15 (the end value 16 is excluded).
df = pd.DataFrame(
    data=data,
    index=range(12, 16),
    columns=["A", "B", "C", "D", "E"],
)
print(df)

    0   1   2   3    4
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
###########
   c1  c2  c3  c4   c5
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
###########
    A   B   C   D    E
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
###########
      A   B   C   D    E
101   1   2   3   4    5
102   4   5   6  10   11
103   7   8   9  23   34
104  10  11  12  99  100
###########
     A   B   C   D    E
12   1   2   3   4    5
13   4   5   6  10   11
14   7   8   9  23   34
15  10  11  12  99  100


In [None]:
# Special case: rows of unequal length are NOT broadcast.
# The frame gets as many columns as the longest row, and the shorter rows are
# padded with NaN.

# First row has 6 values, the other rows have 5 -> 4 x 6 frame.
data = [
    [1, 2, 3, 4, 5, 6],
    [4, 5, 6, 10, 11],
    [7, 8, 9, 23, 34],
    [10, 11, 12, 99, 100],
]
df = pd.DataFrame(data=data)
print(df)
print("###########")

# First row has a single value, the other rows have 5 -> 4 x 5 frame whose
# first row is NaN from the second column onwards.
data = [
    [1],
    [4, 5, 6, 10, 11],
    [7, 8, 9, 23, 34],
    [10, 11, 12, 99, 100],
]
df = pd.DataFrame(data=data)
print(df)


    0   1   2   3    4    5
0   1   2   3   4    5  6.0
1   4   5   6  10   11  NaN
2   7   8   9  23   34  NaN
3  10  11  12  99  100  NaN
###########
    0     1     2     3      4
0   1   NaN   NaN   NaN    NaN
1   4   5.0   6.0  10.0   11.0
2   7   8.0   9.0  23.0   34.0
3  10  11.0  12.0  99.0  100.0


### Create data frame using dictionary

In [None]:
# Each key of the dictionary is one column name; the list stored under that
# key holds all the values of that column, top to bottom.
d = {
    "c1": [1, 2, 3],
    "c2": [77, 88, 99],
}
df = pd.DataFrame(data=d)
print(df)

   c1  c2
0   1  77
1   2  88
2   3  99


In [None]:
df2 = pd.DataFrame(
    data={
        # A scalar is broadcast to every row of its column.
        "A": 1.0,
        "B": pd.Timestamp("20220102"),
        # The array-like values below all have 4 elements -> the frame has 4 rows.
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.full(4, 3, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        # A string scalar is broadcast just like a number.
        "F": "foo",
    }
)

print(df2)
print("#####################")
# Every column of the resulting DataFrame keeps its own dtype.
print(df2.dtypes)

     A          B    C  D      E    F
0  1.0 2022-01-02  1.0  3   test  foo
1  1.0 2022-01-02  1.0  3  train  foo
2  1.0 2022-01-02  1.0  3   test  foo
3  1.0 2022-01-02  1.0  3  train  foo
#####################
A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object


In [None]:
# Build a DatetimeIndex of 6 consecutive days starting on 2023-03-15.
# It is used below as the row index of DataFrames that are created from a
# NumPy array with labelled columns.
dates = pd.date_range(start="20230315", periods=6)

dates

DatetimeIndex(['2023-03-15', '2023-03-16', '2023-03-17', '2023-03-18',
               '2023-03-19', '2023-03-20'],
              dtype='datetime64[ns]', freq='D')

In [None]:
from numpy.random import default_rng

# Seed the generator so that "Restart Kernel -> Run All" produces the same
# random numbers every time; an unseeded default_rng() gives different values
# on each run.
SEED = 42
rng = default_rng(SEED)

In [None]:
# Draw a 6 x 4 NumPy array of samples from the standard normal distribution
rng.standard_normal((6, 4) )

array([[-1.11995051, -0.53838898, -1.04681858,  1.22578911],
       [ 0.70869599, -1.67609742, -0.44793587,  1.14303425],
       [ 1.61176757, -0.76983725,  0.03680904, -0.90170292],
       [ 0.2764417 ,  1.27015469,  0.04127856,  0.47971645],
       [-0.13364471,  0.63615078,  0.39388305,  1.18684628],
       [-1.49523011,  0.26520475,  0.43420157,  0.03455783]])

In [None]:
# `rng` (random generator) and `dates` (DatetimeIndex) were created in the
# cells above; they are reused here instead of importing and creating the
# generator a second time.
df = pd.DataFrame(
    rng.standard_normal((6, 4)),  # 6 rows x 4 columns of random values
    index=dates,                  # one row per date
    columns=list("ABCD"),         # column labels A, B, C, D
)

df

Unnamed: 0,A,B,C,D
2023-03-15,-0.773017,-0.952838,-1.33852,-1.882393
2023-03-16,-0.898021,-1.422006,-1.846542,1.535625
2023-03-17,2.253954,0.224243,0.363217,0.325554
2023-03-18,-1.273215,-0.455794,-0.450241,-2.622759
2023-03-19,0.272745,0.30402,-0.338413,-1.335398
2023-03-20,-1.516768,-1.037966,-1.467199,1.250175


## Head and Tail working

- df.head() : returns first 5 rows, if n is passed then first n rows

- df.tail() : returns last 5 rows, if n is passed then last n rows

In [None]:
df = pd.DataFrame(
    rng.standard_normal((6, 4)),
    index=dates,
    columns=list("ABCD"),
)

# head(): first n rows of the frame; n defaults to 5
print(df.head(n=5))
print("#####################")
# tail(): last n rows of the frame (5 by default); here the last 3 rows
print(df.tail(n=3))
print("#####################")
# index: the row labels
print("Index values are ....")
print(df.index)
print("#####################")
# columns: the column labels
print("columns values are ....")
print(df.columns)

                   A         B         C         D
2023-03-15 -0.734405 -0.233968  0.864200 -0.256050
2023-03-16 -0.567865 -1.183234 -0.399905  0.590984
2023-03-17  0.685723 -0.867329 -0.431074  0.000712
2023-03-18 -0.002965 -2.522733 -1.225694 -1.036687
2023-03-19 -1.524804 -1.152961  1.241059  0.793256
#####################
                   A         B         C         D
2023-03-18 -0.002965 -2.522733 -1.225694 -1.036687
2023-03-19 -1.524804 -1.152961  1.241059  0.793256
2023-03-20 -1.193338 -0.499026 -0.111536 -0.493964
#####################
Index values are ....
DatetimeIndex(['2023-03-15', '2023-03-16', '2023-03-17', '2023-03-18',
               '2023-03-19', '2023-03-20'],
              dtype='datetime64[ns]', freq='D')
#####################
columns values are ....
Index(['A', 'B', 'C', 'D'], dtype='object')


## Data frame to numpy array

DataFrame.to_numpy() gives a NumPy representation
of the underlying data.

DataFrame.to_numpy() does not include
the index or column labels in the output.

Note that this can be an expensive operation
when your DataFrame has columns with different data types,
which comes down to a fundamental difference between
pandas and NumPy

NumPy arrays have one dtype for the entire array,
while pandas DataFrames have one dtype per column.
When you call DataFrame.to_numpy(), pandas will find
the NumPy dtype that can hold all of the dtypes
in the DataFrame. This may end up being object,
which requires casting every value to a Python object.


For DataFrame of all floating-point values,
DataFrame.to_numpy() is fast  
Also it doesn’t require copying data

In [None]:
# 4 rows x 5 columns, all integers.
data = [
    [1, 2, 3, 4, 5],
    [4, 5, 6, 10, 11],
    [7, 8, 9, 23, 34],
    [10, 11, 12, 99, 100],
]
df = pd.DataFrame(data=data)
print(df)
print("#####################")
# to_numpy() returns only the values: index and column labels are not included.
n_arr = df.to_numpy()
print(type(n_arr))
print(n_arr)

    0   1   2   3    4
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
#####################
<class 'numpy.ndarray'>
[[  1   2   3   4   5]
 [  4   5   6  10  11]
 [  7   8   9  23  34]
 [ 10  11  12  99 100]]


# Descriptive Statistics *describe()* function

## Series Descriptive Statistics

In [None]:
# describe() on a numeric Series reports count, mean, std, min, the
# 25% / 50% / 75% percentiles and max.
s1 = pd.Series(data=[10, 20, 30, 20, 10])
s1.describe()

count     5.0000
mean     18.0000
std       8.3666
min      10.0000
25%      10.0000
50%      20.0000
75%      20.0000
max      30.0000
dtype: float64

## Dataframe Descriptive Statistics

In [None]:
# 4 rows x 5 columns.
data = [
    [1, 2, 3, 4, 5],
    [4, 5, 6, 10, 11],
    [7, 8, 9, 23, 34],
    [10, 11, 12, 99, 100],
]
df = pd.DataFrame(data=data)
print(df)
print("#####################")
# describe(): statistical summary computed separately for every column
print(df.describe())
print("#####################")
print("Transpose of DataFrame")
# .T swaps rows and columns: row labels become column labels and vice versa
df.T

    0   1   2   3    4
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
#####################
               0          1          2          3           4
count   4.000000   4.000000   4.000000   4.000000    4.000000
mean    5.500000   6.500000   7.500000  34.000000   37.500000
std     3.872983   3.872983   3.872983  44.052998   43.500958
min     1.000000   2.000000   3.000000   4.000000    5.000000
25%     3.250000   4.250000   5.250000   8.500000    9.500000
50%     5.500000   6.500000   7.500000  16.500000   22.500000
75%     7.750000   8.750000   9.750000  42.000000   50.500000
max    10.000000  11.000000  12.000000  99.000000  100.000000
#####################
Transpose of DataFrame


Unnamed: 0,0,1,2,3
0,1,4,7,10
1,2,5,8,11
2,3,6,9,12
3,4,10,23,99
4,5,11,34,100


# Sorting



## Sort Dataframe values using given column or columns

- sort_values()

In [None]:
# 4 rows x 5 columns; row labels and column names are deliberately unsorted.
data = [
    [1, 5, 3, 4, 34],
    [4, 2, 6, 10, 34],
    [7, 8, 9, 23, 15],
    [10, 11, 12, 99, 15],
]
df = pd.DataFrame(
    data=data,
    index=[101, 105, 110, 102],
    columns=["A", "E", "D", "C", "B"],
)
print(df)
print("##############")
# `by` accepts a single column name ...
df = df.sort_values(by="B")
print(df)
print("##############")
# ... or a list of columns: rows are ordered by B first, ties in B are ordered by E
df = df.sort_values(by=["B", "E"])
print(df)

      A   E   D   C   B
101   1   5   3   4  34
105   4   2   6  10  34
110   7   8   9  23  15
102  10  11  12  99  15
##############
      A   E   D   C   B
110   7   8   9  23  15
102  10  11  12  99  15
101   1   5   3   4  34
105   4   2   6  10  34
##############
      A   E   D   C   B
110   7   8   9  23  15
102  10  11  12  99  15
105   4   2   6  10  34
101   1   5   3   4  34


## Sort indexes by an axis

- sort_index()

- axis 0 is row direction

- axis 1 is column direction

In [None]:
data = [[1,2,3,4,5],[4,5,6,10,11],[7,8,9,23,34],[10,11,12,99,100]] #4 by 5
df = pd.DataFrame(data)
print(df)
print("##############")
print("Sort index on axis=1")
# axis=1 sorts the column labels. The default labels 0..4 are already in
# ascending order, so an ascending sort would change nothing; sort in
# descending order to make the effect visible (columns become 4, 3, 2, 1, 0).
# sort_index returns a new frame; df itself is left unchanged.
df_cols_desc = df.sort_index(axis=1, ascending=False)
df_cols_desc

    0   1   2   3    4
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
##############
Sort index on axis=1


Unnamed: 0,4,3,2,1,0
0,5,4,3,2,1
1,11,10,6,5,4
2,34,23,9,8,7
3,100,99,12,11,10


## Sort column names of a dataframe

- sort_index()

In [None]:
# 4 rows x 5 columns; the column names are deliberately not in alphabetical order.
data = [
    [1, 2, 3, 4, 5],
    [4, 5, 6, 10, 11],
    [7, 8, 9, 23, 34],
    [10, 11, 12, 99, 100],
]
df = pd.DataFrame(data=data, columns=["A", "E", "D", "C", "B"])
print(df)
print("##############")
print("Sort index on axis=1")
# axis=1 sorts the column labels; ascending=False gives reverse alphabetical order
df.sort_index(axis=1, ascending=False)

    A   E   D   C    B
0   1   2   3   4    5
1   4   5   6  10   11
2   7   8   9  23   34
3  10  11  12  99  100
##############
Sort index on axis=1


Unnamed: 0,E,D,C,B,A
0,2,3,4,5,1
1,5,6,10,11,4
2,8,9,23,34,7
3,11,12,99,100,10


## Sort the row indexes of a data frame

- sort_index()

In [None]:
# 4 rows x 5 columns; the row labels are deliberately not in order.
data = [
    [1, 2, 3, 4, 5],
    [4, 5, 6, 10, 11],
    [7, 8, 9, 23, 34],
    [10, 11, 12, 99, 100],
]
df = pd.DataFrame(
    data=data,
    index=[101, 105, 110, 102],
    columns=["A", "E", "D", "C", "B"],
)
print(df)
print("##############")
print("Sort index on axis=0")
# axis=0 sorts the row labels (ascending by default)
df = df.sort_index(axis=0)
print(df)
print("##############")
print("Sort index on axis=0")
# same, but with the largest row label first
df = df.sort_index(axis=0, ascending=False)
print(df)

      A   E   D   C    B
101   1   2   3   4    5
105   4   5   6  10   11
110   7   8   9  23   34
102  10  11  12  99  100
##############
Sort index on axis=0
      A   E   D   C    B
101   1   2   3   4    5
102  10  11  12  99  100
105   4   5   6  10   11
110   7   8   9  23   34
##############
Sort index on axis=0
      A   E   D   C    B
110   7   8   9  23   34
105   4   5   6  10   11
102  10  11  12  99  100
101   1   2   3   4    5


# Practice

Q1 . Create a data frame from dictionary. Names of the columns are module names (3) and row labels (index) are roll nos (5). Enter data in following order.

Rollno(index)     SQL   Python   AA


101     

109

102

125

110

In [None]:
# One key per module (column); each list holds the marks of the five students
# in the order of the roll nos passed as index.
d = {
    "SQL": [10, 20, 30, 35, 25],
    "Python": [34, 23, 36, 25, 16],
    "AA": [30, 28, 32, 16, 25],
}
df = pd.DataFrame(data=d, index=[101, 109, 102, 125, 110])
print(df)

     SQL  Python  AA
101   10      34  30
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25


Q2 Sort all columns by their name

Q3 Print the data frame in a way that all roll nos are sorted

Q4 Print data frame in a way that, topper of python module comes as first row and then second topper and so on.

In [None]:
# Every call below returns a new, sorted frame; df itself is not modified.

# Q2 Sort all columns by their name
print(df.sort_index(axis="columns"))
# Q3 Print the data frame in a way that all roll nos are sorted
print(df.sort_index(axis="index"))
# Q4 Print data frame in a way that, topper of python module comes as first
# row and then second topper and so on (highest Python marks first).
print(df.sort_values(by="Python", ascending=False))


     AA  Python  SQL
101  30      34   10
109  28      23   20
102  32      36   30
125  16      25   35
110  25      16   25
     SQL  Python  AA
101   10      34  30
102   30      36  32
109   20      23  28
110   25      16  25
125   35      25  16
     SQL  Python  AA
102   30      36  32
101   10      34  30
125   35      25  16
109   20      23  28
110   25      16  25


# Inplace parameter in Pandas functions

when inplace = True , then original data frame is changed. No copy is created / returned

when inplace = False , then copy is created of given data frame and copy is modified and returned

By default inplace = False



Q5 Sort all columns by their name for the original dataframe. Don't create a copy.

Q6 Print the data frame in a way that all roll nos are sorted

Q7 Print data frame in a way that, topper of python module comes as first row and then second topper and so on.


In [None]:
# Every call below passes inplace=True, so `df` itself is modified and there
# is no returned frame to assign.
# NOTE(review): because this cell mutates `df`, running it a second time starts
# from the already-sorted frame, so the first two printouts will then differ
# from a fresh run.

# Q5: sort the columns by name, modifying df
df.sort_index(axis=1, inplace =True)
print(df)
# Q6: sort the rows by roll no (row index), modifying df
df.sort_index(axis=0, inplace =True)
print(df)
# Q7: highest Python marks first, modifying df
df.sort_values(by='Python', ascending=False, inplace = True)
print(df)

     AA  Python  SQL
101  30      34   10
102  32      36   30
109  28      23   20
110  25      16   25
125  16      25   35
     AA  Python  SQL
101  30      34   10
102  32      36   30
109  28      23   20
110  25      16   25
125  16      25   35
     AA  Python  SQL
102  32      36   30
101  30      34   10
125  16      25   35
109  28      23   20
110  25      16   25


# Accessing and Selecting Data from Dataframe

## Select a column

In [None]:
# Fresh 6 x 4 frame of random values: rows labelled by `dates`, columns A-D.
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
# The three selections below all return column A.
print("Direct way df['A']")
print(df['A']) # a single column comes back as a Series (see "Name: A" in the output); try df['B'] too
print("################")
print("loc : select column by name df.loc[:,'A']")
print(df.loc[:,'A'])  # ':' selects all rows, 'A' is the column label; try df.loc[:,'B'] too
print("################")
print("iloc : select column by index df.iloc[:,0]")
print(df.iloc[:,0])  # ':' selects all rows, 0 is the position of column A

Direct way df['A']
2023-03-15   -1.974303
2023-03-16   -0.707547
2023-03-17   -0.731980
2023-03-18   -0.304451
2023-03-19    1.784891
2023-03-20    1.529478
Freq: D, Name: A, dtype: float64
################
loc : select column by name df.loc[:,'A']
2023-03-15   -1.974303
2023-03-16   -0.707547
2023-03-17   -0.731980
2023-03-18   -0.304451
2023-03-19    1.784891
2023-03-20    1.529478
Freq: D, Name: A, dtype: float64
################
iloc : select column by index df.iloc[:,0]
2023-03-15   -1.974303
2023-03-16   -0.707547
2023-03-17   -0.731980
2023-03-18   -0.304451
2023-03-19    1.784891
2023-03-20    1.529478
Freq: D, Name: A, dtype: float64


## select a row by index

In [None]:
# Fresh 6 x 4 frame of random values: rows labelled by `dates`, columns A-D.
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(df)
print("loc: select a single row using label of row df.loc['2023-03-15']")
# The row is returned as a Series whose index is the column names; try df.loc["2023-03-19"] too
print(df.loc["2023-03-15"]) #df.loc["2023-03-19"]
print("################")
print("iloc: select a single row using index of row df.iloc[0]")
# Position 0 is the first row (2023-03-15), so this prints the same row; try df.iloc[4] too
print(df.iloc[0]) #df.iloc[4]

                   A         B         C         D
2023-03-15  0.490366 -1.977146  0.520489  0.067270
2023-03-16 -0.961447 -0.258676  0.650839 -0.229323
2023-03-17 -1.250487 -0.448569  0.094181  0.242238
2023-03-18 -1.062208 -0.504535  0.970077  0.810959
2023-03-19  1.458703 -0.728796  0.438042  1.103970
2023-03-20 -0.182294 -0.084809  0.593151 -2.031427
loc: select a single row using label of row df.loc['2023-03-15']
A    0.490366
B   -1.977146
C    0.520489
D    0.067270
Name: 2023-03-15 00:00:00, dtype: float64
################
iloc: select a single row using index of row df.iloc[0]
A    0.490366
B   -1.977146
C    0.520489
D    0.067270
Name: 2023-03-15 00:00:00, dtype: float64


# Slicing OR Finding subset of data frame OR Selecting multiple rows / columns

## Select Multiple columns

- by name (label)

- by index



In [None]:
# Fresh 6 x 4 frame of random values: rows labelled by `dates`, columns A-D.
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(df)
# A list of column names selects those columns in the order given in the list
# (df[['B','C']] would return the same columns in the order B, C).
print("Select columns C and B by name df[['C','B']] (columns come back in the listed order)")
print(df[['C','B']])
print("##############")
print("loc : Select columns B and C by name df.loc[:,['B','C']]")
print(df.loc[:,['B','C']])
print("##############")
# Positional slice 1:3 -> positions 1 and 2; a step also works, e.g.
# df.iloc[:,0:4:2] selects positions 0 and 2 (columns A and C).
print("iloc : Select columns B and C by index (end is excluded) df.iloc[:,1:3]")
print(df.iloc[:,1:3])
print("##############")
# A list of positions selects exactly the listed positions; there is no
# start/end here, so nothing is excluded.
print("iloc : Select columns B and D by a list of positions df.iloc[:,[1,3]]")
print(df.iloc[:,[1,3]])

                   A         B         C         D
2023-03-15  0.645005 -0.303031 -1.684309  0.988249
2023-03-16  1.115271 -0.244842 -1.223847 -1.725761
2023-03-17  0.149822  0.919864 -2.556801 -0.690972
2023-03-18  0.289644  0.999481 -0.470858  0.973040
2023-03-19 -0.519457  0.033502 -0.006660 -0.321440
2023-03-20 -0.278491  1.153410 -0.535359 -1.029903
Select columns B and C by name df[['B','C']]
                   C         B
2023-03-15 -1.684309 -0.303031
2023-03-16 -1.223847 -0.244842
2023-03-17 -2.556801  0.919864
2023-03-18 -0.470858  0.999481
2023-03-19 -0.006660  0.033502
2023-03-20 -0.535359  1.153410
##############
loc : Select columns B and C by name df.loc[:,['B','C']]
                   B         C
2023-03-15 -0.303031 -1.684309
2023-03-16 -0.244842 -1.223847
2023-03-17  0.919864 -2.556801
2023-03-18  0.999481 -0.470858
2023-03-19  0.033502 -0.006660
2023-03-20  1.153410 -0.535359
##############
iloc : Select columns B and C by index (end is excluded) df.iloc[:,1:3]
     

## Select rows

- by index

- by label

In [None]:
# Fresh 6 x 4 frame of random values: rows labelled by `dates`, columns A-D.
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(df)
# Label-based slices (date strings) include the end label.
print("Selecting by label df['20230315':'20230318'] end is included")
print(df["20230315":"20230318"]) # try also: df["20230318":"20230319"]
print("##############")
print("loc: Selecting by label df.loc['20230315':'20230318', ] end is included")
print(df.loc["20230315":"20230318",]) # try also: df.loc["20230318":"20230319",]
print("##############")
# Position-based slices (integers) exclude the end position, like Python lists.
print("Selecting by index df[0:3] end is excluded")
print(df[0:3]) # try also: df[3:5]
print("##############")
print("iloc: Selecting by index df.iloc[0:3,] end is excluded")
print(df.iloc[0:3,]) # try also: df.iloc[3:5,]

                   A         B         C         D
2023-03-15  1.228965  0.621362 -0.333370 -0.231063
2023-03-16 -0.338434  2.228747  0.347402 -1.857681
2023-03-17  0.841055  1.094406 -0.803335 -0.719541
2023-03-18 -1.256443 -1.138627 -0.512212 -2.596556
2023-03-19 -1.845131  0.112687 -1.332939 -0.617029
2023-03-20  0.795890  0.755342 -0.572457  0.807755
Selecting by label df['20230315':'20230318'] end is included
                   A         B         C         D
2023-03-15  1.228965  0.621362 -0.333370 -0.231063
2023-03-16 -0.338434  2.228747  0.347402 -1.857681
2023-03-17  0.841055  1.094406 -0.803335 -0.719541
2023-03-18 -1.256443 -1.138627 -0.512212 -2.596556
##############
loc: Selecting by label df.loc['20230315':'20230318', ] end is included
                   A         B         C         D
2023-03-15  1.228965  0.621362 -0.333370 -0.231063
2023-03-16 -0.338434  2.228747  0.347402 -1.857681
2023-03-17  0.841055  1.094406 -0.803335 -0.719541
2023-03-18 -1.256443 -1.138627 -0.51

# loc

construct for slicing DataFrame

this construct allows to access slice / part of the dataframe based on labels of rows or columns

row labels are user defined index and column labels are column names

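labels are not necessarily strings: they can be integers (e.g. roll nos 101, 102), strings (e.g. 'A', 'B') or dates; for a date index, date strings such as '20230315' can be used as labels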

While slicing using loc, end is included

In [None]:
# Fresh 6 x 4 frame of random values: rows labelled by `dates`, columns A-D.
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(df)
print("##############")
print("Select rows from date 15-03-2023 to 18-03-2023 \
and columns A & B df.loc['20230315':'20230318', ['A', 'B']]")

# Columns given as a list of labels ...
print(df.loc["20230315":"20230318", ["A", "B"]])
# ... or as a label slice; the end label 'B' is included (try 'A':'C' too)
print(df.loc["20230315":"20230318", 'A':'B'])#'A':'C'
print("##############")

print("Access Single cell (return scalar value) using loc df.loc[dates[0], 'A']")
# The row label can be given as the Timestamp itself or as a date string;
# both print the same value.
print(df.loc[dates[0], 'A'])
print(df.loc["20230315", 'A'])

                   A         B         C         D
2023-03-15  2.347928 -1.265759  0.063573 -0.672446
2023-03-16 -0.376202  0.535693  1.612005  0.137367
2023-03-17  0.166560  0.975644 -2.286344 -0.326026
2023-03-18  0.619917  1.196075 -0.145592  0.324286
2023-03-19  0.504473 -0.988761 -0.455904 -0.860992
2023-03-20 -0.042869  0.731840  0.304015 -1.221846
##############
Select rows from date 15-03-2023 to 18-03-2023 and columns A & B df.loc['20230315':'20230318', ['A', 'B']]
                   A         B
2023-03-15  2.347928 -1.265759
2023-03-16 -0.376202  0.535693
2023-03-17  0.166560  0.975644
2023-03-18  0.619917  1.196075
                   A         B
2023-03-15  2.347928 -1.265759
2023-03-16 -0.376202  0.535693
2023-03-17  0.166560  0.975644
2023-03-18  0.619917  1.196075
##############
Access Single cell (return scalar value) using loc df.loc[dates[0], 'A']
2.3479276232603037
2.3479276232603037


# iloc

construct for slicing DataFrame

Selection by index (position)

uses the row number (always starts from 0) and the column number (always starts from 0)

This returns a data frame / series

End is excluded

In [None]:
df = pd.DataFrame(
    rng.standard_normal((6, 4)),
    index=dates,
    columns=list("ABCD"),
)
print(df)
print("##############")
# rows at positions 3 and 4, columns at positions 0 and 1 (end positions are excluded)
print(df.iloc[3:5, 0:2])
# last three rows and last three columns
print(df.iloc[-3:, -3:])
# alternate rows and alternate columns (step of 2)
print(df.iloc[::2, ::2])
# rows and columns in reverse order (step of -1)
print(df.iloc[::-1, ::-1])

                   A         B         C         D
2023-03-15 -1.262039 -0.120589  0.408593  1.084532
2023-03-16  0.079205 -1.950142  0.522771 -0.741448
2023-03-17 -0.408310  1.738291 -0.383810 -0.867889
2023-03-18 -1.775494  1.079105 -0.399122 -0.676271
2023-03-19  0.036376  2.378114  0.240831  0.319463
2023-03-20 -1.821443  1.031151  0.025871 -1.169785
##############
                   A         B
2023-03-18 -1.775494  1.079105
2023-03-19  0.036376  2.378114
                   B         C         D
2023-03-18  1.079105 -0.399122 -0.676271
2023-03-19  2.378114  0.240831  0.319463
2023-03-20  1.031151  0.025871 -1.169785
                   A         C
2023-03-15 -1.262039  0.408593
2023-03-17 -0.408310 -0.383810
2023-03-19  0.036376  0.240831
                   D         C         B         A
2023-03-20 -1.169785  0.025871  1.031151 -1.821443
2023-03-19  0.319463  0.240831  2.378114  0.036376
2023-03-18 -0.676271 -0.399122  1.079105 -1.775494
2023-03-17 -0.867889 -0.383810  1.738291 -0

# Access single cell in an efficient way

## at

returns a single value based on label

## iat

returns a single value based on index

In [None]:
df = pd.DataFrame(
    rng.standard_normal((6, 4)),
    index=dates,
    columns=list("ABCD"),
)
print(df)
print("##############")
# at: one cell, addressed by row label and column label
print(df.at["2023-03-15", "A"])
print("##############")
# iat: the same cell, addressed by row position and column position
print(df.iat[0, 0])

                   A         B         C         D
2023-03-15 -1.703672  1.121624  0.339477 -0.276671
2023-03-16 -0.765231  0.368803 -0.673569  0.632657
2023-03-17 -0.102623  0.805663  0.773946 -0.319639
2023-03-18 -0.641633  0.192660  0.013624  0.209108
2023-03-19  0.469935 -1.549331 -0.003246  0.919703
2023-03-20 -1.300250 -0.693520 -0.957958  0.566247
##############
-1.703672141872292
##############
-1.703672141872292


# Practice

On previously used dataframe perform slicing operations

Q1. Select rows of last 3 students  
- use loc
- use iloc

Q2. Select python marks of roll no 101 and 110
- use loc
- use iloc

Q3. Print the row of student who is 3rd highest in AA



In [None]:
d = {
    "SQL": [10, 20, 30, 35, 25],
    "Python": [34, 23, 36, 25, 16],
    "AA": [30, 28, 32, 16, 25],
}
df = pd.DataFrame(data=d, index=[101, 109, 102, 125, 110])
print(df)
print("#############")
# Q3, approach 1: sort df itself by AA (highest first) ...
df.sort_values(by="AA", ascending=False, inplace=True)
print(df)
print("#############")
# ... then the 3rd highest is the row at position 2
print(df.iloc[2])
print("#############")
# Q3, approach 2: sort and pick the row in one expression (df is not modified by this line)
print(df.sort_values(by="AA", ascending=False).iloc[2])

     SQL  Python  AA
101   10      34  30
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
     SQL  Python  AA
102   30      36  32
101   10      34  30
109   20      23  28
110   25      16  25
125   35      25  16
#############
SQL       20
Python    23
AA        28
Name: 109, dtype: int64
#############
SQL       20
Python    23
AA        28
Name: 109, dtype: int64


# Create a copy of data frame (`copy()` makes a deep copy by default; use `copy(deep=False)` for a shallow copy)


In [None]:
# Fresh 6 x 4 frame of random values: rows labelled by `dates`, columns A-D.
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))
print(id(df))

# copy() returns a new DataFrame object, so id(df2) differs from id(df).
# NOTE(review): DataFrame.copy() defaults to deep=True, i.e. the data is copied
# as well; a shallow copy requires df.copy(deep=False).
df2 = df.copy()
print(id(df2))

138574702067232
138574696711744


# Add new column
## Direct assignment


In [None]:
# `rng` and `dates` were created in earlier cells and are reused here.
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

print(df)
print("############")
# A list with exactly one value per row becomes the new column.
df["E"] = ["one", "one", "two",
           "three", "four", "three",
           ]
print(df)
print("############")
df['F']=10 # Works: a scalar is broadcast to every row
print(df)
print("############")
# A list is NOT broadcast: its length must match the number of rows.
# The ValueError is the point of this demonstration, so catch and print it;
# otherwise the exception would stop "Run All" at this cell.
try:
    df['G']=[10] # error
except ValueError as err:
    print("ValueError:", err)
# Column G was not added.
print(df)

                   A         B         C         D
2023-03-15 -2.581553  2.173781  0.022824  1.040360
2023-03-16  0.243397 -0.194690 -0.338433 -0.707829
2023-03-17 -1.599397 -0.599044 -0.454963 -1.603958
2023-03-18  1.724712 -0.047533 -0.543499  0.457185
2023-03-19  1.123144 -1.232555  1.241939 -0.850618
2023-03-20 -1.367577 -1.824416 -0.347196  0.611211
############
                   A         B         C         D      E
2023-03-15 -2.581553  2.173781  0.022824  1.040360    one
2023-03-16  0.243397 -0.194690 -0.338433 -0.707829    one
2023-03-17 -1.599397 -0.599044 -0.454963 -1.603958    two
2023-03-18  1.724712 -0.047533 -0.543499  0.457185  three
2023-03-19  1.123144 -1.232555  1.241939 -0.850618   four
2023-03-20 -1.367577 -1.824416 -0.347196  0.611211  three
############
                   A         B         C         D      E   F
2023-03-15 -2.581553  2.173781  0.022824  1.040360    one  10
2023-03-16  0.243397 -0.194690 -0.338433 -0.707829    one  10
2023-03-17 -1.599397 -0.5

ValueError: Length of values (1) does not match length of index (6)

In [None]:
df = pd.DataFrame(
    rng.standard_normal((6, 4)),
    index=dates,
    columns=list("ABCD"),
)

print(df)
print("############")
# A Series with its own date index, starting on 2023-03-16.
s1 = pd.Series(
    [1, 2, 3, 4, 5, 6],
    index=pd.date_range(start="20230316", periods=6),
)
print(s1)
print("############")
# Assigning a Series aligns on the index labels: rows of df whose label is
# missing from s1 get NaN, and labels of s1 that are not in df are ignored.
df["Z"] = s1
print(df)

                   A         B         C         D
2023-03-15 -0.471095 -1.067597 -0.240380 -0.351255
2023-03-16 -0.139209 -0.118059 -1.282438  0.652031
2023-03-17  0.521983  1.404277  1.857252 -0.410051
2023-03-18  0.389458  0.142553  0.769469 -0.974145
2023-03-19  1.321648 -0.234852  1.121825 -0.726641
2023-03-20  0.782560 -0.682396 -0.905124  0.203923
############
2023-03-16    1
2023-03-17    2
2023-03-18    3
2023-03-19    4
2023-03-20    5
2023-03-21    6
Freq: D, dtype: int64
############
                   A         B         C         D    Z
2023-03-15 -0.471095 -1.067597 -0.240380 -0.351255  NaN
2023-03-16 -0.139209 -0.118059 -1.282438  0.652031  1.0
2023-03-17  0.521983  1.404277  1.857252 -0.410051  2.0
2023-03-18  0.389458  0.142553  0.769469 -0.974145  3.0
2023-03-19  1.321648 -0.234852  1.121825 -0.726641  4.0
2023-03-20  0.782560 -0.682396 -0.905124  0.203923  5.0


# Boolean Indexing (Filtering)

In [None]:
df = pd.DataFrame(
    rng.standard_normal((6, 4)),
    index=dates,
    columns=list("ABCD"),
)

print(df)
print("############")
# Keep only the rows in which the value of column A is positive.
print(df[df["A"] > 0])
print("############")
# A plain list of booleans (one per row) works as a mask too: True keeps the row.
print(df[[True, True, False, False, True, True]])

                   A         B         C         D
2023-03-15 -0.060126 -0.977845  1.594433  0.710496
2023-03-16  0.875255 -0.517599  0.915081 -0.869396
2023-03-17  0.927425  0.365534  0.591119 -1.024993
2023-03-18  0.333082 -0.668700  1.091995  2.339789
2023-03-19 -1.427012 -1.160465  0.931178  0.186815
2023-03-20 -0.100060 -0.069816 -0.037062  0.864942
############
                   A         B         C         D
2023-03-16  0.875255 -0.517599  0.915081 -0.869396
2023-03-17  0.927425  0.365534  0.591119 -1.024993
2023-03-18  0.333082 -0.668700  1.091995  2.339789
############
                   A         B         C         D
2023-03-15 -0.060126 -0.977845  1.594433  0.710496
2023-03-16  0.875255 -0.517599  0.915081 -0.869396
2023-03-19 -1.427012 -1.160465  0.931178  0.186815
2023-03-20 -0.100060 -0.069816 -0.037062  0.864942


In [None]:
# The comparison itself yields a boolean Series (one True/False per row);
# this is the mask that df[...] uses to filter rows
print(df["A"] > 0)

2023-03-15    False
2023-03-16    False
2023-03-17     True
2023-03-18     True
2023-03-19     True
2023-03-20     True
Freq: D, Name: A, dtype: bool


# Select Cells by condition

- Cells which fulfill condition are returned as it is
- Cells which don't fulfill condition are given value NaN

In [None]:
# Comparing the whole frame yields a boolean DataFrame with one True/False per cell
print(df > 0)

                A      B      C      D
2023-03-15  False  False   True   True
2023-03-16   True  False   True  False
2023-03-17   True   True   True  False
2023-03-18   True  False   True   True
2023-03-19  False  False   True   True
2023-03-20  False  False  False   True


In [None]:
# Indexing with a boolean DataFrame keeps the shape: cells where the mask is
# True keep their value, all other cells become NaN
print(df[df > 0])

                   A         B         C         D
2023-03-15       NaN       NaN  1.594433  0.710496
2023-03-16  0.875255       NaN  0.915081       NaN
2023-03-17  0.927425  0.365534  0.591119       NaN
2023-03-18  0.333082       NaN  1.091995  2.339789
2023-03-19       NaN       NaN  0.931178  0.186815
2023-03-20       NaN       NaN       NaN  0.864942


# Practice selecting using condition ( Boolean Indexing)

In [None]:
d = {
    "SQL": [10, 20, 30, 35, 25],
    "Python": [34, 23, 36, 25, 16],
    "AA": [30, 28, 32, 16, 25],
}
df = pd.DataFrame(data=d, index=[101, 109, 102, 125, 110])
print(df)
print("#############")
# NOTE(review): "passed" is tested with > 16 and "failed" with < 16, so a mark
# of exactly 16 counts as neither - confirm the intended pass mark.
print("Select all students who have passed in SQL")
print(df[df["SQL"] > 16])
print("#############")
print("Select all students who have passed in SQL and Python")
# Combine conditions with & and wrap each one in parentheses.
# Python's `and` raises an error here:
#   df[(df['SQL'] > 16) and (df['Python'] > 16)]
print(df[(df["SQL"] > 16) & (df["Python"] > 16)])
print("#############")
print("Select all students who have failed in SQL or AA")
# Likewise use | instead of `or`, which raises an error:
#   df[(df['SQL'] < 16) or (df['AA'] < 16)]
print(df[(df["SQL"] < 16) | (df["AA"] < 16)])

     SQL  Python  AA
101   10      34  30
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
Select all students who have passed in SQL
     SQL  Python  AA
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
Select all students who have passed in SQL and Python
     SQL  Python  AA
109   20      23  28
102   30      36  32
125   35      25  16
#############
Select all students who have failed in SQL or AA
     SQL  Python  AA
101   10      34  30


## Using loc

In [None]:
# Same marks table as in the boolean-indexing practice; here the row filters
# go through .loc instead of plain df[...].
d = {
    'SQL': [10, 20, 30, 35, 25],
    'Python': [34, 23, 36, 25, 16],
    'AA': [30, 28, 32, 16, 25],
}
df = pd.DataFrame(d, index=[101, 109, 102, 125, 110])
print(df)

print("#############")
print("Select all students who have passed in SQL")
passed_sql = df['SQL'] > 16
print(df.loc[passed_sql])

print("#############")
print("Select all students who have passed in SQL and Python")
# The keyword `and` cannot combine two boolean Series (it errors); use `&`.
passed_python = df['Python'] > 16
print(df.loc[passed_sql & passed_python])

print("#############")
print("Select all students who have failed in SQL or AA")
# The keyword `or` cannot combine two boolean Series either (it errors); use `|`.
failed_sql = df['SQL'] < 16
failed_aa = df['AA'] < 16
print(df.loc[failed_sql | failed_aa])

     SQL  Python  AA
101   10      34  30
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
Select all students who have passed in SQL
     SQL  Python  AA
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
Select all students who have passed in SQL and Python
     SQL  Python  AA
109   20      23  28
102   30      36  32
125   35      25  16
#############
Select all students who have failed in SQL or AA
     SQL  Python  AA
101   10      34  30


In [None]:
# Marks table: index = roll number, columns = subjects.
d = {
    'SQL': [10, 20, 30, 35, 25],
    'Python': [34, 23, 36, 25, 16],
    'AA': [30, 28, 32, 16, 25],
}
df = pd.DataFrame(d, index=[101, 109, 102, 125, 110])
print(df)
print("#############")
print("Select all students who have failed in SQL or AA, display ONLY SQL and AA marks")

# .loc[row_mask, column_labels]: the mask picks the rows, the list picks the columns.
failed_sql_or_aa = df['SQL'].lt(16) | df['AA'].lt(16)
shown_subjects = ['SQL', 'AA']
print(df.loc[failed_sql_or_aa, shown_subjects])

     SQL  Python  AA
101   10      34  30
109   20      23  28
102   30      36  32
125   35      25  16
110   25      16  25
#############
Select all students who have failed in SQL or AA, display ONLY SQL and AA marks
     SQL  AA
101   10  30


# Filtering using multiple values

In [None]:
from numpy.random import default_rng

# Fixed seed: without it every Restart & Run All prints different numbers,
# so the values shown under this cell could never be reproduced.
rng = default_rng(seed=42)

# 6 x 4 frame of standard-normal samples with columns A-D.
# `dates` is not defined in this cell; it must already exist (with 6 entries)
# from an earlier cell.
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

print(df)
print("############")

# Add a string column to filter on.
df["E"] = ["one", "one", "two",
           "three", "four", "three",
           ]
print(df)
print("############")

# isin() gives a boolean mask that is True where E equals any listed value;
# indexing with the mask keeps only those rows.
print(df[df["E"].isin(["two", "four"])])

                   A         B         C         D
2023-03-15  2.006785  0.794160 -0.076253 -0.896186
2023-03-16  1.215664  1.944675 -0.485328 -0.057103
2023-03-17  1.065154 -0.407460 -0.177700 -0.717367
2023-03-18  1.161736  2.373098 -1.681781 -0.797907
2023-03-19  0.794486  2.381712  0.351875  0.771906
2023-03-20 -2.083229  1.117912 -0.959905  0.128406
############
                   A         B         C         D      E
2023-03-15  2.006785  0.794160 -0.076253 -0.896186    one
2023-03-16  1.215664  1.944675 -0.485328 -0.057103    one
2023-03-17  1.065154 -0.407460 -0.177700 -0.717367    two
2023-03-18  1.161736  2.373098 -1.681781 -0.797907  three
2023-03-19  0.794486  2.381712  0.351875  0.771906   four
2023-03-20 -2.083229  1.117912 -0.959905  0.128406  three
############
                   A         B         C         D     E
2023-03-17  1.065154 -0.407460 -0.177700 -0.717367   two
2023-03-19  0.794486  2.381712  0.351875  0.771906  four


In [None]:
# Boolean mask on its own: True on the rows whose "E" value is one of `wanted`.
wanted = ["two", "four"]
df["E"].isin(wanted)

2023-03-15    False
2023-03-16    False
2023-03-17     True
2023-03-18    False
2023-03-19     True
2023-03-20    False
Freq: D, Name: E, dtype: bool

In [None]:
# Pitfall demo: `in` on a Series checks the index labels, not the stored values,
# so it is not a substitute for .isin(). With the tuple below the lookup can
# raise InvalidIndexError. The error is caught and printed so that the
# demonstration does not abort Restart & Run All.
try:
    print(("two", "four") in df["E"])
except pd.errors.InvalidIndexError as err:
    print(f"{type(err).__name__}: {err}")

InvalidIndexError: ('two', 'four')

# Update single cell

### Using label -> date and column "A"

In [None]:
from numpy.random import default_rng

# Fixed seed: without it every Restart & Run All prints different numbers,
# so the values shown under this cell could never be reproduced.
rng = default_rng(seed=42)

# 6 x 4 frame of standard-normal samples with columns A-D.
# `dates` is not defined in this cell; it must already exist (with 6 entries)
# from an earlier cell.
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

print(df)
print("############")

# .at updates one cell addressed by labels: (row label, column label).
# NOTE(review): assumes '2023-03-15' is one of the labels in `dates` - confirm.
df.at['2023-03-15', "A"] = 0

print(df)

# .iat updates one cell addressed by position: first row, third column.
df.iat[0, 2] = 0

print(df)

# Overwrite the whole column "D": first with a NumPy array, then with a plain
# list holding the same values (both forms assign one value per row).
df.loc[:, "D"] = np.array([5] * len(df))
df.loc[:, "D"] = [5] * len(df)
print(df)

# Update using condition: work on a copy so df itself stays unchanged,
# add a constant column E, then replace every cell that is <= 0 by its negation.
df2 = df.copy()
df2['E']=10
df2[df2 <= 0] = -df2

print(df2)



                   A         B         C         D
2023-03-15 -0.939359 -1.464055 -0.307643  0.501447
2023-03-16 -0.141483 -0.516536  0.322430 -1.801717
2023-03-17 -0.238925  0.425652 -1.622039  0.164146
2023-03-18 -1.608147 -0.861482 -0.151784 -1.444840
2023-03-19  0.997751  0.275147  0.203555 -1.555191
2023-03-20  0.700680  0.837179 -0.186357 -0.109503
############
                   A         B         C         D
2023-03-15  0.000000 -1.464055 -0.307643  0.501447
2023-03-16 -0.141483 -0.516536  0.322430 -1.801717
2023-03-17 -0.238925  0.425652 -1.622039  0.164146
2023-03-18 -1.608147 -0.861482 -0.151784 -1.444840
2023-03-19  0.997751  0.275147  0.203555 -1.555191
2023-03-20  0.700680  0.837179 -0.186357 -0.109503
                   A         B         C         D
2023-03-15  0.000000 -1.464055  0.000000  0.501447
2023-03-16 -0.141483 -0.516536  0.322430 -1.801717
2023-03-17 -0.238925  0.425652 -1.622039  0.164146
2023-03-18 -1.608147 -0.861482 -0.151784 -1.444840
2023-03-19  0.9977

# Mean, Median and Mode of all columns



### When all columns are numbers (continuous)

In [None]:
from numpy.random import default_rng

# Fixed seed: without it every Restart & Run All prints different numbers,
# so the statistics shown under this cell could never be reproduced.
rng = default_rng(seed=42)

# 6 x 4 frame of standard-normal samples with columns A-D.
# `dates` is not defined in this cell; it must already exist (with 6 entries)
# from an earlier cell.
df = pd.DataFrame(rng.standard_normal((6, 4)),
                  index=dates,
                  columns=list("ABCD"))

print(df)
print("############")
print("Mean")
# One mean per column; a single column works the same way: df['A'].mean()
print(df.mean())
print("############")
print("Median")
# One median per column; a single column works the same way: df['B'].median()
print(df.median())

                   A         B         C         D
2023-03-15  0.936454 -0.522897 -0.662565  0.491822
2023-03-16  0.201702 -0.144814  1.171709 -0.465899
2023-03-17 -1.976228 -0.442338  2.584648 -1.740182
2023-03-18 -0.381005  0.611932  0.944455  1.856276
2023-03-19  0.460962  0.717926 -3.191161 -0.123803
2023-03-20  0.074121  0.363126 -1.365758  2.240260
############
Mean
A   -0.113999
B    0.097156
C   -0.086445
D    0.376412
dtype: float64
############
Median
A    0.137911
B    0.109156
C    0.140945
D    0.184010
dtype: float64


### When there is a mix of categorical (string) and continuous (number) columns

In [None]:
# Two categorical columns (cat1 holds strings, cat2 holds integer codes)
# followed by two numeric columns.
df1 = pd.DataFrame({
    'cat1': ['A', 'B', 'B', 'B', 'C', 'D'],
    'cat2': [1, 1, 2, 2, 3, 4],
    'rno': [1, 2, 3, 4, 5, 6],
    'marks': [23, 34, 39, 16, 10, 25],
})
print(df1)

# Mean and median are taken over the last two columns only (rno, marks),
# leaving the categorical columns out.
numeric_part = df1[['rno', 'marks']]
print("############")
print("Mean")
print(numeric_part.mean())
print("############")
print("Median")
print(numeric_part.median())

# mode() is computed for every column; a column can have several modes, and
# columns with fewer modes than the longest one are padded with NaN.
print("############")
print("Mode :: May return multiple values")
print(df1.mode())

  cat1  cat2  rno  marks
0    A     1    1     23
1    B     1    2     34
2    B     2    3     39
3    B     2    4     16
4    C     3    5     10
5    D     4    6     25
############
Mean
rno       3.5
marks    24.5
dtype: float64
############
Median
rno       3.5
marks    24.0
dtype: float64
############
Mode :: May return multiple values
  cat1  cat2  rno  marks
0    B   1.0    1     10
1  NaN   2.0    2     16
2  NaN   NaN    3     23
3  NaN   NaN    4     25
4  NaN   NaN    5     34
5  NaN   NaN    6     39
