In [3]:
import pandas as pd
import numpy as np

In [4]:
# Build the demo frame used by the rest of the notebook: six consecutive daily
# dates starting 2023-07-19 as the index, and four columns (A-D) of
# standard-normal noise.
# The generator is seeded so "Restart Kernel -> Run All" reproduces the same
# numbers every time; change SEED for a different (still repeatable) frame.
SEED = 42
rng = np.random.default_rng(SEED)
dates = pd.date_range("20230719", periods=6)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list("ABCD"))

In [49]:
# Write the whole DataFrame to stdout in its plain-text form
print(str(df))

                   A         B         C         D
2023-07-19 -0.419142 -0.638655  0.375973  0.088315
2023-07-20  0.895699  1.011468  0.568292 -0.209315
2023-07-21  0.931920  2.307566 -1.588165  2.652617
2023-07-22 -0.345990 -0.452882  0.145852  1.154145
2023-07-23  0.307311 -0.343896 -0.147379 -1.260732
2023-07-24  0.219200  0.024178  1.063994 -0.025955


In [50]:
# Per-column summary statistics of the numeric data:
# count, mean, std, min, the 25%/50%/75% quantiles and max.
summary_stats = df.describe()
print(summary_stats)

              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean   0.264833  0.317963  0.069761  0.399846
std    0.581001  1.137818  0.909115  1.345429
min   -0.419142 -0.638655 -1.588165 -1.260732
25%   -0.204693 -0.425635 -0.074071 -0.163475
50%    0.263255 -0.159859  0.260912  0.031180
75%    0.748602  0.764645  0.520212  0.887688
max    0.931920  2.307566  1.063994  2.652617


In [51]:
# Show a concise description of the frame: index range, column names,
# non-null counts, dtypes and memory usage.
# info() writes the report to stdout itself and returns None, so it must not
# be wrapped in print() -- doing so appends a stray "None" line to the output.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2023-07-19 to 2023-07-24
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 240.0 bytes
None


In [53]:
# Preview the first n rows of the frame (here n = 2)
leading_rows = df.head(n=2)
print(leading_rows)

                   A         B         C         D
2023-07-19 -0.419142 -0.638655  0.375973  0.088315
2023-07-20  0.895699  1.011468  0.568292 -0.209315


In [55]:
# Preview the final n rows of the frame (here n = 2)
trailing_rows = df.tail(n=2)
print(trailing_rows)

                   A         B         C         D
2023-07-23  0.307311 -0.343896 -0.147379 -1.260732
2023-07-24  0.219200  0.024178  1.063994 -0.025955


In [62]:
# Label-based access.
# .loc with label slices: all rows from 2023-07-19 onward, columns A through C
# (label slices include both end points).
rows_by_label = df.loc["2023-07-19":, "A":"C"]
print(rows_by_label)
# .at fetches exactly one scalar, addressed by row label and column label.
value_by_label = df.at["2023-07-19", "C"]
print(value_by_label)

                   A         B         C
2023-07-19 -0.419142 -0.638655  0.375973
2023-07-20  0.895699  1.011468  0.568292
2023-07-21  0.931920  2.307566 -1.588165
2023-07-22 -0.345990 -0.452882  0.145852
2023-07-23  0.307311 -0.343896 -0.147379
2023-07-24  0.219200  0.024178  1.063994
0.3759727878182524


In [64]:
# Position-based access.
# .iloc with integer slices: rows from position 1 onward, columns at
# positions 1 and 2 (the stop position 3 is excluded).
rows_by_position = df.iloc[1:, 1:3]
print(rows_by_position)
# .iat fetches exactly one scalar, addressed by row and column position.
value_by_position = df.iat[1, 2]
print(value_by_position)

                   B         C
2023-07-20  1.011468  0.568292
2023-07-21  2.307566 -1.588165
2023-07-22 -0.452882  0.145852
2023-07-23 -0.343896 -0.147379
2023-07-24  0.024178  1.063994
0.5682918679092918


In [6]:
# Boolean indexing: build a True/False mask from a column comparison and use
# it to keep only the rows where column A is strictly positive.
a_is_positive = df["A"] > 0
print(df[a_is_positive])

                   A         B         C         D
2023-07-22  1.298161  0.962459 -0.243521  1.361970
2023-07-23  0.127516  0.419395 -0.234849 -0.518253


In [12]:
# Replace missing values with a filler value.
# A NaN is first planted at row position 0 / column position 0 so there is
# something to fill. The filled frame is the cell's last expression, so it is
# displayed but not assigned to any name.
filler = 4.5
df.iat[0, 0] = np.nan
df.fillna(value=filler)

Unnamed: 0,A,B,C,D
2023-07-19,4.5,0.490573,-0.277719,1.604353
2023-07-20,-1.298206,1.159571,0.837443,0.39705
2023-07-21,-0.687767,-0.721275,-0.628443,-0.041859
2023-07-22,1.298161,0.962459,-0.243521,1.36197
2023-07-23,0.127516,0.419395,-0.234849,-0.518253
2023-07-24,-0.006577,-0.042533,-1.818383,0.345533


In [18]:
# Drop every row that contains at least one missing value (how="any").
# A NaN is first written into column A of the 2023-07-19 row so there is
# something to drop.
df.at["2023-07-19", "A"] = np.nan
# Rebind the name rather than using inplace=True: the resulting frame is the
# same, but the statement reads as an ordinary expression and can be chained.
df = df.dropna(how="any")
print(df)

                   A         B         C         D
2023-07-20 -1.298206  1.159571  0.837443  0.397050
2023-07-21 -0.687767 -0.721275 -0.628443 -0.041859
2023-07-22  1.298161  0.962459 -0.243521  1.361970
2023-07-23  0.127516  0.419395 -0.234849 -0.518253
2023-07-24 -0.006577 -0.042533 -1.818383  0.345533


In [20]:
# Rows holding the extreme values of a column.
# The 2 rows with the largest C; keep="all" also returns any rows tied with
# the last selected value.
largest_by_c = df.nlargest(n=2, columns="C", keep="all")
print(largest_by_c)
# The 2 rows with the smallest A; keep="first" keeps the first occurrence
# when values are tied.
smallest_by_a = df.nsmallest(n=2, columns="A", keep="first")
print(smallest_by_a)

                   A         B         C         D
2023-07-20 -1.298206  1.159571  0.837443  0.397050
2023-07-23  0.127516  0.419395 -0.234849 -0.518253
                   A         B         C         D
2023-07-20 -1.298206  1.159571  0.837443  0.397050
2023-07-21 -0.687767 -0.721275 -0.628443 -0.041859


In [21]:
# Draw random rows two ways: as a fraction of the frame (20%) and as an
# absolute number of rows (2).
# random_state pins the draw so the rows shown are the same on every re-run;
# remove it (or change the value) to get a different sample.
print(df.sample(frac=0.2, random_state=42))
print(df.sample(n=2, random_state=42))

                   A         B         C         D
2023-07-24 -0.006577 -0.042533 -1.818383  0.345533
                   A         B         C         D
2023-07-23  0.127516  0.419395 -0.234849 -0.518253
2023-07-21 -0.687767 -0.721275 -0.628443 -0.041859


In [23]:
# Select several columns at once by passing a list of column labels
wanted_columns = ["A", "B"]
print(df[wanted_columns])

                   A         B
2023-07-20 -1.298206  1.159571
2023-07-21 -0.687767 -0.721275
2023-07-22  1.298161  0.962459
2023-07-23  0.127516  0.419395
2023-07-24 -0.006577 -0.042533


In [29]:
# Method chaining: filter to the rows where A is positive, then call .isna()
# on that result to get a True/False flag for every remaining cell.
positive_rows_na_flags = (
    df[df["A"] > 0]
    .isna()
)
print(positive_rows_na_flags)

                A      B      C      D
2023-07-22  False  False  False  False
2023-07-23  False  False  False  False


In [34]:
# Drop duplicate rows.
# Stack the frame on top of itself so every row appears twice, then keep only
# the last occurrence of each repeated row.
dupe_df = pd.concat([df] * 2)
deduplicated = dupe_df.drop_duplicates(keep="last")
print(deduplicated)

                   A         B         C         D
2023-07-20 -1.298206  1.159571  0.837443  0.397050
2023-07-21 -0.687767 -0.721275 -0.628443 -0.041859
2023-07-22  1.298161  0.962459 -0.243521  1.361970
2023-07-23  0.127516  0.419395 -0.234849 -0.518253
2023-07-24 -0.006577 -0.042533 -1.818383  0.345533
