In [2]:
import pandas as pd
import numpy as np

In [3]:
dates = pd.date_range("20230719",periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates,columns=list("ABCD"))

In [49]:
# Print the dataframe
print(df)

                   A         B         C         D
2023-07-19 -0.419142 -0.638655  0.375973  0.088315
2023-07-20  0.895699  1.011468  0.568292 -0.209315
2023-07-21  0.931920  2.307566 -1.588165  2.652617
2023-07-22 -0.345990 -0.452882  0.145852  1.154145
2023-07-23  0.307311 -0.343896 -0.147379 -1.260732
2023-07-24  0.219200  0.024178  1.063994 -0.025955


In [50]:
# Describe the dataframe - shows count, mean, std, min, max values for numerical data
print(df.describe())

              A         B         C         D
count  6.000000  6.000000  6.000000  6.000000
mean   0.264833  0.317963  0.069761  0.399846
std    0.581001  1.137818  0.909115  1.345429
min   -0.419142 -0.638655 -1.588165 -1.260732
25%   -0.204693 -0.425635 -0.074071 -0.163475
50%    0.263255 -0.159859  0.260912  0.031180
75%    0.748602  0.764645  0.520212  0.887688
max    0.931920  2.307566  1.063994  2.652617


In [51]:
# Displays information about the dataframe - index, columns, their datatypes
print(df.info())

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 6 entries, 2023-07-19 to 2023-07-24
Freq: D
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       6 non-null      float64
 1   B       6 non-null      float64
 2   C       6 non-null      float64
 3   D       6 non-null      float64
dtypes: float64(4)
memory usage: 240.0 bytes
None


In [53]:
# Display the top n rows of the dataframe
print(df.head(2))

                   A         B         C         D
2023-07-19 -0.419142 -0.638655  0.375973  0.088315
2023-07-20  0.895699  1.011468  0.568292 -0.209315


In [55]:
# Display the last n rows of the dataframe
print(df.tail(2))

                   A         B         C         D
2023-07-23  0.307311 -0.343896 -0.147379 -1.260732
2023-07-24  0.219200  0.024178  1.063994 -0.025955


In [43]:
# Access dataframe by label - single value and slice
print(df.loc["2023-07-19":, "A":"C"])
print(df.at["2023-07-19", "C"])
df.get(["A","B"])

                   A         B         C
2023-07-19  0.682827 -0.730617 -0.476857
2023-07-20 -0.372170 -0.137440  0.322904
2023-07-21  0.407933 -1.075668 -0.689897
2023-07-22  0.326363 -0.388607 -1.386280
2023-07-23 -0.418049  0.791698 -1.570367
2023-07-24 -0.719027 -0.247697  1.421375
-0.4768571129841587


Unnamed: 0,A,B
2023-07-19,0.682827,-0.730617
2023-07-20,-0.37217,-0.13744
2023-07-21,0.407933,-1.075668
2023-07-22,0.326363,-0.388607
2023-07-23,-0.418049,0.791698
2023-07-24,-0.719027,-0.247697


In [64]:
# Access dataframe by location - single value and slice
print(df.iloc[1:,1:3])
print(df.iat[1,2])

                   B         C
2023-07-20  1.011468  0.568292
2023-07-21  2.307566 -1.588165
2023-07-22 -0.452882  0.145852
2023-07-23 -0.343896 -0.147379
2023-07-24  0.024178  1.063994
0.5682918679092918


In [6]:
# Boolean indexing
print(df[df["A"] > 0])

                   A         B         C         D
2023-07-22  1.298161  0.962459 -0.243521  1.361970
2023-07-23  0.127516  0.419395 -0.234849 -0.518253


In [12]:
# Fill NaN values with a filler
df.iat[0,0] = np.nan
df.fillna(value=4.5)

Unnamed: 0,A,B,C,D
2023-07-19,4.5,0.490573,-0.277719,1.604353
2023-07-20,-1.298206,1.159571,0.837443,0.39705
2023-07-21,-0.687767,-0.721275,-0.628443,-0.041859
2023-07-22,1.298161,0.962459,-0.243521,1.36197
2023-07-23,0.127516,0.419395,-0.234849,-0.518253
2023-07-24,-0.006577,-0.042533,-1.818383,0.345533


In [18]:
# Drop the rows consisting of empty rows
df.at["2023-07-19", "A"] = np.nan
df.dropna(how="any", inplace=True)
print(df)

                   A         B         C         D
2023-07-20 -1.298206  1.159571  0.837443  0.397050
2023-07-21 -0.687767 -0.721275 -0.628443 -0.041859
2023-07-22  1.298161  0.962459 -0.243521  1.361970
2023-07-23  0.127516  0.419395 -0.234849 -0.518253
2023-07-24 -0.006577 -0.042533 -1.818383  0.345533


In [20]:
# N largest and smallest rows
print(df.nlargest(2, 'C', keep="all"))
print(df.nsmallest(2, 'A', keep="first"))

                   A         B         C         D
2023-07-20 -1.298206  1.159571  0.837443  0.397050
2023-07-23  0.127516  0.419395 -0.234849 -0.518253
                   A         B         C         D
2023-07-20 -1.298206  1.159571  0.837443  0.397050
2023-07-21 -0.687767 -0.721275 -0.628443 -0.041859


In [21]:
# Get a sample of the data
print(df.sample(frac=0.2))
print(df.sample(n=2))

                   A         B         C         D
2023-07-24 -0.006577 -0.042533 -1.818383  0.345533
                   A         B         C         D
2023-07-23  0.127516  0.419395 -0.234849 -0.518253
2023-07-21 -0.687767 -0.721275 -0.628443 -0.041859


In [23]:
# Select multiple columns
print(df[["A","B"]])

                   A         B
2023-07-20 -1.298206  1.159571
2023-07-21 -0.687767 -0.721275
2023-07-22  1.298161  0.962459
2023-07-23  0.127516  0.419395
2023-07-24 -0.006577 -0.042533


In [29]:
# Method chaining -
print(df[df["A"] >0].isna())

                A      B      C      D
2023-07-22  False  False  False  False
2023-07-23  False  False  False  False


In [34]:
# Drop duplicates
dupe_df = pd.concat([df, df])
print(dupe_df.drop_duplicates(keep="last"))

                   A         B         C         D
2023-07-20 -1.298206  1.159571  0.837443  0.397050
2023-07-21 -0.687767 -0.721275 -0.628443 -0.041859
2023-07-22  1.298161  0.962459 -0.243521  1.361970
2023-07-23  0.127516  0.419395 -0.234849 -0.518253
2023-07-24 -0.006577 -0.042533 -1.818383  0.345533


In [8]:
# Sorting values
print(df.sort_values(['A', 'B'], ascending=False))

                   A         B         C         D
2023-07-21  1.084578  0.703158  0.532485  0.620870
2023-07-22  0.126723  0.429683 -0.592764 -0.748332
2023-07-20  0.093755 -0.488334 -0.740393  0.449058
2023-07-23 -0.773718 -1.172355  0.153394  0.126488
2023-07-24 -1.409514 -0.233296  0.811088  1.180060
2023-07-19 -2.282732  0.009799  0.922575  0.775262


In [28]:
# Rename columns
df1 = df.copy(deep=True)
df1.rename(columns={"A":"a"})

Unnamed: 0,a,B,C,D
2023-07-19,0.682827,-0.730617,-0.476857,-0.865903
2023-07-20,-0.37217,-0.13744,0.322904,-0.244547
2023-07-21,0.407933,-1.075668,-0.689897,1.451286
2023-07-22,0.326363,-0.388607,-1.38628,0.327189
2023-07-23,-0.418049,0.791698,-1.570367,0.620792
2023-07-24,-0.719027,-0.247697,1.421375,-0.389758


In [29]:
# Sort, reset index
df1.sort_index(ascending=False)
df1 = df1.reset_index()

In [27]:
# Drop columns
df1.drop(columns=["A"])

Unnamed: 0,B,C,D
2023-07-19,0.009799,0.922575,0.775262
2023-07-20,-0.488334,-0.740393,0.449058
2023-07-21,0.703158,0.532485,0.62087
2023-07-22,0.429683,-0.592764,-0.748332
2023-07-23,-1.172355,0.153394,0.126488
2023-07-24,-0.233296,0.811088,1.18006


In [30]:
# Filter values based on conditions
df.loc[(df["D"] > 0.5) & (df["A"] < 0)]
df.loc[(df["D"] < 0) | (df["A"] > 0)]
df1.loc[df1["B"] < 0]

Unnamed: 0,index,A,B,C,D
0,2023-07-19,0.682827,-0.730617,-0.476857,-0.865903
1,2023-07-20,-0.37217,-0.13744,0.322904,-0.244547
2,2023-07-21,0.407933,-1.075668,-0.689897,1.451286
3,2023-07-22,0.326363,-0.388607,-1.38628,0.327189
5,2023-07-24,-0.719027,-0.247697,1.421375,-0.389758


In [56]:
# Summarize data
df.value_counts()
df["D"].value_counts()
df["D"].nunique()
df["D"].min()
df["A"].max()
df
# std(), mean(), var(), count()

Unnamed: 0,A,B,C,D
2023-07-19,-2.282732,0.009799,0.922575,0.775262
2023-07-20,0.093755,-0.488334,-0.740393,0.449058
2023-07-21,1.084578,0.703158,0.532485,0.62087
2023-07-22,0.126723,0.429683,-0.592764,-0.748332
2023-07-23,-0.773718,-1.172355,0.153394,0.126488
2023-07-24,-1.409514,-0.233296,0.811088,1.18006


In [74]:
# Combine data sets
df1
df2 = pd.DataFrame(np.random.randn(6,4), columns=list("ABCD"))
df3 = pd.merge(df1, df2, how="outer"s)
print(df3)

           A         B         C         D
0  -2.282732  0.009799  0.922575  0.775262
1   0.093755 -0.488334 -0.740393  0.449058
2   1.084578  0.703158  0.532485  0.620870
3   0.126723  0.429683 -0.592764 -0.748332
4  -0.773718 -1.172355  0.153394  0.126488
5  -1.409514 -0.233296  0.811088  1.180060
6   1.257549  0.331895 -1.307253  0.784835
7  -0.499292  0.833811  0.385207  0.222099
8   1.568828 -0.386995 -0.538087 -1.062356
9   0.128885 -1.465330  0.987263  1.700842
10 -1.424596  1.072531 -0.047494  0.066366
11 -2.327641 -0.581432 -0.856612  0.285559


In [21]:
# Apply functions
df.apply(lambda x:x**2)

Unnamed: 0,A,B,C,D
2023-07-19,0.466252,0.533801,0.227393,0.749788
2023-07-20,0.138511,0.01889,0.104267,0.059803
2023-07-21,0.166409,1.157062,0.475958,2.106232
2023-07-22,0.106513,0.151016,1.921772,0.107053
2023-07-23,0.174765,0.626786,2.466051,0.385382
2023-07-24,0.517,0.061354,2.020305,0.151912


In [23]:
df.applymap(lambda x:x**2)

Unnamed: 0,A,B,C,D
2023-07-19,0.466252,0.533801,0.227393,0.749788
2023-07-20,0.138511,0.01889,0.104267,0.059803
2023-07-21,0.166409,1.157062,0.475958,2.106232
2023-07-22,0.106513,0.151016,1.921772,0.107053
2023-07-23,0.174765,0.626786,2.466051,0.385382
2023-07-24,0.517,0.061354,2.020305,0.151912


In [35]:
# Iteration methods
for name, series in df.iterrows():
    print("Row: {}".format(name) + "\n"+ "Series: {}".format(series))

Row: 2023-07-19 00:00:00
Series: A    0.682827
B   -0.730617
C   -0.476857
D   -0.865903
Name: 2023-07-19 00:00:00, dtype: float64
Row: 2023-07-20 00:00:00
Series: A   -0.372170
B   -0.137440
C    0.322904
D   -0.244547
Name: 2023-07-20 00:00:00, dtype: float64
Row: 2023-07-21 00:00:00
Series: A    0.407933
B   -1.075668
C   -0.689897
D    1.451286
Name: 2023-07-21 00:00:00, dtype: float64
Row: 2023-07-22 00:00:00
Series: A    0.326363
B   -0.388607
C   -1.386280
D    0.327189
Name: 2023-07-22 00:00:00, dtype: float64
Row: 2023-07-23 00:00:00
Series: A   -0.418049
B    0.791698
C   -1.570367
D    0.620792
Name: 2023-07-23 00:00:00, dtype: float64
Row: 2023-07-24 00:00:00
Series: A   -0.719027
B   -0.247697
C    1.421375
D   -0.389758
Name: 2023-07-24 00:00:00, dtype: float64


In [47]:
# Maths methods
df["A"].min()
df.min()
df["A"].max()
df.max()

A    0.682827
B    0.791698
C    1.421375
D    1.451286
dtype: float64