# Pandas

In [2]:
import numpy as np
import pandas as pd
# Show the installed NumPy version as the cell's output.
np.__version__


'1.21.2'

# Object creation

In [3]:
# A Series built from a plain list; the NaN entry forces a float64 dtype and
# pandas supplies a default 0..5 integer index.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# Six consecutive calendar days starting 2013-01-01 (daily frequency).
dates = pd.date_range(start="20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# 6x5 frame of standard-normal draws: one row per date, columns labelled
# A, B, D, C, X in that (non-alphabetical) order. No seed is set before this
# cell, so the values differ on every fresh run.
df = pd.DataFrame(
    data=np.random.standard_normal(size=(6, 5)),
    index=dates,
    columns=["A", "B", "D", "C", "X"],
)
df

Unnamed: 0,A,B,D,C,X
2013-01-01,-0.021136,0.474194,-1.734119,-0.435538,0.69053
2013-01-02,0.734362,0.647168,-0.593344,-0.597154,1.038125
2013-01-03,-1.171436,1.515366,-0.609979,0.118545,-0.53614
2013-01-04,0.008304,-0.416033,-1.171841,2.604029,0.990748
2013-01-05,-1.45208,0.108043,0.142033,-0.773973,-0.148254
2013-01-06,0.294327,1.852148,0.313757,-0.571151,0.643473


In [6]:
# Scratch draw of a 6x4 standard-normal array; the result is only displayed,
# not stored in a variable.
np.random.randn(6,4)

array([[-1.4831061 , -0.05917427,  0.05660528, -1.91197168],
       [ 0.29200507, -0.38247488,  1.28647817, -0.53702362],
       [ 1.48020399,  0.68177857, -0.40842209,  0.94795461],
       [-0.67886302,  0.79578697,  1.31842134, -1.024001  ],
       [ 0.25407842, -0.69201965, -0.72099346,  2.11405404],
       [ 1.14080231, -0.13236516, -0.87011317,  0.11529735]])

In [7]:
# Build a frame from a mapping of column name -> value. Scalars ("A", "B", "F")
# are broadcast to the length of the array-like columns (4 rows), and every
# column keeps its own dtype.
df2 = pd.DataFrame(
    dict(
        A=1.0,
        B=pd.Timestamp("20130102"),
        C=pd.Series(1, index=list(range(4)), dtype="float32"),
        D=np.array([3] * 4, dtype="int32"),
        E=pd.Categorical(["test", "train", "test", "train"]),
        F="foo",
    )
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [8]:
# Per-column dtypes: each column of df2 has a different one.
df2.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

# Viewing data

In [9]:
# First five rows (the default row count, spelled out).
df.head(n=5)

Unnamed: 0,A,B,D,C,X
2013-01-01,-0.021136,0.474194,-1.734119,-0.435538,0.69053
2013-01-02,0.734362,0.647168,-0.593344,-0.597154,1.038125
2013-01-03,-1.171436,1.515366,-0.609979,0.118545,-0.53614
2013-01-04,0.008304,-0.416033,-1.171841,2.604029,0.990748
2013-01-05,-1.45208,0.108043,0.142033,-0.773973,-0.148254


In [10]:
# Last four rows.
df.tail(n=4)

Unnamed: 0,A,B,D,C,X
2013-01-03,-1.171436,1.515366,-0.609979,0.118545,-0.53614
2013-01-04,0.008304,-0.416033,-1.171841,2.604029,0.990748
2013-01-05,-1.45208,0.108043,0.142033,-0.773973,-0.148254
2013-01-06,0.294327,1.852148,0.313757,-0.571151,0.643473


In [11]:
# Row labels: the DatetimeIndex the frame was constructed with.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [12]:
# Column labels, in construction order (A, B, D, C, X).
df.columns

Index(['A', 'B', 'D', 'C', 'X'], dtype='object')

In [13]:
# Underlying values as a 2-D NumPy array; index and column labels are dropped.
df.to_numpy()

array([[-0.02113627,  0.47419397, -1.73411872, -0.43553761,  0.69052973],
       [ 0.73436247,  0.64716839, -0.59334352, -0.59715386,  1.0381247 ],
       [-1.17143602,  1.51536588, -0.60997875,  0.118545  , -0.5361404 ],
       [ 0.00830445, -0.41603319, -1.17184114,  2.60402927,  0.99074828],
       [-1.45208035,  0.10804344,  0.1420329 , -0.77397265, -0.14825438],
       [ 0.29432745,  1.85214782,  0.31375664, -0.57115065,  0.64347273]])

In [14]:
# df2 mixes floats, timestamps, integers and strings, so the result is a
# single object-dtype array holding the original Python/pandas objects.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [15]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,D,C,X
count,6.0,6.0,6.0,6.0,6.0
mean,-0.267943,0.696814,-0.608915,0.05746,0.446413
std,0.857493,0.853455,0.774398,1.28427,0.642478
min,-1.45208,-0.416033,-1.734119,-0.773973,-0.53614
25%,-0.883861,0.199581,-1.031376,-0.590653,0.049677
50%,-0.006416,0.560681,-0.601661,-0.503344,0.667001
75%,0.222822,1.298317,-0.041811,-0.019976,0.915694
max,0.734362,1.852148,0.313757,2.604029,1.038125


In [18]:
# Re-display the full frame for reference before sorting.
df

Unnamed: 0,A,B,D,C,X
2013-01-01,-0.021136,0.474194,-1.734119,-0.435538,0.69053
2013-01-02,0.734362,0.647168,-0.593344,-0.597154,1.038125
2013-01-03,-1.171436,1.515366,-0.609979,0.118545,-0.53614
2013-01-04,0.008304,-0.416033,-1.171841,2.604029,0.990748
2013-01-05,-1.45208,0.108043,0.142033,-0.773973,-0.148254
2013-01-06,0.294327,1.852148,0.313757,-0.571151,0.643473


In [30]:
# Order the columns by label, descending (X, D, C, B, A); rows are untouched.
df.sort_index(axis="columns", ascending=False)

Unnamed: 0,X,D,C,B,A
2013-01-01,0.69053,-1.734119,-0.435538,0.474194,-0.021136
2013-01-02,1.038125,-0.593344,-0.597154,0.647168,0.734362
2013-01-03,-0.53614,-0.609979,0.118545,1.515366,-1.171436
2013-01-04,0.990748,-1.171841,2.604029,-0.416033,0.008304
2013-01-05,-0.148254,0.142033,-0.773973,0.108043,-1.45208
2013-01-06,0.643473,0.313757,-0.571151,1.852148,0.294327


In [32]:
# Order the rows by the values in column X, largest first.
df.sort_values("X", ascending=False)

Unnamed: 0,A,B,D,C,X
2013-01-02,0.734362,0.647168,-0.593344,-0.597154,1.038125
2013-01-04,0.008304,-0.416033,-1.171841,2.604029,0.990748
2013-01-01,-0.021136,0.474194,-1.734119,-0.435538,0.69053
2013-01-06,0.294327,1.852148,0.313757,-0.571151,0.643473
2013-01-05,-1.45208,0.108043,0.142033,-0.773973,-0.148254
2013-01-03,-1.171436,1.515366,-0.609979,0.118545,-0.53614


# Getting columns and row slices

In [34]:
# Select a single column as a Series (bracket form of attribute access).
df["D"]

2013-01-01   -1.734119
2013-01-02   -0.593344
2013-01-03   -0.609979
2013-01-04   -1.171841
2013-01-05    0.142033
2013-01-06    0.313757
Freq: D, Name: D, dtype: float64

In [35]:
# First two rows by position (end-exclusive integer slice).
df.iloc[0:2]

Unnamed: 0,A,B,D,C,X
2013-01-01,-0.021136,0.474194,-1.734119,-0.435538,0.69053
2013-01-02,0.734362,0.647168,-0.593344,-0.597154,1.038125


In [37]:
# Rows between two date labels; label slices include both endpoints.
df.loc["2013-01-05":"2013-01-06"]

Unnamed: 0,A,B,D,C,X
2013-01-05,-1.45208,0.108043,0.142033,-0.773973,-0.148254
2013-01-06,0.294327,1.852148,0.313757,-0.571151,0.643473


# Selection by label

In [40]:
# The row labelled with the first date, returned as a Series keyed by column.
df.loc[dates[0], :]

A   -0.021136
B    0.474194
D   -1.734119
C   -0.435538
X    0.690530
Name: 2013-01-01 00:00:00, dtype: float64

In [41]:
# Re-display the full frame for reference before the label-based selections.
df

Unnamed: 0,A,B,D,C,X
2013-01-01,-0.021136,0.474194,-1.734119,-0.435538,0.69053
2013-01-02,0.734362,0.647168,-0.593344,-0.597154,1.038125
2013-01-03,-1.171436,1.515366,-0.609979,0.118545,-0.53614
2013-01-04,0.008304,-0.416033,-1.171841,2.604029,0.990748
2013-01-05,-1.45208,0.108043,0.142033,-0.773973,-0.148254
2013-01-06,0.294327,1.852148,0.313757,-0.571151,0.643473


In [43]:
# All rows, restricted to columns D and X (in that order).
df[["D", "X"]]

Unnamed: 0,D,X
2013-01-01,-1.734119,0.69053
2013-01-02,-0.593344,1.038125
2013-01-03,-0.609979,-0.53614
2013-01-04,-1.171841,0.990748
2013-01-05,0.142033,-0.148254
2013-01-06,0.313757,0.643473


In [46]:
# Label slice on rows (both endpoints included) combined with an explicit
# column list; the columns come back in the order requested.
df.loc[
    "2013-01-02":"2013-01-03",
    ["X", "C", "D"],
]

Unnamed: 0,X,C,D
2013-01-02,1.038125,-0.597154,-0.593344
2013-01-03,-0.53614,0.118545,-0.609979


In [47]:
# Scalar lookup: the row is given as a date string, the column as "C".
df.loc["20130103", "C"]

0.11854499761227834

In [48]:
# Scalar access by label through .at (row dates[2], column X).
df.at[dates[2],"X"]

-0.5361404020125728

In [49]:
# Row at integer position 1 (the second row), returned as a Series.
df.iloc[1]

A    0.734362
B    0.647168
D   -0.593344
C   -0.597154
X    1.038125
Name: 2013-01-02 00:00:00, dtype: float64

In [52]:
# Rows at positions 3 and 4, columns from position 2 onward (D, C, X).
df.iloc[3:5, 2:]

Unnamed: 0,D,C,X
2013-01-04,-1.171841,2.604029,0.990748
2013-01-05,0.142033,-0.773973,-0.148254


In [53]:
# Explicit position lists: rows 1, 3, 4 and columns 2 then 0 (D, then A).
df.iloc[[1,3,4],[2,0]]

Unnamed: 0,D,A
2013-01-02,-0.593344,0.734362
2013-01-04,-1.171841,0.008304
2013-01-05,0.142033,-1.45208


In [56]:
# First three rows by position, all columns.
df.iloc[:3]

Unnamed: 0,A,B,D,C,X
2013-01-01,-0.021136,0.474194,-1.734119,-0.435538,0.69053
2013-01-02,0.734362,0.647168,-0.593344,-0.597154,1.038125
2013-01-03,-1.171436,1.515366,-0.609979,0.118545,-0.53614


In [57]:
# All rows, first three columns by position (A, B, D).
df.iloc[:, :3]

Unnamed: 0,A,B,D
2013-01-01,-0.021136,0.474194,-1.734119
2013-01-02,0.734362,0.647168,-0.593344
2013-01-03,-1.171436,1.515366,-0.609979
2013-01-04,0.008304,-0.416033,-1.171841
2013-01-05,-1.45208,0.108043,0.142033
2013-01-06,0.294327,1.852148,0.313757


In [58]:
# Single value at row position 1, column position 1 (column B).
df.iloc[1,1]

0.6471683862843268

In [59]:
# Scalar access by integer position through .iat (first row, first column).
df.iat[0,0]

-0.021136266488595828

# Boolean indexing

In [65]:
# Keep only the rows whose A value is strictly positive.
df.loc[df["A"] > 0]

Unnamed: 0,A,B,D,C,X
2013-01-02,0.734362,0.647168,-0.593344,-0.597154,1.038125
2013-01-04,0.008304,-0.416033,-1.171841,2.604029,0.990748
2013-01-06,0.294327,1.852148,0.313757,-0.571151,0.643473


In [66]:
# Element-wise filter: the shape is preserved, negative values are kept and
# every other cell becomes NaN.
df.where(df < 0)

Unnamed: 0,A,B,D,C,X
2013-01-01,-0.021136,,-1.734119,-0.435538,
2013-01-02,,,-0.593344,-0.597154,
2013-01-03,-1.171436,,-0.609979,,-0.53614
2013-01-04,,-0.416033,-1.171841,,
2013-01-05,-1.45208,,,-0.773973,-0.148254
2013-01-06,,,,-0.571151,


## Using two column values to select data

In [68]:
# Rows where A is positive AND D is negative. Each comparison needs its own
# parentheses because `&` binds more tightly than `>` / `<`.
df.loc[(df["A"] > 0) & (df["D"] < 0)]

Unnamed: 0,A,B,D,C,X
2013-01-02,0.734362,0.647168,-0.593344,-0.597154,1.038125
2013-01-04,0.008304,-0.416033,-1.171841,2.604029,0.990748


In [69]:
# Work on a copy so `df` itself is left untouched, then add a string column E
# with one label per row. Note: this rebinds `df2`, replacing the frame that
# was built from a dict earlier in the notebook.
df2 = df.copy()
df2["E"] = [
    "one",
    "one",
    "two",
    "three",
    "four",
    "three",
]
df2

Unnamed: 0,A,B,D,C,X,E
2013-01-01,-0.021136,0.474194,-1.734119,-0.435538,0.69053,one
2013-01-02,0.734362,0.647168,-0.593344,-0.597154,1.038125,one
2013-01-03,-1.171436,1.515366,-0.609979,0.118545,-0.53614,two
2013-01-04,0.008304,-0.416033,-1.171841,2.604029,0.990748,three
2013-01-05,-1.45208,0.108043,0.142033,-0.773973,-0.148254,four
2013-01-06,0.294327,1.852148,0.313757,-0.571151,0.643473,three


In [70]:
# Keep the rows whose E label is one of the listed values.
df2.loc[df2["E"].isin(["one", "three"])]

Unnamed: 0,A,B,D,C,X,E
2013-01-01,-0.021136,0.474194,-1.734119,-0.435538,0.69053,one
2013-01-02,0.734362,0.647168,-0.593344,-0.597154,1.038125,one
2013-01-04,0.008304,-0.416033,-1.171841,2.604029,0.990748,three
2013-01-06,0.294327,1.852148,0.313757,-0.571151,0.643473,three


# Setting

In [72]:
# s1 is indexed from 2013-01-02, one day later than df's index.
s1 = pd.Series([1, 2, 3, 4, 5, 6], index=pd.date_range("20130102", periods=6))
# Assigning a Series as a new column aligns on the index: 2013-01-01 has no
# match and gets NaN, and s1's value for 2013-01-07 is dropped because df has
# no such row. (A bare `s1` expression used to sit between these two lines; it
# displayed nothing because only a cell's last expression is shown.)
df["F"] = s1

In [80]:
# Set a single value by label: column B on the first date. Mutates df in place.
df.at[dates[0], "B"] = 0

In [84]:
# Set a single value by position. (0, 1) is the first row and column B, i.e.
# the same cell as (dates[0], "B"). Mutates df in place.
df.iat[0,1] = 0
df

Unnamed: 0,A,B,D,C,X,F
2013-01-01,-0.021136,0.0,-1.734119,-0.435538,0.69053,
2013-01-02,0.734362,0.647168,-0.593344,-0.597154,1.038125,1.0
2013-01-03,-1.171436,1.515366,-0.609979,0.118545,-0.53614,2.0
2013-01-04,0.008304,-0.416033,-1.171841,2.604029,0.990748,3.0
2013-01-05,-1.45208,0.108043,0.142033,-0.773973,-0.148254,4.0
2013-01-06,0.294327,1.852148,0.313757,-0.571151,0.643473,5.0


In [85]:
# Column D as a Series, shown before it is overwritten.
df["D"]

2013-01-01   -1.734119
2013-01-02   -0.593344
2013-01-03   -0.609979
2013-01-04   -1.171841
2013-01-05    0.142033
2013-01-06    0.313757
Freq: D, Name: D, dtype: float64

In [92]:
# Replace column D wholesale with an integer array of 5s. Plain column
# assignment swaps in the new array together with its integer dtype on every
# pandas version. `df.loc[:, "D"] = ...` instead writes into the existing
# float64 column on pandas >= 2.0, so D would display as 5.0 rather than 5.
# Mutates df in place.
df["D"] = np.array([5] * len(df))
df

Unnamed: 0,A,B,D,C,X,F
2013-01-01,-0.021136,0.0,5,-0.435538,0.69053,
2013-01-02,0.734362,0.647168,5,-0.597154,1.038125,1.0
2013-01-03,-1.171436,1.515366,5,0.118545,-0.53614,2.0
2013-01-04,0.008304,-0.416033,5,2.604029,0.990748,3.0
2013-01-05,-1.45208,0.108043,5,-0.773973,-0.148254,4.0
2013-01-06,0.294327,1.852148,5,-0.571151,0.643473,5.0


In [95]:
# Work on a copy so df itself is left untouched (this rebinds `df2` again).
df2 = df.copy()
# Wherever a value is positive, overwrite it with its negation; negative,
# zero and NaN cells are left as they are.
df2[df2>0] = -df2
df2

Unnamed: 0,A,B,D,C,X,F
2013-01-01,-0.021136,0.0,-5,-0.435538,-0.69053,
2013-01-02,-0.734362,-0.647168,-5,-0.597154,-1.038125,-1.0
2013-01-03,-1.171436,-1.515366,-5,-0.118545,-0.53614,-2.0
2013-01-04,-0.008304,-0.416033,-5,-2.604029,-0.990748,-3.0
2013-01-05,-1.45208,-0.108043,-5,-0.773973,-0.148254,-4.0
2013-01-06,-0.294327,-1.852148,-5,-0.571151,-0.643473,-5.0


# Missing data

In [104]:
# Reindex to the first four dates and add a column E that df does not have;
# the new column starts out entirely NaN.
df1 = df.reindex(index=dates[:4], columns=[*df.columns, "E"])
# Label slices include both endpoints, so E is set to 1 on the first two dates.
df1.loc[dates[0]:dates[1], "E"] = 1
df1

Unnamed: 0,A,B,D,C,X,F,E
2013-01-01,-0.021136,0.0,5,-0.435538,0.69053,,1.0
2013-01-02,0.734362,0.647168,5,-0.597154,1.038125,1.0,1.0
2013-01-03,-1.171436,1.515366,5,0.118545,-0.53614,2.0,
2013-01-04,0.008304,-0.416033,5,2.604029,0.990748,3.0,


In [106]:
# Drop every row that contains at least one missing value; df1 is not modified.
df1.dropna(axis=0, how="any")

Unnamed: 0,A,B,D,C,X,F,E
2013-01-02,0.734362,0.647168,5,-0.597154,1.038125,1.0,1.0


In [107]:
# Replace every missing value with 5; df1 is not modified.
df1.fillna(5)

Unnamed: 0,A,B,D,C,X,F,E
2013-01-01,-0.021136,0.0,5,-0.435538,0.69053,5.0,1.0
2013-01-02,0.734362,0.647168,5,-0.597154,1.038125,1.0,1.0
2013-01-03,-1.171436,1.515366,5,0.118545,-0.53614,2.0,5.0
2013-01-04,0.008304,-0.416033,5,2.604029,0.990748,3.0,5.0


In [108]:
# Boolean frame of the same shape: True where a value is missing.
df1.isna()

Unnamed: 0,A,B,D,C,X,F,E
2013-01-01,False,False,False,False,False,True,False
2013-01-02,False,False,False,False,False,False,False
2013-01-03,False,False,False,False,False,False,True
2013-01-04,False,False,False,False,False,False,True


# Operations

In [109]:
# Mean of each column (missing values are skipped, e.g. the NaN in F).
df.mean(axis=0)

A   -0.267943
B    0.617782
D    5.000000
C    0.057460
X    0.446413
F    3.000000
dtype: float64

In [110]:
# Mean across the columns of each row; the axis is named rather than passed
# positionally.
df.mean(axis=1)

2013-01-01    1.046771
2013-01-02    1.303750
2013-01-03    1.154389
2013-01-04    1.864508
2013-01-05    1.122289
2013-01-06    2.036466
Freq: D, dtype: float64

In [116]:
# Build a date-indexed series, then shift its values two rows down: the first
# two dates become NaN and the last two original values (6, 8) fall off the end.
# Note: this rebinds `s`.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8], index=dates)
s = s.shift(periods=2)
s

2013-01-01    NaN
2013-01-02    NaN
2013-01-03    1.0
2013-01-04    3.0
2013-01-05    5.0
2013-01-06    NaN
Freq: D, dtype: float64

In [118]:
# Subtract s from every column, matching on the row index; rows where s is NaN
# come out entirely NaN.
df.sub(s, axis=0)

Unnamed: 0,A,B,D,C,X,F
2013-01-01,,,,,,
2013-01-02,,,,,,
2013-01-03,-2.171436,0.515366,4.0,-0.881455,-1.53614,1.0
2013-01-04,-2.991696,-3.416033,2.0,-0.395971,-2.009252,0.0
2013-01-05,-6.45208,-4.891957,0.0,-5.773973,-5.148254,-1.0
2013-01-06,,,,,,


# Apply

In [121]:
# Apply np.cumsum to each column, giving running totals down the rows; the
# missing first value in F stays NaN and is skipped by the running sum.
df.apply(np.cumsum)

Unnamed: 0,A,B,D,C,X,F
2013-01-01,-0.021136,0.0,5,-0.435538,0.69053,
2013-01-02,0.713226,0.647168,10,-1.032691,1.728654,1.0
2013-01-03,-0.45821,2.162534,15,-0.914146,1.192514,3.0
2013-01-04,-0.449905,1.746501,20,1.689883,2.183262,6.0
2013-01-05,-1.901986,1.854545,25,0.91591,2.035008,10.0
2013-01-06,-1.607658,3.706692,30,0.344759,2.678481,15.0


In [122]:
# Re-display df: the apply() call returned a new frame and left df unchanged.
df

Unnamed: 0,A,B,D,C,X,F
2013-01-01,-0.021136,0.0,5,-0.435538,0.69053,
2013-01-02,0.734362,0.647168,5,-0.597154,1.038125,1.0
2013-01-03,-1.171436,1.515366,5,0.118545,-0.53614,2.0
2013-01-04,0.008304,-0.416033,5,2.604029,0.990748,3.0
2013-01-05,-1.45208,0.108043,5,-0.773973,-0.148254,4.0
2013-01-06,0.294327,1.852148,5,-0.571151,0.643473,5.0


In [None]:
# Ten random integers drawn from 0..6 (the upper bound 7 is exclusive), then a
# frequency table of how often each value occurs. Unseeded, so results vary
# between runs. Note: this rebinds `s`.
s = pd.Series(np.random.randint(low=0, high=7, size=10))
s.value_counts()

# String methods

In [None]:
# String series with one missing entry. The .str accessor applies the string
# method element-wise and passes the NaN through untouched.
s = pd.Series(
    ["A", "B", "C", "Aaba", "Baca", np.nan, "CABA", "dog", "cat"]
)
s.str.lower()

# Merge

## Concat

In [None]:
# Fresh 10x4 frame of standard-normal draws with default integer row and
# column labels. This rebinds `df`: the date-indexed frame used in the
# sections above is no longer reachable under that name.
df = pd.DataFrame(np.random.standard_normal(size=(10, 4)))
df

In [None]:
# Split the frame into three consecutive row blocks by position:
# rows 0-2, rows 3-6 and rows 7 onward.
pieces = [df.iloc[:3], df.iloc[3:7], df.iloc[7:]]
pieces

In [None]:
# Stack the pieces back together row-wise, reproducing the original frame.
pd.concat(pieces, axis=0)