In [1]:
import numpy as np
import pandas as pd

#### Creating a Series by passing a list of values

In [2]:
pd.Series(data=[1,3,5,7,np.nan,8,9])

0    1.0
1    3.0
2    5.0
3    7.0
4    NaN
5    8.0
6    9.0
dtype: float64

#### Creating a DataFrame by passing a Numpy array with date time index and tabled columns

In [3]:
dates = pd.date_range("20210414",periods=6)
dates

DatetimeIndex(['2021-04-14', '2021-04-15', '2021-04-16', '2021-04-17',
               '2021-04-18', '2021-04-19'],
              dtype='datetime64[ns]', freq='D')

In [4]:
data = np.random.randn(6,4)
data

array([[ 0.55314246,  0.67900938, -0.54922815,  1.80464333],
       [-0.71818104,  0.33499589,  0.72261684,  0.50193289],
       [-1.79677165, -0.220665  ,  0.47831967,  0.7413944 ],
       [-0.63999307, -1.73622828, -0.16395829,  0.10901429],
       [ 0.90653681, -0.39356491, -0.19510729, -1.05329046],
       [ 0.55336638, -0.02827893,  0.69234188, -0.43538553]])

In [5]:
col = ['A','B','C','D']
col

['A', 'B', 'C', 'D']

In [6]:
df = pd.DataFrame(data=data, index=dates, columns=col)
df

Unnamed: 0,A,B,C,D
2021-04-14,0.553142,0.679009,-0.549228,1.804643
2021-04-15,-0.718181,0.334996,0.722617,0.501933
2021-04-16,-1.796772,-0.220665,0.47832,0.741394
2021-04-17,-0.639993,-1.736228,-0.163958,0.109014
2021-04-18,0.906537,-0.393565,-0.195107,-1.05329
2021-04-19,0.553366,-0.028279,0.692342,-0.435386


In [7]:
df.head()

Unnamed: 0,A,B,C,D
2021-04-14,0.553142,0.679009,-0.549228,1.804643
2021-04-15,-0.718181,0.334996,0.722617,0.501933
2021-04-16,-1.796772,-0.220665,0.47832,0.741394
2021-04-17,-0.639993,-1.736228,-0.163958,0.109014
2021-04-18,0.906537,-0.393565,-0.195107,-1.05329


In [8]:
df.tail()

Unnamed: 0,A,B,C,D
2021-04-15,-0.718181,0.334996,0.722617,0.501933
2021-04-16,-1.796772,-0.220665,0.47832,0.741394
2021-04-17,-0.639993,-1.736228,-0.163958,0.109014
2021-04-18,0.906537,-0.393565,-0.195107,-1.05329
2021-04-19,0.553366,-0.028279,0.692342,-0.435386


In [9]:
df.index

DatetimeIndex(['2021-04-14', '2021-04-15', '2021-04-16', '2021-04-17',
               '2021-04-18', '2021-04-19'],
              dtype='datetime64[ns]', freq='D')

In [10]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

#### Creating a DataFrame by passing a dict of objects that can be converted into series

In [11]:
df1 = pd.DataFrame(
        {
            "A": 1.0,
            "B": pd.Timestamp("20130102"),
            "C": pd.Series(1, index=list(range(4)), dtype="float32"),
            "D": np.array([3] * 4, dtype="int32"),
            "E": pd.Categorical(["test", "train", "test", "train"]),
            "F": "foo",
        }
    )
df1

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [12]:
pd.Timestamp("20130102")

Timestamp('2013-01-02 00:00:00')

In [13]:
pd.Series(1, index=list(range(4)), dtype="float32")

0    1.0
1    1.0
2    1.0
3    1.0
dtype: float32

In [14]:
np.array([3] * 4, dtype="int32")

array([3, 3, 3, 3])

In [15]:
pd.Categorical(["test", "train", "test", "train"])

[test, train, test, train]
Categories (2, object): [test, train]

In [16]:
df1.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

#### Converting DataFrame to numpy array

In [17]:
df.to_numpy()

array([[ 0.55314246,  0.67900938, -0.54922815,  1.80464333],
       [-0.71818104,  0.33499589,  0.72261684,  0.50193289],
       [-1.79677165, -0.220665  ,  0.47831967,  0.7413944 ],
       [-0.63999307, -1.73622828, -0.16395829,  0.10901429],
       [ 0.90653681, -0.39356491, -0.19510729, -1.05329046],
       [ 0.55336638, -0.02827893,  0.69234188, -0.43538553]])

In [18]:
df1.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

#### it shows a quick statistic summary of your data:

In [19]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.190317,-0.227455,0.164164,0.278051
std,1.036381,0.834474,0.535762,0.990488
min,-1.796772,-1.736228,-0.549228,-1.05329
25%,-0.698634,-0.35034,-0.18732,-0.299286
50%,-0.043425,-0.124472,0.157181,0.305474
75%,0.55331,0.244177,0.638836,0.681529
max,0.906537,0.679009,0.722617,1.804643


#### Transpose

In [20]:
df

Unnamed: 0,A,B,C,D
2021-04-14,0.553142,0.679009,-0.549228,1.804643
2021-04-15,-0.718181,0.334996,0.722617,0.501933
2021-04-16,-1.796772,-0.220665,0.47832,0.741394
2021-04-17,-0.639993,-1.736228,-0.163958,0.109014
2021-04-18,0.906537,-0.393565,-0.195107,-1.05329
2021-04-19,0.553366,-0.028279,0.692342,-0.435386


In [21]:
df.T

Unnamed: 0,2021-04-14,2021-04-15,2021-04-16,2021-04-17,2021-04-18,2021-04-19
A,0.553142,-0.718181,-1.796772,-0.639993,0.906537,0.553366
B,0.679009,0.334996,-0.220665,-1.736228,-0.393565,-0.028279
C,-0.549228,0.722617,0.47832,-0.163958,-0.195107,0.692342
D,1.804643,0.501933,0.741394,0.109014,-1.05329,-0.435386


#### Sorting by an axis:

In [22]:
df

Unnamed: 0,A,B,C,D
2021-04-14,0.553142,0.679009,-0.549228,1.804643
2021-04-15,-0.718181,0.334996,0.722617,0.501933
2021-04-16,-1.796772,-0.220665,0.47832,0.741394
2021-04-17,-0.639993,-1.736228,-0.163958,0.109014
2021-04-18,0.906537,-0.393565,-0.195107,-1.05329
2021-04-19,0.553366,-0.028279,0.692342,-0.435386


In [23]:
df.sort_index(axis=1, ascending=False)    #axis=1 column wise

Unnamed: 0,D,C,B,A
2021-04-14,1.804643,-0.549228,0.679009,0.553142
2021-04-15,0.501933,0.722617,0.334996,-0.718181
2021-04-16,0.741394,0.47832,-0.220665,-1.796772
2021-04-17,0.109014,-0.163958,-1.736228,-0.639993
2021-04-18,-1.05329,-0.195107,-0.393565,0.906537
2021-04-19,-0.435386,0.692342,-0.028279,0.553366


#### Sorting by values:

In [24]:
df

Unnamed: 0,A,B,C,D
2021-04-14,0.553142,0.679009,-0.549228,1.804643
2021-04-15,-0.718181,0.334996,0.722617,0.501933
2021-04-16,-1.796772,-0.220665,0.47832,0.741394
2021-04-17,-0.639993,-1.736228,-0.163958,0.109014
2021-04-18,0.906537,-0.393565,-0.195107,-1.05329
2021-04-19,0.553366,-0.028279,0.692342,-0.435386


In [25]:
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2021-04-17,-0.639993,-1.736228,-0.163958,0.109014
2021-04-18,0.906537,-0.393565,-0.195107,-1.05329
2021-04-16,-1.796772,-0.220665,0.47832,0.741394
2021-04-19,0.553366,-0.028279,0.692342,-0.435386
2021-04-15,-0.718181,0.334996,0.722617,0.501933
2021-04-14,0.553142,0.679009,-0.549228,1.804643


#### Getting: Selecting a single column, which yields a Series, equivalent to df.A:

In [26]:
df['A']          #series

2021-04-14    0.553142
2021-04-15   -0.718181
2021-04-16   -1.796772
2021-04-17   -0.639993
2021-04-18    0.906537
2021-04-19    0.553366
Freq: D, Name: A, dtype: float64

In [27]:
df[0:3]

Unnamed: 0,A,B,C,D
2021-04-14,0.553142,0.679009,-0.549228,1.804643
2021-04-15,-0.718181,0.334996,0.722617,0.501933
2021-04-16,-1.796772,-0.220665,0.47832,0.741394


In [28]:
df["20210414":"20210416"]

Unnamed: 0,A,B,C,D
2021-04-14,0.553142,0.679009,-0.549228,1.804643
2021-04-15,-0.718181,0.334996,0.722617,0.501933
2021-04-16,-1.796772,-0.220665,0.47832,0.741394


#### Selection by label

In [29]:
df

Unnamed: 0,A,B,C,D
2021-04-14,0.553142,0.679009,-0.549228,1.804643
2021-04-15,-0.718181,0.334996,0.722617,0.501933
2021-04-16,-1.796772,-0.220665,0.47832,0.741394
2021-04-17,-0.639993,-1.736228,-0.163958,0.109014
2021-04-18,0.906537,-0.393565,-0.195107,-1.05329
2021-04-19,0.553366,-0.028279,0.692342,-0.435386


In [30]:
df.loc["20210414"]

A    0.553142
B    0.679009
C   -0.549228
D    1.804643
Name: 2021-04-14 00:00:00, dtype: float64

In [31]:
df.loc[[dates[0]]]

Unnamed: 0,A,B,C,D
2021-04-14,0.553142,0.679009,-0.549228,1.804643


In [32]:
df.loc["20210414":"20210416"]

Unnamed: 0,A,B,C,D
2021-04-14,0.553142,0.679009,-0.549228,1.804643
2021-04-15,-0.718181,0.334996,0.722617,0.501933
2021-04-16,-1.796772,-0.220665,0.47832,0.741394


In [33]:
df.loc[[dates[0],dates[2]]]

Unnamed: 0,A,B,C,D
2021-04-14,0.553142,0.679009,-0.549228,1.804643
2021-04-16,-1.796772,-0.220665,0.47832,0.741394


In [34]:
df.loc[:,'A']

2021-04-14    0.553142
2021-04-15   -0.718181
2021-04-16   -1.796772
2021-04-17   -0.639993
2021-04-18    0.906537
2021-04-19    0.553366
Freq: D, Name: A, dtype: float64

In [35]:
df.loc[:,['A']]

Unnamed: 0,A
2021-04-14,0.553142
2021-04-15,-0.718181
2021-04-16,-1.796772
2021-04-17,-0.639993
2021-04-18,0.906537
2021-04-19,0.553366


In [36]:
df.loc[:,'A':'C']

Unnamed: 0,A,B,C
2021-04-14,0.553142,0.679009,-0.549228
2021-04-15,-0.718181,0.334996,0.722617
2021-04-16,-1.796772,-0.220665,0.47832
2021-04-17,-0.639993,-1.736228,-0.163958
2021-04-18,0.906537,-0.393565,-0.195107
2021-04-19,0.553366,-0.028279,0.692342


In [37]:
df.loc[:,['A','C']]

Unnamed: 0,A,C
2021-04-14,0.553142,-0.549228
2021-04-15,-0.718181,0.722617
2021-04-16,-1.796772,0.47832
2021-04-17,-0.639993,-0.163958
2021-04-18,0.906537,-0.195107
2021-04-19,0.553366,0.692342


In [38]:
df.loc["20210414":"20210416", ["A", "B"]]

Unnamed: 0,A,B
2021-04-14,0.553142,0.679009
2021-04-15,-0.718181,0.334996
2021-04-16,-1.796772,-0.220665


#### Selecting by integer or index

In [39]:
df

Unnamed: 0,A,B,C,D
2021-04-14,0.553142,0.679009,-0.549228,1.804643
2021-04-15,-0.718181,0.334996,0.722617,0.501933
2021-04-16,-1.796772,-0.220665,0.47832,0.741394
2021-04-17,-0.639993,-1.736228,-0.163958,0.109014
2021-04-18,0.906537,-0.393565,-0.195107,-1.05329
2021-04-19,0.553366,-0.028279,0.692342,-0.435386


In [40]:
df.iloc[3]

A   -0.639993
B   -1.736228
C   -0.163958
D    0.109014
Name: 2021-04-17 00:00:00, dtype: float64

In [41]:
df.iloc[[3]]

Unnamed: 0,A,B,C,D
2021-04-17,-0.639993,-1.736228,-0.163958,0.109014


In [42]:
df.iloc[1:4]

Unnamed: 0,A,B,C,D
2021-04-15,-0.718181,0.334996,0.722617,0.501933
2021-04-16,-1.796772,-0.220665,0.47832,0.741394
2021-04-17,-0.639993,-1.736228,-0.163958,0.109014


In [43]:
df.iloc[[1,3]]

Unnamed: 0,A,B,C,D
2021-04-15,-0.718181,0.334996,0.722617,0.501933
2021-04-17,-0.639993,-1.736228,-0.163958,0.109014


In [44]:
df.iloc[:,1]

2021-04-14    0.679009
2021-04-15    0.334996
2021-04-16   -0.220665
2021-04-17   -1.736228
2021-04-18   -0.393565
2021-04-19   -0.028279
Freq: D, Name: B, dtype: float64

In [45]:
df.iloc[:,[1]]

Unnamed: 0,B
2021-04-14,0.679009
2021-04-15,0.334996
2021-04-16,-0.220665
2021-04-17,-1.736228
2021-04-18,-0.393565
2021-04-19,-0.028279


In [46]:
df.iloc[:,1:3]

Unnamed: 0,B,C
2021-04-14,0.679009,-0.549228
2021-04-15,0.334996,0.722617
2021-04-16,-0.220665,0.47832
2021-04-17,-1.736228,-0.163958
2021-04-18,-0.393565,-0.195107
2021-04-19,-0.028279,0.692342


In [47]:
df.iloc[:,[1,3]]

Unnamed: 0,B,D
2021-04-14,0.679009,1.804643
2021-04-15,0.334996,0.501933
2021-04-16,-0.220665,0.741394
2021-04-17,-1.736228,0.109014
2021-04-18,-0.393565,-1.05329
2021-04-19,-0.028279,-0.435386


In [48]:
df.iloc[1:3,2]

2021-04-15    0.722617
2021-04-16    0.478320
Freq: D, Name: C, dtype: float64

In [49]:
df.iloc[2,1:3]

B   -0.220665
C    0.478320
Name: 2021-04-16 00:00:00, dtype: float64

In [50]:
df.iloc[1:3,1:3]

Unnamed: 0,B,C
2021-04-15,0.334996,0.722617
2021-04-16,-0.220665,0.47832


#### Boolean indexing

In [57]:
df[df['A']>0]

Unnamed: 0,A,B,C,D
2021-04-14,0.553142,0.679009,-0.549228,1.804643
2021-04-18,0.906537,-0.393565,-0.195107,-1.05329
2021-04-19,0.553366,-0.028279,0.692342,-0.435386


In [58]:
df>0

Unnamed: 0,A,B,C,D
2021-04-14,True,True,False,True
2021-04-15,False,True,True,True
2021-04-16,False,False,True,True
2021-04-17,False,False,False,True
2021-04-18,True,False,False,False
2021-04-19,True,False,True,False


In [59]:
df[df>0]

Unnamed: 0,A,B,C,D
2021-04-14,0.553142,0.679009,,1.804643
2021-04-15,,0.334996,0.722617,0.501933
2021-04-16,,,0.47832,0.741394
2021-04-17,,,,0.109014
2021-04-18,0.906537,,,
2021-04-19,0.553366,,0.692342,


In [60]:
df.fillna(0)       #df.replace(np.NAN,0)

Unnamed: 0,A,B,C,D
2021-04-14,0.553142,0.679009,-0.549228,1.804643
2021-04-15,-0.718181,0.334996,0.722617,0.501933
2021-04-16,-1.796772,-0.220665,0.47832,0.741394
2021-04-17,-0.639993,-1.736228,-0.163958,0.109014
2021-04-18,0.906537,-0.393565,-0.195107,-1.05329
2021-04-19,0.553366,-0.028279,0.692342,-0.435386


#### filtering

In [61]:
df2 = df.copy()

In [62]:
df2['E']=['zero','one','two','three','Four','five']

In [63]:
df2

Unnamed: 0,A,B,C,D,E
2021-04-14,0.553142,0.679009,-0.549228,1.804643,zero
2021-04-15,-0.718181,0.334996,0.722617,0.501933,one
2021-04-16,-1.796772,-0.220665,0.47832,0.741394,two
2021-04-17,-0.639993,-1.736228,-0.163958,0.109014,three
2021-04-18,0.906537,-0.393565,-0.195107,-1.05329,Four
2021-04-19,0.553366,-0.028279,0.692342,-0.435386,five


In [64]:
df[df2['E'].isin(['zero','two'])]

Unnamed: 0,A,B,C,D
2021-04-14,0.553142,0.679009,-0.549228,1.804643
2021-04-16,-1.796772,-0.220665,0.47832,0.741394


#### Setting a new column automatically aligns the data by the indexes.

In [65]:
s1 = pd.Series(data = [1,2,3,4,5,6], index=pd.date_range("20210414",periods=6))
s1

2021-04-14    1
2021-04-15    2
2021-04-16    3
2021-04-17    4
2021-04-18    5
2021-04-19    6
Freq: D, dtype: int64

In [66]:
df['F']= s1

In [67]:
df

Unnamed: 0,A,B,C,D,F
2021-04-14,0.553142,0.679009,-0.549228,1.804643,1
2021-04-15,-0.718181,0.334996,0.722617,0.501933,2
2021-04-16,-1.796772,-0.220665,0.47832,0.741394,3
2021-04-17,-0.639993,-1.736228,-0.163958,0.109014,4
2021-04-18,0.906537,-0.393565,-0.195107,-1.05329,5
2021-04-19,0.553366,-0.028279,0.692342,-0.435386,6


In [68]:
df.at[dates[0],'A']=0

In [69]:
df

Unnamed: 0,A,B,C,D,F
2021-04-14,0.0,0.679009,-0.549228,1.804643,1
2021-04-15,-0.718181,0.334996,0.722617,0.501933,2
2021-04-16,-1.796772,-0.220665,0.47832,0.741394,3
2021-04-17,-0.639993,-1.736228,-0.163958,0.109014,4
2021-04-18,0.906537,-0.393565,-0.195107,-1.05329,5
2021-04-19,0.553366,-0.028279,0.692342,-0.435386,6


In [70]:
df.iat[0,1]=0

In [71]:
df

Unnamed: 0,A,B,C,D,F
2021-04-14,0.0,0.0,-0.549228,1.804643,1
2021-04-15,-0.718181,0.334996,0.722617,0.501933,2
2021-04-16,-1.796772,-0.220665,0.47832,0.741394,3
2021-04-17,-0.639993,-1.736228,-0.163958,0.109014,4
2021-04-18,0.906537,-0.393565,-0.195107,-1.05329,5
2021-04-19,0.553366,-0.028279,0.692342,-0.435386,6


In [72]:
df.iloc[:,3] = np.array([5]*len(df))

In [73]:
df

Unnamed: 0,A,B,C,D,F
2021-04-14,0.0,0.0,-0.549228,5,1
2021-04-15,-0.718181,0.334996,0.722617,5,2
2021-04-16,-1.796772,-0.220665,0.47832,5,3
2021-04-17,-0.639993,-1.736228,-0.163958,5,4
2021-04-18,0.906537,-0.393565,-0.195107,5,5
2021-04-19,0.553366,-0.028279,0.692342,5,6


In [74]:
df2 = df.copy()

In [75]:
df2

Unnamed: 0,A,B,C,D,F
2021-04-14,0.0,0.0,-0.549228,5,1
2021-04-15,-0.718181,0.334996,0.722617,5,2
2021-04-16,-1.796772,-0.220665,0.47832,5,3
2021-04-17,-0.639993,-1.736228,-0.163958,5,4
2021-04-18,0.906537,-0.393565,-0.195107,5,5
2021-04-19,0.553366,-0.028279,0.692342,5,6


In [76]:
df2[df2>0]=-df2

In [77]:
df2

Unnamed: 0,A,B,C,D,F
2021-04-14,0.0,0.0,-0.549228,-5,-1
2021-04-15,-0.718181,-0.334996,-0.722617,-5,-2
2021-04-16,-1.796772,-0.220665,-0.47832,-5,-3
2021-04-17,-0.639993,-1.736228,-0.163958,-5,-4
2021-04-18,-0.906537,-0.393565,-0.195107,-5,-5
2021-04-19,-0.553366,-0.028279,-0.692342,-5,-6
