In [1]:
import numpy as np
import pandas as pd

#### Creating a Series by passing a list of values

In [2]:
# np.nan marks a missing value; as the output shows, its presence makes the
# whole Series float64 (the integers are displayed as 1.0, 3.0, ...).
pd.Series(data=[1,3,5,7,np.nan,8,9])

0    1.0
1    3.0
2    5.0
3    7.0
4    NaN
5    8.0
6    9.0
dtype: float64

#### Creating a DataFrame by passing a NumPy array, with a datetime index and labeled columns

In [3]:
# Six consecutive calendar days beginning 14 April 2021; this becomes the
# row index of df further down.
dates = pd.date_range(start="20210414", periods=6)
dates

DatetimeIndex(['2021-04-14', '2021-04-15', '2021-04-16', '2021-04-17',
               '2021-04-18', '2021-04-19'],
              dtype='datetime64[ns]', freq='D')

In [4]:
# 6x4 array of samples drawn from the standard normal distribution.
# NOTE(review): no random seed is set in the cells shown, so these values
# (and every table derived from them below) will differ on each fresh run.
data = np.random.randn(6,4)
data

array([[ 0.26453976,  0.46907406, -0.42313207, -1.51849486],
       [ 0.66643402, -1.20924259,  0.38812851,  1.95490425],
       [ 0.36190572, -0.04840396,  2.49760838,  0.38887794],
       [-0.47324833, -0.51393514,  1.86676604, -0.29506366],
       [ 0.78172218,  2.39554958, -0.70527088, -1.32206065],
       [ 0.08595353, -0.4345028 , -0.29090704, -0.78599001]])

In [5]:
col = ['A','B','C','D']
col

['A', 'B', 'C', 'D']

In [6]:
# Values come from `data`, rows are labelled by `dates`, columns by `col`.
df = pd.DataFrame(data=data, index=dates, columns=col)
df

Unnamed: 0,A,B,C,D
2021-04-14,0.26454,0.469074,-0.423132,-1.518495
2021-04-15,0.666434,-1.209243,0.388129,1.954904
2021-04-16,0.361906,-0.048404,2.497608,0.388878
2021-04-17,-0.473248,-0.513935,1.866766,-0.295064
2021-04-18,0.781722,2.39555,-0.705271,-1.322061
2021-04-19,0.085954,-0.434503,-0.290907,-0.78599


In [7]:
df.head()

Unnamed: 0,A,B,C,D
2021-04-14,0.26454,0.469074,-0.423132,-1.518495
2021-04-15,0.666434,-1.209243,0.388129,1.954904
2021-04-16,0.361906,-0.048404,2.497608,0.388878
2021-04-17,-0.473248,-0.513935,1.866766,-0.295064
2021-04-18,0.781722,2.39555,-0.705271,-1.322061


In [8]:
df.tail()

Unnamed: 0,A,B,C,D
2021-04-15,0.666434,-1.209243,0.388129,1.954904
2021-04-16,0.361906,-0.048404,2.497608,0.388878
2021-04-17,-0.473248,-0.513935,1.866766,-0.295064
2021-04-18,0.781722,2.39555,-0.705271,-1.322061
2021-04-19,0.085954,-0.434503,-0.290907,-0.78599


In [9]:
df.index

DatetimeIndex(['2021-04-14', '2021-04-15', '2021-04-16', '2021-04-17',
               '2021-04-18', '2021-04-19'],
              dtype='datetime64[ns]', freq='D')

In [10]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

#### Creating a DataFrame by passing a dict of objects that can be converted into a Series-like structure

In [11]:
# Every dict key becomes a column. The scalar entries ("A", "B", "F") are
# repeated on each row, while the Series, array and Categorical each supply
# four values of their own, giving a 4-row frame with one dtype per column.
df1 = pd.DataFrame({
    "A": 1.0,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index=list(range(4)), dtype="float32"),
    "D": np.array([3] * 4, dtype="int32"),
    "E": pd.Categorical(["test", "train", "test", "train"]),
    "F": "foo",
})
df1

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


In [12]:
pd.Timestamp("20130102")

Timestamp('2013-01-02 00:00:00')

In [13]:
pd.Series(1, index=list(range(4)), dtype="float32")

0    1.0
1    1.0
2    1.0
3    1.0
dtype: float32

In [14]:
np.array([3] * 4, dtype="int32")

array([3, 3, 3, 3])

In [15]:
pd.Categorical(["test", "train", "test", "train"])

[test, train, test, train]
Categories (2, object): [test, train]

In [16]:
df1.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

#### Converting a DataFrame to a NumPy array

In [17]:
df.to_numpy()

array([[ 0.26453976,  0.46907406, -0.42313207, -1.51849486],
       [ 0.66643402, -1.20924259,  0.38812851,  1.95490425],
       [ 0.36190572, -0.04840396,  2.49760838,  0.38887794],
       [-0.47324833, -0.51393514,  1.86676604, -0.29506366],
       [ 0.78172218,  2.39554958, -0.70527088, -1.32206065],
       [ 0.08595353, -0.4345028 , -0.29090704, -0.78599001]])

In [18]:
# df1's columns have different dtypes (float, datetime, int, category, object),
# so the combined array falls back to dtype=object, as the output shows.
df1.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

#### describe() shows a quick statistical summary of your data:

In [19]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.281218,0.109757,0.555532,-0.262971
std,0.450118,1.249315,1.325359,1.289687
min,-0.473248,-1.209243,-0.705271,-1.518495
25%,0.1306,-0.494077,-0.390076,-1.188043
50%,0.313223,-0.241453,0.048611,-0.540527
75%,0.590302,0.339705,1.497107,0.217893
max,0.781722,2.39555,2.497608,1.954904


#### Transpose

In [20]:
df

Unnamed: 0,A,B,C,D
2021-04-14,0.26454,0.469074,-0.423132,-1.518495
2021-04-15,0.666434,-1.209243,0.388129,1.954904
2021-04-16,0.361906,-0.048404,2.497608,0.388878
2021-04-17,-0.473248,-0.513935,1.866766,-0.295064
2021-04-18,0.781722,2.39555,-0.705271,-1.322061
2021-04-19,0.085954,-0.434503,-0.290907,-0.78599


In [21]:
df.T

Unnamed: 0,2021-04-14,2021-04-15,2021-04-16,2021-04-17,2021-04-18,2021-04-19
A,0.26454,0.666434,0.361906,-0.473248,0.781722,0.085954
B,0.469074,-1.209243,-0.048404,-0.513935,2.39555,-0.434503
C,-0.423132,0.388129,2.497608,1.866766,-0.705271,-0.290907
D,-1.518495,1.954904,0.388878,-0.295064,-1.322061,-0.78599


#### Sorting by an axis:

In [22]:
df

Unnamed: 0,A,B,C,D
2021-04-14,0.26454,0.469074,-0.423132,-1.518495
2021-04-15,0.666434,-1.209243,0.388129,1.954904
2021-04-16,0.361906,-0.048404,2.497608,0.388878
2021-04-17,-0.473248,-0.513935,1.866766,-0.295064
2021-04-18,0.781722,2.39555,-0.705271,-1.322061
2021-04-19,0.085954,-0.434503,-0.290907,-0.78599


In [23]:
# axis=1 sorts by the column labels rather than the row index;
# ascending=False reverses them, giving the column order D, C, B, A.
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2021-04-14,-1.518495,-0.423132,0.469074,0.26454
2021-04-15,1.954904,0.388129,-1.209243,0.666434
2021-04-16,0.388878,2.497608,-0.048404,0.361906
2021-04-17,-0.295064,1.866766,-0.513935,-0.473248
2021-04-18,-1.322061,-0.705271,2.39555,0.781722
2021-04-19,-0.78599,-0.290907,-0.434503,0.085954


#### Sorting by values:

In [24]:
df

Unnamed: 0,A,B,C,D
2021-04-14,0.26454,0.469074,-0.423132,-1.518495
2021-04-15,0.666434,-1.209243,0.388129,1.954904
2021-04-16,0.361906,-0.048404,2.497608,0.388878
2021-04-17,-0.473248,-0.513935,1.866766,-0.295064
2021-04-18,0.781722,2.39555,-0.705271,-1.322061
2021-04-19,0.085954,-0.434503,-0.290907,-0.78599


In [25]:
df.sort_values(by="B")

Unnamed: 0,A,B,C,D
2021-04-15,0.666434,-1.209243,0.388129,1.954904
2021-04-17,-0.473248,-0.513935,1.866766,-0.295064
2021-04-19,0.085954,-0.434503,-0.290907,-0.78599
2021-04-16,0.361906,-0.048404,2.497608,0.388878
2021-04-14,0.26454,0.469074,-0.423132,-1.518495
2021-04-18,0.781722,2.39555,-0.705271,-1.322061


#### Getting: Selecting a single column, which yields a Series, equivalent to df.A:

In [26]:
df['A']          # a single column label returns that column as a Series

2021-04-14    0.264540
2021-04-15    0.666434
2021-04-16    0.361906
2021-04-17   -0.473248
2021-04-18    0.781722
2021-04-19    0.085954
Freq: D, Name: A, dtype: float64

In [27]:
# Slicing with integers selects rows by position, end exclusive: rows 0, 1 and 2.
df[0:3]

Unnamed: 0,A,B,C,D
2021-04-14,0.26454,0.469074,-0.423132,-1.518495
2021-04-15,0.666434,-1.209243,0.388129,1.954904
2021-04-16,0.361906,-0.048404,2.497608,0.388878


In [28]:
# Slicing with date strings selects rows by label; both endpoints are included.
df["20210414":"20210416"]

Unnamed: 0,A,B,C,D
2021-04-14,0.26454,0.469074,-0.423132,-1.518495
2021-04-15,0.666434,-1.209243,0.388129,1.954904
2021-04-16,0.361906,-0.048404,2.497608,0.388878


#### Selection by label

In [29]:
df

Unnamed: 0,A,B,C,D
2021-04-14,0.26454,0.469074,-0.423132,-1.518495
2021-04-15,0.666434,-1.209243,0.388129,1.954904
2021-04-16,0.361906,-0.048404,2.497608,0.388878
2021-04-17,-0.473248,-0.513935,1.866766,-0.295064
2021-04-18,0.781722,2.39555,-0.705271,-1.322061
2021-04-19,0.085954,-0.434503,-0.290907,-0.78599


In [30]:
# A single row label returns that row as a Series indexed by column name.
df.loc["20210414"]

A    0.264540
B    0.469074
C   -0.423132
D   -1.518495
Name: 2021-04-14 00:00:00, dtype: float64

In [31]:
# A list of labels, even a one-element list, returns a DataFrame instead.
df.loc[[dates[0]]]

Unnamed: 0,A,B,C,D
2021-04-14,0.26454,0.469074,-0.423132,-1.518495


In [32]:
df.loc["20210414":"20210416"]

Unnamed: 0,A,B,C,D
2021-04-14,0.26454,0.469074,-0.423132,-1.518495
2021-04-15,0.666434,-1.209243,0.388129,1.954904
2021-04-16,0.361906,-0.048404,2.497608,0.388878


In [33]:
df.loc[[dates[0],dates[2]]]

Unnamed: 0,A,B,C,D
2021-04-14,0.26454,0.469074,-0.423132,-1.518495
2021-04-16,0.361906,-0.048404,2.497608,0.388878


In [34]:
df.loc[:,'A']

2021-04-14    0.264540
2021-04-15    0.666434
2021-04-16    0.361906
2021-04-17   -0.473248
2021-04-18    0.781722
2021-04-19    0.085954
Freq: D, Name: A, dtype: float64

In [35]:
df.loc[:,['A']]

Unnamed: 0,A
2021-04-14,0.26454
2021-04-15,0.666434
2021-04-16,0.361906
2021-04-17,-0.473248
2021-04-18,0.781722
2021-04-19,0.085954


In [36]:
# Label slices include both endpoints: columns A, B and C are returned.
df.loc[:,'A':'C']

Unnamed: 0,A,B,C
2021-04-14,0.26454,0.469074,-0.423132
2021-04-15,0.666434,-1.209243,0.388129
2021-04-16,0.361906,-0.048404,2.497608
2021-04-17,-0.473248,-0.513935,1.866766
2021-04-18,0.781722,2.39555,-0.705271
2021-04-19,0.085954,-0.434503,-0.290907


In [37]:
df.loc[:,['A','C']]

Unnamed: 0,A,C
2021-04-14,0.26454,-0.423132
2021-04-15,0.666434,0.388129
2021-04-16,0.361906,2.497608
2021-04-17,-0.473248,1.866766
2021-04-18,0.781722,-0.705271
2021-04-19,0.085954,-0.290907


In [38]:
df.loc["20210414":"20210416", ["A", "B"]]

Unnamed: 0,A,B
2021-04-14,0.26454,0.469074
2021-04-15,0.666434,-1.209243
2021-04-16,0.361906,-0.048404


#### Selection by position (integer index)

In [39]:
df

Unnamed: 0,A,B,C,D
2021-04-14,0.26454,0.469074,-0.423132,-1.518495
2021-04-15,0.666434,-1.209243,0.388129,1.954904
2021-04-16,0.361906,-0.048404,2.497608,0.388878
2021-04-17,-0.473248,-0.513935,1.866766,-0.295064
2021-04-18,0.781722,2.39555,-0.705271,-1.322061
2021-04-19,0.085954,-0.434503,-0.290907,-0.78599


In [40]:
df.iloc[3]

A   -0.473248
B   -0.513935
C    1.866766
D   -0.295064
Name: 2021-04-17 00:00:00, dtype: float64

In [41]:
df.iloc[[3]]

Unnamed: 0,A,B,C,D
2021-04-17,-0.473248,-0.513935,1.866766,-0.295064


In [42]:
df.iloc[1:4]

Unnamed: 0,A,B,C,D
2021-04-15,0.666434,-1.209243,0.388129,1.954904
2021-04-16,0.361906,-0.048404,2.497608,0.388878
2021-04-17,-0.473248,-0.513935,1.866766,-0.295064


In [43]:
df.iloc[[1,3]]

Unnamed: 0,A,B,C,D
2021-04-15,0.666434,-1.209243,0.388129,1.954904
2021-04-17,-0.473248,-0.513935,1.866766,-0.295064


In [44]:
df.iloc[:,1]

2021-04-14    0.469074
2021-04-15   -1.209243
2021-04-16   -0.048404
2021-04-17   -0.513935
2021-04-18    2.395550
2021-04-19   -0.434503
Freq: D, Name: B, dtype: float64

In [45]:
df.iloc[:,[1]]

Unnamed: 0,B
2021-04-14,0.469074
2021-04-15,-1.209243
2021-04-16,-0.048404
2021-04-17,-0.513935
2021-04-18,2.39555
2021-04-19,-0.434503


In [46]:
# Positional slices exclude the end: columns at positions 1 and 2 (B and C).
df.iloc[:,1:3]

Unnamed: 0,B,C
2021-04-14,0.469074,-0.423132
2021-04-15,-1.209243,0.388129
2021-04-16,-0.048404,2.497608
2021-04-17,-0.513935,1.866766
2021-04-18,2.39555,-0.705271
2021-04-19,-0.434503,-0.290907


In [47]:
df.iloc[:,[1,3]]

Unnamed: 0,B,D
2021-04-14,0.469074,-1.518495
2021-04-15,-1.209243,1.954904
2021-04-16,-0.048404,0.388878
2021-04-17,-0.513935,-0.295064
2021-04-18,2.39555,-1.322061
2021-04-19,-0.434503,-0.78599


In [48]:
df.iloc[1:3,2]

2021-04-15    0.388129
2021-04-16    2.497608
Freq: D, Name: C, dtype: float64

In [49]:
df.iloc[2,1:3]

B   -0.048404
C    2.497608
Name: 2021-04-16 00:00:00, dtype: float64

In [50]:
df.iloc[1:3,1:3]

Unnamed: 0,B,C
2021-04-15,-1.209243,0.388129
2021-04-16,-0.048404,2.497608


#### Boolean indexing

In [55]:
# Keep only the rows whose value in column A is positive.
df[df['A']>0]

Unnamed: 0,A,B,C,D
2021-04-14,0.26454,0.469074,-0.423132,-1.518495
2021-04-15,0.666434,-1.209243,0.388129,1.954904
2021-04-16,0.361906,-0.048404,2.497608,0.388878
2021-04-18,0.781722,2.39555,-0.705271,-1.322061
2021-04-19,0.085954,-0.434503,-0.290907,-0.78599


In [58]:
df>0

Unnamed: 0,A,B,C,D
2021-04-14,True,True,False,False
2021-04-15,True,False,True,True
2021-04-16,True,False,True,True
2021-04-17,False,False,True,False
2021-04-18,True,True,False,False
2021-04-19,True,False,False,False


In [59]:
# Element-wise mask: the shape is kept, and cells that fail the condition
# become missing values (shown as empty fields in the output).
df[df>0]

Unnamed: 0,A,B,C,D
2021-04-14,0.26454,0.469074,,
2021-04-15,0.666434,,0.388129,1.954904
2021-04-16,0.361906,,2.497608,0.388878
2021-04-17,,,1.866766,
2021-04-18,0.781722,2.39555,,
2021-04-19,0.085954,,,


#### Filtering with isin()

In [60]:
df2 = df.copy()

In [61]:
df2['E']=['zero','one','two','three','Four','five']

In [62]:
df2

Unnamed: 0,A,B,C,D,E
2021-04-14,0.26454,0.469074,-0.423132,-1.518495,zero
2021-04-15,0.666434,-1.209243,0.388129,1.954904,one
2021-04-16,0.361906,-0.048404,2.497608,0.388878,two
2021-04-17,-0.473248,-0.513935,1.866766,-0.295064,three
2021-04-18,0.781722,2.39555,-0.705271,-1.322061,Four
2021-04-19,0.085954,-0.434503,-0.290907,-0.78599,five


In [65]:
# NOTE(review): the boolean mask is built from df2['E'] but applied to df.
# The rows line up because df2 is a copy of df with the same index, but the
# result omits column E; df2[...] may be what was intended — confirm.
df[df2['E'].isin(['zero','two'])]

Unnamed: 0,A,B,C,D
2021-04-14,0.26454,0.469074,-0.423132,-1.518495
2021-04-16,0.361906,-0.048404,2.497608,0.388878


#### Setting: a new column is automatically aligned with the DataFrame's index

In [71]:
# The values 1..6 labelled with the same six dates that index df, so the
# column assignment in the next cell lines up row for row.
s1 = pd.Series(
    [1, 2, 3, 4, 5, 6],
    index=pd.date_range(start="20210414", periods=6),
)
s1

2021-04-14    1
2021-04-15    2
2021-04-16    3
2021-04-17    4
2021-04-18    5
2021-04-19    6
Freq: D, dtype: int64

In [72]:
# s1 carries the same date index as df, so each value lands on the matching row.
df['F']= s1

In [73]:
df

Unnamed: 0,A,B,C,D,F
2021-04-14,0.26454,0.469074,-0.423132,-1.518495,1
2021-04-15,0.666434,-1.209243,0.388129,1.954904,2
2021-04-16,0.361906,-0.048404,2.497608,0.388878,3
2021-04-17,-0.473248,-0.513935,1.866766,-0.295064,4
2021-04-18,0.781722,2.39555,-0.705271,-1.322061,5
2021-04-19,0.085954,-0.434503,-0.290907,-0.78599,6


In [75]:
# Set one cell by label: row dates[0] (2021-04-14), column 'A'.
df.at[dates[0],'A']=0

In [76]:
df

Unnamed: 0,A,B,C,D,F
2021-04-14,0.0,0.469074,-0.423132,-1.518495,1
2021-04-15,0.666434,-1.209243,0.388129,1.954904,2
2021-04-16,0.361906,-0.048404,2.497608,0.388878,3
2021-04-17,-0.473248,-0.513935,1.866766,-0.295064,4
2021-04-18,0.781722,2.39555,-0.705271,-1.322061,5
2021-04-19,0.085954,-0.434503,-0.290907,-0.78599,6


In [79]:
# Set one cell by integer position: row 0, column 1 (column B).
df.iat[0,1]=0

In [80]:
df

Unnamed: 0,A,B,C,D,F
2021-04-14,0.0,0.0,-0.423132,-1.518495,1
2021-04-15,0.666434,-1.209243,0.388129,1.954904,2
2021-04-16,0.361906,-0.048404,2.497608,0.388878,3
2021-04-17,-0.473248,-0.513935,1.866766,-0.295064,4
2021-04-18,0.781722,2.39555,-0.705271,-1.322061,5
2021-04-19,0.085954,-0.434503,-0.290907,-0.78599,6


In [84]:
# Overwrite every row of the column at position 3 (column D) with the value 5.
df.iloc[:,3] = np.array([5]*len(df))

In [85]:
df

Unnamed: 0,A,B,C,D,F
2021-04-14,0.0,0.0,-0.423132,5,1
2021-04-15,0.666434,-1.209243,0.388129,5,2
2021-04-16,0.361906,-0.048404,2.497608,5,3
2021-04-17,-0.473248,-0.513935,1.866766,5,4
2021-04-18,0.781722,2.39555,-0.705271,5,5
2021-04-19,0.085954,-0.434503,-0.290907,5,6


In [86]:
# NOTE(review): this rebinds df2 to a fresh copy of the modified df; the
# earlier df2 that carried the extra column E is discarded at this point.
df2 = df.copy()

In [87]:
df2

Unnamed: 0,A,B,C,D,F
2021-04-14,0.0,0.0,-0.423132,5,1
2021-04-15,0.666434,-1.209243,0.388129,5,2
2021-04-16,0.361906,-0.048404,2.497608,5,3
2021-04-17,-0.473248,-0.513935,1.866766,5,4
2021-04-18,0.781722,2.39555,-0.705271,5,5
2021-04-19,0.085954,-0.434503,-0.290907,5,6


In [90]:
# Wherever a value is positive, replace it with its negation; values that are
# already zero or negative are left alone, so no entry ends up above zero.
df2[df2>0]=-df2

In [91]:
df2

Unnamed: 0,A,B,C,D,F
2021-04-14,0.0,0.0,-0.423132,-5,-1
2021-04-15,-0.666434,-1.209243,-0.388129,-5,-2
2021-04-16,-0.361906,-0.048404,-2.497608,-5,-3
2021-04-17,-0.473248,-0.513935,-1.866766,-5,-4
2021-04-18,-0.781722,-2.39555,-0.705271,-5,-5
2021-04-19,-0.085954,-0.434503,-0.290907,-5,-6
