In [49]:
import pandas as pd
import numpy as np

In [50]:
series = pd.Series([1,3, 5, 6, 8,  np.nan])
series

0    1.0
1    3.0
2    5.0
3    6.0
4    8.0
5    NaN
dtype: float64

In [51]:

df = pd.DataFrame({'A': [1,2,3,4,5,6,7,8,9],
                   'B': [11,12,13,14,15,16,17,18,19] 
})
df

Unnamed: 0,A,B
0,1,11
1,2,12
2,3,13
3,4,14
4,5,15
5,6,16
6,7,17
7,8,18
8,9,19


# Creating a DataFrame by passing a NumPy array with a datetime index using date_range() and labeled columns:

In [52]:
dates = pd.date_range("20130101", periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [53]:
dates.shape

(6,)

In [54]:
df = pd.DataFrame(np.random.randn(6, 5), index=dates, columns=list("ABCDF"))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.213839,-0.165859,-0.395933,0.250304,-1.447322
2013-01-02,0.958704,0.339191,-0.743983,-1.410221,0.523833
2013-01-03,1.071771,0.637111,2.184695,0.288887,-0.855488
2013-01-04,0.367246,-0.912911,0.490441,-0.112397,0.11664
2013-01-05,-0.482039,-0.20616,-0.256028,-0.665288,0.481627
2013-01-06,0.247592,1.66368,-0.362245,-0.298128,0.293684


# Creating a DataFrame by passing a dictionary of objects where the keys are the column labels and the values are the column values.

In [55]:
df2 = pd.DataFrame(
    {
        "A": 2.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=list(range(4)), dtype="float32"),
        "D": np.array([3] * 4, dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": "foo",
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,2.0,2013-01-02,1.0,3,test,foo
1,2.0,2013-01-02,1.0,3,train,foo
2,2.0,2013-01-02,1.0,3,test,foo
3,2.0,2013-01-02,1.0,3,train,foo


In [56]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

# Viewing data

In [57]:
df.head()

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.213839,-0.165859,-0.395933,0.250304,-1.447322
2013-01-02,0.958704,0.339191,-0.743983,-1.410221,0.523833
2013-01-03,1.071771,0.637111,2.184695,0.288887,-0.855488
2013-01-04,0.367246,-0.912911,0.490441,-0.112397,0.11664
2013-01-05,-0.482039,-0.20616,-0.256028,-0.665288,0.481627


In [58]:
df.tail()

Unnamed: 0,A,B,C,D,F
2013-01-02,0.958704,0.339191,-0.743983,-1.410221,0.523833
2013-01-03,1.071771,0.637111,2.184695,0.288887,-0.855488
2013-01-04,0.367246,-0.912911,0.490441,-0.112397,0.11664
2013-01-05,-0.482039,-0.20616,-0.256028,-0.665288,0.481627
2013-01-06,0.247592,1.66368,-0.362245,-0.298128,0.293684


In [59]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [60]:
df.columns

Index(['A', 'B', 'C', 'D', 'F'], dtype='object')

In [61]:
df.describe

<bound method NDFrame.describe of                    A         B         C         D         F
2013-01-01 -1.213839 -0.165859 -0.395933  0.250304 -1.447322
2013-01-02  0.958704  0.339191 -0.743983 -1.410221  0.523833
2013-01-03  1.071771  0.637111  2.184695  0.288887 -0.855488
2013-01-04  0.367246 -0.912911  0.490441 -0.112397  0.116640
2013-01-05 -0.482039 -0.206160 -0.256028 -0.665288  0.481627
2013-01-06  0.247592  1.663680 -0.362245 -0.298128  0.293684>

Return a NumPy representation of the underlying data with DataFrame.to_numpy() without the index or column labels: Numpy Array have one dtype for entire array while Dataframe have onedtype per column

In [62]:
df.to_numpy()

array([[-1.21383877, -0.16585884, -0.39593314,  0.25030351, -1.44732185],
       [ 0.95870415,  0.33919052, -0.74398301, -1.41022087,  0.52383286],
       [ 1.07177133,  0.63711093,  2.18469483,  0.28888745, -0.85548829],
       [ 0.36724634, -0.9129107 ,  0.49044067, -0.11239748,  0.11663961],
       [-0.48203931, -0.20616041, -0.25602763, -0.66528849,  0.48162742],
       [ 0.24759243,  1.66368013, -0.36224523, -0.29812817,  0.29368375]])

In [63]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [64]:
df.dtypes

A    float64
B    float64
C    float64
D    float64
F    float64
dtype: object

In [65]:
df2.to_numpy()

array([[2.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [2.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [2.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [2.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [66]:
df2.dtypes

A          float64
B    datetime64[s]
C          float32
D            int32
E         category
F           object
dtype: object

In [67]:
df.describe()

Unnamed: 0,A,B,C,D,F
count,6.0,6.0,6.0,6.0,6.0
mean,0.158239,0.225842,0.152824,-0.324474,-0.147838
std,0.874015,0.88161,1.075206,0.639883,0.812611
min,-1.213839,-0.912911,-0.743983,-1.410221,-1.447322
25%,-0.299631,-0.196085,-0.387511,-0.573498,-0.612456
50%,0.307419,0.086666,-0.309136,-0.205263,0.205162
75%,0.81084,0.562631,0.303824,0.159628,0.434642
max,1.071771,1.66368,2.184695,0.288887,0.523833


In [68]:
df.head()

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.213839,-0.165859,-0.395933,0.250304,-1.447322
2013-01-02,0.958704,0.339191,-0.743983,-1.410221,0.523833
2013-01-03,1.071771,0.637111,2.184695,0.288887,-0.855488
2013-01-04,0.367246,-0.912911,0.490441,-0.112397,0.11664
2013-01-05,-0.482039,-0.20616,-0.256028,-0.665288,0.481627


## Transposing your data:

In [69]:
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-1.213839,0.958704,1.071771,0.367246,-0.482039,0.247592
B,-0.165859,0.339191,0.637111,-0.912911,-0.20616,1.66368
C,-0.395933,-0.743983,2.184695,0.490441,-0.256028,-0.362245
D,0.250304,-1.410221,0.288887,-0.112397,-0.665288,-0.298128
F,-1.447322,0.523833,-0.855488,0.11664,0.481627,0.293684


In [70]:
df.head()

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.213839,-0.165859,-0.395933,0.250304,-1.447322
2013-01-02,0.958704,0.339191,-0.743983,-1.410221,0.523833
2013-01-03,1.071771,0.637111,2.184695,0.288887,-0.855488
2013-01-04,0.367246,-0.912911,0.490441,-0.112397,0.11664
2013-01-05,-0.482039,-0.20616,-0.256028,-0.665288,0.481627


DataFrame.sort_index() sorts by an axis:

In [71]:
df.sort_index(axis=1, ascending=False)

Unnamed: 0,F,D,C,B,A
2013-01-01,-1.447322,0.250304,-0.395933,-0.165859,-1.213839
2013-01-02,0.523833,-1.410221,-0.743983,0.339191,0.958704
2013-01-03,-0.855488,0.288887,2.184695,0.637111,1.071771
2013-01-04,0.11664,-0.112397,0.490441,-0.912911,0.367246
2013-01-05,0.481627,-0.665288,-0.256028,-0.20616,-0.482039
2013-01-06,0.293684,-0.298128,-0.362245,1.66368,0.247592


In [72]:
df.sort_index(axis=0, ascending=True)

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.213839,-0.165859,-0.395933,0.250304,-1.447322
2013-01-02,0.958704,0.339191,-0.743983,-1.410221,0.523833
2013-01-03,1.071771,0.637111,2.184695,0.288887,-0.855488
2013-01-04,0.367246,-0.912911,0.490441,-0.112397,0.11664
2013-01-05,-0.482039,-0.20616,-0.256028,-0.665288,0.481627
2013-01-06,0.247592,1.66368,-0.362245,-0.298128,0.293684


DataFrame.sort_values() sorts by values:

In [73]:
df.sort_values(by="B")

Unnamed: 0,A,B,C,D,F
2013-01-04,0.367246,-0.912911,0.490441,-0.112397,0.11664
2013-01-05,-0.482039,-0.20616,-0.256028,-0.665288,0.481627
2013-01-01,-1.213839,-0.165859,-0.395933,0.250304,-1.447322
2013-01-02,0.958704,0.339191,-0.743983,-1.410221,0.523833
2013-01-03,1.071771,0.637111,2.184695,0.288887,-0.855488
2013-01-06,0.247592,1.66368,-0.362245,-0.298128,0.293684


# Getitem ([])

In [74]:
df[["A", "B", "C"]]

Unnamed: 0,A,B,C
2013-01-01,-1.213839,-0.165859,-0.395933
2013-01-02,0.958704,0.339191,-0.743983
2013-01-03,1.071771,0.637111,2.184695
2013-01-04,0.367246,-0.912911,0.490441
2013-01-05,-0.482039,-0.20616,-0.256028
2013-01-06,0.247592,1.66368,-0.362245


In [75]:
df["A"]

2013-01-01   -1.213839
2013-01-02    0.958704
2013-01-03    1.071771
2013-01-04    0.367246
2013-01-05   -0.482039
2013-01-06    0.247592
Freq: D, Name: A, dtype: float64

DataFrame Slice : selects matching rows:

In [76]:
df[0:4]

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.213839,-0.165859,-0.395933,0.250304,-1.447322
2013-01-02,0.958704,0.339191,-0.743983,-1.410221,0.523833
2013-01-03,1.071771,0.637111,2.184695,0.288887,-0.855488
2013-01-04,0.367246,-0.912911,0.490441,-0.112397,0.11664


In [77]:
df["20130102":"20130104"]

Unnamed: 0,A,B,C,D,F
2013-01-02,0.958704,0.339191,-0.743983,-1.410221,0.523833
2013-01-03,1.071771,0.637111,2.184695,0.288887,-0.855488
2013-01-04,0.367246,-0.912911,0.490441,-0.112397,0.11664


Selection by label. Selection by Label using DataFrame.loc() or DataFrame.at().

Selecting a row matching a label:

In [78]:
df.loc[dates[0]]

A   -1.213839
B   -0.165859
C   -0.395933
D    0.250304
F   -1.447322
Name: 2013-01-01 00:00:00, dtype: float64

Selecting all rows (:) with a select column labels:

In [79]:
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-1.213839,-0.165859
2013-01-02,0.958704,0.339191
2013-01-03,1.071771,0.637111
2013-01-04,0.367246,-0.912911
2013-01-05,-0.482039,-0.20616
2013-01-06,0.247592,1.66368


For label slicing, both endpoints are included:

In [80]:
df.loc["20130102":"20130104", ["A", "B"]]

Unnamed: 0,A,B
2013-01-02,0.958704,0.339191
2013-01-03,1.071771,0.637111
2013-01-04,0.367246,-0.912911


Selecting a single row and column label returns a scalar:

In [81]:
df.loc[dates[0], "A"]

-1.2138387708135214

Selecting a single row and column label returns a scalar:

In [82]:
df.at[dates[1], "A"]

0.95870414736906

# Selection by position

In [83]:
df.iloc[0:4]

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.213839,-0.165859,-0.395933,0.250304,-1.447322
2013-01-02,0.958704,0.339191,-0.743983,-1.410221,0.523833
2013-01-03,1.071771,0.637111,2.184695,0.288887,-0.855488
2013-01-04,0.367246,-0.912911,0.490441,-0.112397,0.11664


In [84]:
df.iloc[3]

A    0.367246
B   -0.912911
C    0.490441
D   -0.112397
F    0.116640
Name: 2013-01-04 00:00:00, dtype: float64

In [85]:
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,0.367246,-0.912911
2013-01-05,-0.482039,-0.20616


Lists of integer position locations:

In [86]:
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,0.958704,-0.743983
2013-01-03,1.071771,2.184695
2013-01-05,-0.482039,-0.256028


For slicing rows explicitly:

In [87]:
df.iloc[1:3, :]

Unnamed: 0,A,B,C,D,F
2013-01-02,0.958704,0.339191,-0.743983,-1.410221,0.523833
2013-01-03,1.071771,0.637111,2.184695,0.288887,-0.855488


For slicing columns explicitly:

In [88]:
df.iloc[:, 1:3]

Unnamed: 0,B,C
2013-01-01,-0.165859,-0.395933
2013-01-02,0.339191,-0.743983
2013-01-03,0.637111,2.184695
2013-01-04,-0.912911,0.490441
2013-01-05,-0.20616,-0.256028
2013-01-06,1.66368,-0.362245


For getting a value explicitly:

In [89]:
df.iat[1, 1]

0.3391905244649914

In [90]:
df.iat[1, 2]

-0.7439830132355628

# Boolean indexing
Select rows where df.A is greater than 0

In [91]:
df[df["A"] > 0]

Unnamed: 0,A,B,C,D,F
2013-01-02,0.958704,0.339191,-0.743983,-1.410221,0.523833
2013-01-03,1.071771,0.637111,2.184695,0.288887,-0.855488
2013-01-04,0.367246,-0.912911,0.490441,-0.112397,0.11664
2013-01-06,0.247592,1.66368,-0.362245,-0.298128,0.293684


Selecting values from a DataFrame where a boolean condition is met:

In [92]:
df[df > 0.5]

Unnamed: 0,A,B,C,D,F
2013-01-01,,,,,
2013-01-02,0.958704,,,,0.523833
2013-01-03,1.071771,0.637111,2.184695,,
2013-01-04,,,,,
2013-01-05,,,,,
2013-01-06,,1.66368,,,


Using isin() method for filtering:

In [93]:
dfcopy = df.copy()
dfcopy

Unnamed: 0,A,B,C,D,F
2013-01-01,-1.213839,-0.165859,-0.395933,0.250304,-1.447322
2013-01-02,0.958704,0.339191,-0.743983,-1.410221,0.523833
2013-01-03,1.071771,0.637111,2.184695,0.288887,-0.855488
2013-01-04,0.367246,-0.912911,0.490441,-0.112397,0.11664
2013-01-05,-0.482039,-0.20616,-0.256028,-0.665288,0.481627
2013-01-06,0.247592,1.66368,-0.362245,-0.298128,0.293684


In [94]:
dfcopy = df.copy()
dfcopy["E"] = ["one", "one", "two", "three", "four", "three"]
dfcopy

Unnamed: 0,A,B,C,D,F,E
2013-01-01,-1.213839,-0.165859,-0.395933,0.250304,-1.447322,one
2013-01-02,0.958704,0.339191,-0.743983,-1.410221,0.523833,one
2013-01-03,1.071771,0.637111,2.184695,0.288887,-0.855488,two
2013-01-04,0.367246,-0.912911,0.490441,-0.112397,0.11664,three
2013-01-05,-0.482039,-0.20616,-0.256028,-0.665288,0.481627,four
2013-01-06,0.247592,1.66368,-0.362245,-0.298128,0.293684,three


In [95]:
dfcopy[dfcopy["E"].isin(["one", "two", "four"])]


Unnamed: 0,A,B,C,D,F,E
2013-01-01,-1.213839,-0.165859,-0.395933,0.250304,-1.447322,one
2013-01-02,0.958704,0.339191,-0.743983,-1.410221,0.523833,one
2013-01-03,1.071771,0.637111,2.184695,0.288887,-0.855488,two
2013-01-05,-0.482039,-0.20616,-0.256028,-0.665288,0.481627,four


Setting a new column automatically aligns the data by the indexes:

In [100]:
s0 = pd.Series(["A", "B", "C", "D", "E", "F", "G"], index=pd.date_range("20130102", periods=7))
s0

2013-01-02    A
2013-01-03    B
2013-01-04    C
2013-01-05    D
2013-01-06    E
2013-01-07    F
2013-01-08    G
Freq: D, dtype: object

In [96]:
s1 = pd.Series(
    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    index=pd.date_range("20130102", periods=10))

s1

2013-01-02     1
2013-01-03     2
2013-01-04     3
2013-01-05     4
2013-01-06     5
2013-01-07     6
2013-01-08     7
2013-01-09     8
2013-01-10     9
2013-01-11    10
Freq: D, dtype: int64