### 10 Minutes to Pandas
#https://pandas.pydata.org/pandas-docs/stable/10min.html

In [1]:
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt

In [2]:
s = pd.Series([1,3,5,np.nan,6,8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

Create a DataFrame by passing a NumPy array

In [3]:
dates = pd.date_range('20130101', periods = 6)

In [7]:
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [9]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))

In [10]:
df

Unnamed: 0,A,B,C,D
2013-01-01,0.420363,-1.314682,0.349653,1.449061
2013-01-02,-0.751299,-0.159527,-0.020367,-0.751867
2013-01-03,1.032483,-1.880599,-0.697905,0.793472
2013-01-04,-0.897532,-0.008707,0.434167,0.422348
2013-01-05,-2.226383,0.716004,0.053147,-0.307766
2013-01-06,0.076379,0.508477,0.203105,-0.424092


Creating a DataFrame by passing a dict of objects that can be converted to series-like.

In [11]:
df2 = pd.DataFrame({'A': 1,
                    'B': pd.Timestamp('20130102'),
                    'C': pd.Series(1,index=list(range(4)),dtype='float32'),
                    'D': np.array([3] * 4,dtype='int32'),
                    'E': pd.Categorical(['test','train','test','train']),
                    'F': 'foo'})

In [12]:
df2

Unnamed: 0,A,B,C,D,E,F
0,1,2013-01-02,1.0,3,test,foo
1,1,2013-01-02,1.0,3,train,foo
2,1,2013-01-02,1.0,3,test,foo
3,1,2013-01-02,1.0,3,train,foo


In [13]:
df2.dtypes

A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [14]:
df.dtypes

A    float64
B    float64
C    float64
D    float64
dtype: object

In [16]:
df2.<TAB>

SyntaxError: invalid syntax (<ipython-input-16-915637deb483>, line 1)

### Viewing Data

In [22]:
df.head() #shows first five records

Unnamed: 0,A,B,C,D
2013-01-01,0.420363,-1.314682,0.349653,1.449061
2013-01-02,-0.751299,-0.159527,-0.020367,-0.751867
2013-01-03,1.032483,-1.880599,-0.697905,0.793472
2013-01-04,-0.897532,-0.008707,0.434167,0.422348
2013-01-05,-2.226383,0.716004,0.053147,-0.307766


In [23]:
df.tail() #shows last five records

Unnamed: 0,A,B,C,D
2013-01-02,-0.751299,-0.159527,-0.020367,-0.751867
2013-01-03,1.032483,-1.880599,-0.697905,0.793472
2013-01-04,-0.897532,-0.008707,0.434167,0.422348
2013-01-05,-2.226383,0.716004,0.053147,-0.307766
2013-01-06,0.076379,0.508477,0.203105,-0.424092


In [26]:
df.index # shows index field

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [27]:
df.columns # shows columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [28]:
df.values # shows values

array([[ 0.42036276, -1.31468213,  0.34965264,  1.4490608 ],
       [-0.75129867, -0.15952684, -0.02036685, -0.75186726],
       [ 1.03248327, -1.88059907, -0.69790482,  0.79347221],
       [-0.89753224, -0.00870663,  0.43416676,  0.42234819],
       [-2.22638337,  0.7160038 ,  0.05314696, -0.307766  ],
       [ 0.07637859,  0.50847732,  0.20310487, -0.42409151]])

#### describe() shows a quick statistic summary of your data:

In [29]:
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.390998,-0.356506,0.053633,0.196859
std,1.153957,1.029485,0.406217,0.838464
min,-2.226383,-1.880599,-0.697905,-0.751867
25%,-0.860974,-1.025893,-0.001988,-0.39501
50%,-0.33746,-0.084117,0.128126,0.057291
75%,0.334367,0.379181,0.313016,0.700691
max,1.032483,0.716004,0.434167,1.449061


#### Transposing your data:

In [30]:
df.T

Unnamed: 0,2013-01-01 00:00:00,2013-01-02 00:00:00,2013-01-03 00:00:00,2013-01-04 00:00:00,2013-01-05 00:00:00,2013-01-06 00:00:00
A,0.420363,-0.751299,1.032483,-0.897532,-2.226383,0.076379
B,-1.314682,-0.159527,-1.880599,-0.008707,0.716004,0.508477
C,0.349653,-0.020367,-0.697905,0.434167,0.053147,0.203105
D,1.449061,-0.751867,0.793472,0.422348,-0.307766,-0.424092


#### Sorting by an axis

In [33]:
df.sort_index(axis=1, ascending=True)

Unnamed: 0,A,B,C,D
2013-01-01,0.420363,-1.314682,0.349653,1.449061
2013-01-02,-0.751299,-0.159527,-0.020367,-0.751867
2013-01-03,1.032483,-1.880599,-0.697905,0.793472
2013-01-04,-0.897532,-0.008707,0.434167,0.422348
2013-01-05,-2.226383,0.716004,0.053147,-0.307766
2013-01-06,0.076379,0.508477,0.203105,-0.424092


In [36]:
# sort by axis 1
df.sort_index(axis=1, ascending=False)

Unnamed: 0,D,C,B,A
2013-01-01,1.449061,0.349653,-1.314682,0.420363
2013-01-02,-0.751867,-0.020367,-0.159527,-0.751299
2013-01-03,0.793472,-0.697905,-1.880599,1.032483
2013-01-04,0.422348,0.434167,-0.008707,-0.897532
2013-01-05,-0.307766,0.053147,0.716004,-2.226383
2013-01-06,-0.424092,0.203105,0.508477,0.076379


In [37]:
# sort by axis 0
df.sort_index(axis=0, ascending=False)

Unnamed: 0,A,B,C,D
2013-01-06,0.076379,0.508477,0.203105,-0.424092
2013-01-05,-2.226383,0.716004,0.053147,-0.307766
2013-01-04,-0.897532,-0.008707,0.434167,0.422348
2013-01-03,1.032483,-1.880599,-0.697905,0.793472
2013-01-02,-0.751299,-0.159527,-0.020367,-0.751867
2013-01-01,0.420363,-1.314682,0.349653,1.449061


In [38]:
# sort by values
df.sort_values(by='B')

Unnamed: 0,A,B,C,D
2013-01-03,1.032483,-1.880599,-0.697905,0.793472
2013-01-01,0.420363,-1.314682,0.349653,1.449061
2013-01-02,-0.751299,-0.159527,-0.020367,-0.751867
2013-01-04,-0.897532,-0.008707,0.434167,0.422348
2013-01-06,0.076379,0.508477,0.203105,-0.424092
2013-01-05,-2.226383,0.716004,0.053147,-0.307766


### Selection

In [40]:
# Getting
## yields a Series, equivalent to df.A:
df['A']

2013-01-01    0.420363
2013-01-02   -0.751299
2013-01-03    1.032483
2013-01-04   -0.897532
2013-01-05   -2.226383
2013-01-06    0.076379
Freq: D, Name: A, dtype: float64

select via slicing

In [41]:
df[1:4]

Unnamed: 0,A,B,C,D
2013-01-02,-0.751299,-0.159527,-0.020367,-0.751867
2013-01-03,1.032483,-1.880599,-0.697905,0.793472
2013-01-04,-0.897532,-0.008707,0.434167,0.422348


#### Selection by label

In [42]:
df.loc[dates[0]]

A    0.420363
B   -1.314682
C    0.349653
D    1.449061
Name: 2013-01-01 00:00:00, dtype: float64

In [44]:
df.loc[:,['A','B']]

Unnamed: 0,A,B
2013-01-01,0.420363,-1.314682
2013-01-02,-0.751299,-0.159527
2013-01-03,1.032483,-1.880599
2013-01-04,-0.897532,-0.008707
2013-01-05,-2.226383,0.716004
2013-01-06,0.076379,0.508477


In [45]:
df.loc['20130102':'20130104',['A','B']]

Unnamed: 0,A,B
2013-01-02,-0.751299,-0.159527
2013-01-03,1.032483,-1.880599
2013-01-04,-0.897532,-0.008707


In [46]:
df.loc['20130102',['A','B']]

A   -0.751299
B   -0.159527
Name: 2013-01-02 00:00:00, dtype: float64

In [47]:
df.loc[dates[0],'A']

0.4203627620643398

In [48]:
df.at[dates[0],'A']

0.4203627620643398

#### Selection by Position

In [49]:
df.iloc[3]

A   -0.897532
B   -0.008707
C    0.434167
D    0.422348
Name: 2013-01-04 00:00:00, dtype: float64