In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# A Series is pandas' one-dimensional labelled array. The np.nan entry marks
# a missing value and makes the whole Series float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])

In [3]:
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
s.dtype

dtype('float64')

In [4]:
# Create a DataFrame from a dict: every key becomes a column. Scalar values
# (A, B, G) are repeated for each of the 4 rows given by the array-like columns.
df = pd.DataFrame(
    {
        "A": 1.0,
        "B": pd.Timestamp("20130102"),
        "C": pd.Series(1, index=[0, 1, 2, 3], dtype="float32"),
        "D": np.array([3, 3, 3, 3], dtype="int32"),
        "E": pd.Categorical(["test", "train", "test", "train"]),
        "F": ["one", "one", "two", "three"],
        "G": "foo",
    }
)

In [5]:
df

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,one,foo
1,1.0,2013-01-02,1.0,3,train,one,foo
2,1.0,2013-01-02,1.0,3,test,two,foo
3,1.0,2013-01-02,1.0,3,train,three,foo


In [8]:
df.E.cat.codes

0    0
1    1
2    0
3    1
dtype: int8

In [6]:
df.dtypes

A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
G            object
dtype: object

In [8]:
# Basics

In [9]:
df.head(2)

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,one,foo
1,1.0,2013-01-02,1.0,3,train,one,foo


In [10]:
df.tail(3)

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2013-01-02,1.0,3,train,one,foo
2,1.0,2013-01-02,1.0,3,test,two,foo
3,1.0,2013-01-02,1.0,3,train,three,foo


In [11]:
df.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [12]:
# Get one column

In [13]:
df['A']

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64

In [14]:
df.A

0    1.0
1    1.0
2    1.0
3    1.0
Name: A, dtype: float64

In [9]:
# Slicing with [] selects rows, just like slicing a list:
# df[:2] returns the first two rows.
df[:2]

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,one,foo
1,1.0,2013-01-02,1.0,3,train,one,foo


In [16]:
df.loc[0:2, ['A']]

Unnamed: 0,A
0,1.0
1,1.0
2,1.0


In [10]:
df.loc[:, ["A", "B"]]

Unnamed: 0,A,B
0,1.0,2013-01-02
1,1.0,2013-01-02
2,1.0,2013-01-02
3,1.0,2013-01-02


In [13]:
cols = ["A", "B"]
df[cols]

Unnamed: 0,A,B
0,1.0,2013-01-02
1,1.0,2013-01-02
2,1.0,2013-01-02
3,1.0,2013-01-02


In [18]:
# Selection by position (iloc); often more error-prone than selecting by label

In [19]:
df.iloc[:3, :2]

Unnamed: 0,A,B
0,1.0,2013-01-02
1,1.0,2013-01-02
2,1.0,2013-01-02


In [20]:
# Unlike R, a negative index does not drop rows or columns; as with Python lists, it counts from the end.

In [11]:
# Get last row
df[-1:]

Unnamed: 0,A,B,C,D,E,F,G
3,1.0,2013-01-02,1.0,3,train,three,foo


In [22]:
# Get last row using tail()
df.tail(1)

Unnamed: 0,A,B,C,D,E,F,G
3,1.0,2013-01-02,1.0,3,train,three,foo


In [23]:
# Get last 2 columns
df.iloc[:,-2:]

Unnamed: 0,F,G
0,one,foo
1,one,foo
2,two,foo
3,three,foo


In [24]:
# Access a single scalar by integer position: row 1, column 2 (column C)
df.iat[1,2]

1.0

In [25]:
# Boolean indexing

In [26]:
df[df.A > 0]

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,one,foo
1,1.0,2013-01-02,1.0,3,train,one,foo
2,1.0,2013-01-02,1.0,3,test,two,foo
3,1.0,2013-01-02,1.0,3,train,three,foo


In [27]:
df[(df.A > 0) & (df.E == 'train')]

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2013-01-02,1.0,3,train,one,foo
3,1.0,2013-01-02,1.0,3,train,three,foo


In [14]:
bool1 = df.A > 0
bool2 = df.E == 'train'
df[bool1 & bool2]

Unnamed: 0,A,B,C,D,E,F,G
1,1.0,2013-01-02,1.0,3,train,one,foo
3,1.0,2013-01-02,1.0,3,train,three,foo


In [28]:
# Also possible but more verbose: select the rows by integer position.
# np.where(cond) returns a tuple with one index array per dimension, so take
# element [0] to hand iloc a plain 1-D array of the matching row positions.
inx = np.where(df.E == 'test')[0]
df.iloc[inx]

Unnamed: 0,A,B,C,D,E,F,G
0,1.0,2013-01-02,1.0,3,test,one,foo
2,1.0,2013-01-02,1.0,3,test,two,foo


In [15]:
# Build a DatetimeIndex of four consecutive calendar days from 2018-01-01
dates = pd.date_range(start='20180101', periods=4, freq='D')

In [16]:
dates

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04'], dtype='datetime64[ns]', freq='D')

In [17]:
# Setting
df['date'] = dates

In [18]:
df

Unnamed: 0,A,B,C,D,E,F,G,date
0,1.0,2013-01-02,1.0,3,test,one,foo,2018-01-01
1,1.0,2013-01-02,1.0,3,train,one,foo,2018-01-02
2,1.0,2013-01-02,1.0,3,test,two,foo,2018-01-03
3,1.0,2013-01-02,1.0,3,train,three,foo,2018-01-04


In [19]:
# Plain assignment does not create a copy: df2 becomes a second name for the
# same DataFrame object, so changes made through df are visible through df2.
df2 = df

In [20]:
df2

Unnamed: 0,A,B,C,D,E,F,G,date
0,1.0,2013-01-02,1.0,3,test,one,foo,2018-01-01
1,1.0,2013-01-02,1.0,3,train,one,foo,2018-01-02
2,1.0,2013-01-02,1.0,3,test,two,foo,2018-01-03
3,1.0,2013-01-02,1.0,3,train,three,foo,2018-01-04


In [21]:
df['H'] = 'baz'

In [22]:
df

Unnamed: 0,A,B,C,D,E,F,G,date,H
0,1.0,2013-01-02,1.0,3,test,one,foo,2018-01-01,baz
1,1.0,2013-01-02,1.0,3,train,one,foo,2018-01-02,baz
2,1.0,2013-01-02,1.0,3,test,two,foo,2018-01-03,baz
3,1.0,2013-01-02,1.0,3,train,three,foo,2018-01-04,baz


In [23]:
df2

Unnamed: 0,A,B,C,D,E,F,G,date,H
0,1.0,2013-01-02,1.0,3,test,one,foo,2018-01-01,baz
1,1.0,2013-01-02,1.0,3,train,one,foo,2018-01-02,baz
2,1.0,2013-01-02,1.0,3,test,two,foo,2018-01-03,baz
3,1.0,2013-01-02,1.0,3,train,three,foo,2018-01-04,baz


In [24]:
# Create an explicit copy: df2 is now independent of later changes to df
df2 = df.copy()

In [25]:
df['J'] = 'zaz'

In [26]:
# Has no J column
df2

Unnamed: 0,A,B,C,D,E,F,G,date,H
0,1.0,2013-01-02,1.0,3,test,one,foo,2018-01-01,baz
1,1.0,2013-01-02,1.0,3,train,one,foo,2018-01-02,baz
2,1.0,2013-01-02,1.0,3,test,two,foo,2018-01-03,baz
3,1.0,2013-01-02,1.0,3,train,three,foo,2018-01-04,baz


In [27]:
df

Unnamed: 0,A,B,C,D,E,F,G,date,H,J
0,1.0,2013-01-02,1.0,3,test,one,foo,2018-01-01,baz,zaz
1,1.0,2013-01-02,1.0,3,train,one,foo,2018-01-02,baz,zaz
2,1.0,2013-01-02,1.0,3,test,two,foo,2018-01-03,baz,zaz
3,1.0,2013-01-02,1.0,3,train,three,foo,2018-01-04,baz,zaz


In [28]:
# 10 rows x 4 columns of standard-normal draws. This rebinds `df`, so the
# example frame built earlier is no longer reachable under that name.
# NOTE(review): no random seed is set, so the values differ on every run.
df = pd.DataFrame(data=np.random.randn(10, 4))

In [29]:
df

Unnamed: 0,0,1,2,3
0,-0.375114,-0.75714,0.438139,1.063348
1,0.678378,-0.702553,0.842154,0.071476
2,0.048625,-0.940971,0.269366,2.526167
3,-1.698351,0.114846,1.570141,-0.378996
4,0.766658,-0.107647,-0.001265,-0.758371
5,0.374611,-1.299619,0.620896,-1.306438
6,-0.028997,0.621155,-1.294431,-0.074853
7,0.355255,1.485345,-0.008332,0.747507
8,-1.018134,0.771222,-0.956756,-2.190845
9,-1.40504,-0.565388,-1.984435,-0.776766


In [30]:
# Apply a function to every column: with axis=0 the function is called once
# per column and the results are collected into a Series.
df.apply(np.mean, axis=0)

0   -0.230211
1   -0.138075
2   -0.050452
3   -0.107777
dtype: float64

In [34]:
def var_range(x):
    """Return the spread of ``x``: its largest value minus its smallest."""
    lowest = np.min(x)
    highest = np.max(x)
    return highest - lowest

df.apply(var_range, axis=0)

0    2.465009
1    2.784965
2    3.554576
3    4.717012
dtype: float64

In [35]:
df.apply(lambda x: np.max(x) - np.min(x), axis=0)

0    2.465009
1    2.784965
2    3.554576
3    4.717012
dtype: float64

In [54]:
x = df.iloc[:,0]

def standardize(x, ddof=0):
    """Centre ``x`` on zero and scale it by its standard deviation.

    Parameters
    ----------
    x : array-like
        Numeric values to standardize, e.g. a pandas Series or a NumPy array.
    ddof : int, default 0
        Delta degrees of freedom passed to ``np.std``. The default of 0
        (population standard deviation) keeps the original behaviour. Pass
        ``ddof=1`` to scale by the sample standard deviation, which is what
        ``Series.std()`` reports by default; otherwise the result's
        ``.std()`` comes out as sqrt(n / (n - 1)) rather than exactly 1.

    Returns
    -------
    The z-scores of ``x``, i.e. ``(x - mean) / std``, with the same shape
    as ``x``.
    """
    return (x - np.mean(x)) / np.std(x, ddof=ddof)

df['std0'] = standardize(df.iloc[:,0])

In [55]:
df

Unnamed: 0,0,1,2,3,std0
0,-0.375114,-0.75714,0.438139,1.063348,-0.175416
1,0.678378,-0.702553,0.842154,0.071476,1.099921
2,0.048625,-0.940971,0.269366,2.526167,0.337554
3,-1.698351,0.114846,1.570141,-0.378996,-1.777303
4,0.766658,-0.107647,-0.001265,-0.758371,1.206791
5,0.374611,-1.299619,0.620896,-1.306438,0.732187
6,-0.028997,0.621155,-1.294431,-0.074853,0.243585
7,0.355255,1.485345,-0.008332,0.747507,0.708754
8,-1.018134,0.771222,-0.956756,-2.190845,-0.953845
9,-1.40504,-0.565388,-1.984435,-0.776766,-1.422226


In [56]:
df.std0.mean()

0.0

In [57]:
df.std0.std()

1.0540925533894598

In [58]:
# Frequency of each distinct value in column F
df2['F'].value_counts()

one      2
three    1
two      1
Name: F, dtype: int64

In [61]:
df2.F.values

array(['one', 'one', 'two', 'three'], dtype=object)

In [62]:
list(df2.F.values)

['one', 'one', 'two', 'three']