# 10 Minutes to Pandas

In [1]:
import numpy as np
import pandas as pd

## Object Creation
make a series

In [2]:
s = pd.Series([1,3,5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

make a Data Frame

In [3]:
dates = pd.date_range('2013-01-01', periods=6)
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [4]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns = list("ABCD"))
df

Unnamed: 0,A,B,C,D
2013-01-01,1.170761,1.391132,-0.937193,0.283856
2013-01-02,-0.714319,-0.4254,-0.309822,-0.034145
2013-01-03,1.747487,0.488112,0.09436,0.850443
2013-01-04,2.021094,-0.728123,-1.014076,0.771039
2013-01-05,-0.278288,-0.327722,2.422124,0.278236
2013-01-06,-0.361363,0.696996,-0.500446,-0.936646


make a DataFrame with a dictionary

In [5]:
df2 = pd.DataFrame(
    {
        "A": (1, 2, 3, 4), #works with [] or ()
        "B": pd.Timestamp("2013-01-02"),
        "C": pd.Series(3, index=list(range(4)), dtype='float32'), #list of 4 3's
        "D": np.array([2]*4, dtype = 'int32'),
        "E": pd.Categorical(['test', 'train','test','trains']),
        "F": 'foo',
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1,2013-01-02,3.0,2,test,foo
1,2,2013-01-02,3.0,2,train,foo
2,3,2013-01-02,3.0,2,test,foo
3,4,2013-01-02,3.0,2,trains,foo


In [6]:
df2.dtypes  #colums have different dtypes

A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [7]:
df2.E

0      test
1     train
2      test
3    trains
Name: E, dtype: category
Categories (3, object): ['test', 'train', 'trains']

## Viewing data

In [8]:
df.head(2)

Unnamed: 0,A,B,C,D
2013-01-01,1.170761,1.391132,-0.937193,0.283856
2013-01-02,-0.714319,-0.4254,-0.309822,-0.034145


In [9]:
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,2.021094,-0.728123,-1.014076,0.771039
2013-01-05,-0.278288,-0.327722,2.422124,0.278236
2013-01-06,-0.361363,0.696996,-0.500446,-0.936646


In [10]:
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [11]:
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [12]:
df.to_numpy()  #will find the numpy dtype that can hold all dtypes of the dataframe
               #may end up casting all to object
               #does not include index or column labels

array([[ 1.17076108,  1.39113194, -0.93719292,  0.28385617],
       [-0.71431873, -0.42539994, -0.30982171, -0.03414478],
       [ 1.74748667,  0.48811166,  0.09436006,  0.85044288],
       [ 2.021094  , -0.72812288, -1.01407587,  0.77103889],
       [-0.27828767, -0.3277223 ,  2.42212383,  0.27823638],
       [-0.3613634 ,  0.69699576, -0.50044568, -0.93664623]])

In [13]:
df.describe()  #quick summary

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,0.597562,0.182499,-0.040842,0.202131
std,1.190379,0.809691,1.27426,0.649473
min,-0.714319,-0.728123,-1.014076,-0.936646
25%,-0.340594,-0.400981,-0.828006,0.043951
50%,0.446237,0.080195,-0.405134,0.281046
75%,1.603305,0.644775,-0.006685,0.649243
max,2.021094,1.391132,2.422124,0.850443


In [14]:
df.T  #transpose

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,1.170761,-0.714319,1.747487,2.021094,-0.278288,-0.361363
B,1.391132,-0.4254,0.488112,-0.728123,-0.327722,0.696996
C,-0.937193,-0.309822,0.09436,-1.014076,2.422124,-0.500446
D,0.283856,-0.034145,0.850443,0.771039,0.278236,-0.936646


In [15]:
df.sort_index(axis = 1, ascending = False)

Unnamed: 0,D,C,B,A
2013-01-01,0.283856,-0.937193,1.391132,1.170761
2013-01-02,-0.034145,-0.309822,-0.4254,-0.714319
2013-01-03,0.850443,0.09436,0.488112,1.747487
2013-01-04,0.771039,-1.014076,-0.728123,2.021094
2013-01-05,0.278236,2.422124,-0.327722,-0.278288
2013-01-06,-0.936646,-0.500446,0.696996,-0.361363


In [16]:
df.sort_values(by="C")  #yay, sorting

Unnamed: 0,A,B,C,D
2013-01-04,2.021094,-0.728123,-1.014076,0.771039
2013-01-01,1.170761,1.391132,-0.937193,0.283856
2013-01-06,-0.361363,0.696996,-0.500446,-0.936646
2013-01-02,-0.714319,-0.4254,-0.309822,-0.034145
2013-01-03,1.747487,0.488112,0.09436,0.850443
2013-01-05,-0.278288,-0.327722,2.422124,0.278236


## Selection

In [17]:
df["A"] == df.A

2013-01-01    True
2013-01-02    True
2013-01-03    True
2013-01-04    True
2013-01-05    True
2013-01-06    True
Freq: D, Name: A, dtype: bool

In [18]:
df["A"]

2013-01-01    1.170761
2013-01-02   -0.714319
2013-01-03    1.747487
2013-01-04    2.021094
2013-01-05   -0.278288
2013-01-06   -0.361363
Freq: D, Name: A, dtype: float64

In [19]:
df.A

2013-01-01    1.170761
2013-01-02   -0.714319
2013-01-03    1.747487
2013-01-04    2.021094
2013-01-05   -0.278288
2013-01-06   -0.361363
Freq: D, Name: A, dtype: float64

In [20]:
df[1:4]

Unnamed: 0,A,B,C,D
2013-01-02,-0.714319,-0.4254,-0.309822,-0.034145
2013-01-03,1.747487,0.488112,0.09436,0.850443
2013-01-04,2.021094,-0.728123,-1.014076,0.771039


In [21]:
df['2013-01-01':'2013-01-03']

Unnamed: 0,A,B,C,D
2013-01-01,1.170761,1.391132,-0.937193,0.283856
2013-01-02,-0.714319,-0.4254,-0.309822,-0.034145
2013-01-03,1.747487,0.488112,0.09436,0.850443


In [22]:
df.loc[dates[0]]   #remember dates is the array for the index

A    1.170761
B    1.391132
C   -0.937193
D    0.283856
Name: 2013-01-01 00:00:00, dtype: float64

In [23]:
df.loc[:,["A", "B"]]

Unnamed: 0,A,B
2013-01-01,1.170761,1.391132
2013-01-02,-0.714319,-0.4254
2013-01-03,1.747487,0.488112
2013-01-04,2.021094,-0.728123
2013-01-05,-0.278288,-0.327722
2013-01-06,-0.361363,0.696996


In [24]:
df.loc['20130102':"2013-01-04", ["B", 'C']]  #probably terrible form to do this, but different styles work
      #both endpoints included

Unnamed: 0,B,C
2013-01-02,-0.4254,-0.309822
2013-01-03,0.488112,0.09436
2013-01-04,-0.728123,-1.014076


In [25]:
df.loc["20130102", ["A", "B"]] #dimensions reduced

A   -0.714319
B   -0.425400
Name: 2013-01-02 00:00:00, dtype: float64

In [26]:
df.loc[dates[0], 'A'] #just get the value

1.1707610798971928

In [27]:
df.at[dates[0],'A']  #fast access (why fast?)

1.1707610798971928

### Selection by position

In [28]:
df.iloc[3]

A    2.021094
B   -0.728123
C   -1.014076
D    0.771039
Name: 2013-01-04 00:00:00, dtype: float64

In [29]:
df.iloc[3:5, 1:3] #last endpoint excluded

Unnamed: 0,B,C
2013-01-04,-0.728123,-1.014076
2013-01-05,-0.327722,2.422124


In [30]:
df.iloc[[1,2,4], [0,2]]

Unnamed: 0,A,C
2013-01-02,-0.714319,-0.309822
2013-01-03,1.747487,0.09436
2013-01-05,-0.278288,2.422124


In [31]:
df.iloc[3:5, :] #rows

Unnamed: 0,A,B,C,D
2013-01-04,2.021094,-0.728123,-1.014076,0.771039
2013-01-05,-0.278288,-0.327722,2.422124,0.278236


In [32]:
df.iloc[:, 1:3] #columns

Unnamed: 0,B,C
2013-01-01,1.391132,-0.937193
2013-01-02,-0.4254,-0.309822
2013-01-03,0.488112,0.09436
2013-01-04,-0.728123,-1.014076
2013-01-05,-0.327722,2.422124
2013-01-06,0.696996,-0.500446


In [33]:
%%time
df.iloc[4,2]

Wall time: 0 ns


2.4221238252735

In [34]:
%%time
df.iat[4,2] #fast access  (why fast??)

Wall time: 0 ns


2.4221238252735

## Boolean Indexing

In [35]:
df[df["A"]>0]  #just those rows

Unnamed: 0,A,B,C,D
2013-01-01,1.170761,1.391132,-0.937193,0.283856
2013-01-03,1.747487,0.488112,0.09436,0.850443
2013-01-04,2.021094,-0.728123,-1.014076,0.771039


In [36]:
df[df>0]  #just those values

Unnamed: 0,A,B,C,D
2013-01-01,1.170761,1.391132,,0.283856
2013-01-02,,,,
2013-01-03,1.747487,0.488112,0.09436,0.850443
2013-01-04,2.021094,,,0.771039
2013-01-05,,,2.422124,0.278236
2013-01-06,,0.696996,,


In [37]:
df1 = df.copy()
df1["E"] = ['one','one','two','two','three','four']
df1

Unnamed: 0,A,B,C,D,E
2013-01-01,1.170761,1.391132,-0.937193,0.283856,one
2013-01-02,-0.714319,-0.4254,-0.309822,-0.034145,one
2013-01-03,1.747487,0.488112,0.09436,0.850443,two
2013-01-04,2.021094,-0.728123,-1.014076,0.771039,two
2013-01-05,-0.278288,-0.327722,2.422124,0.278236,three
2013-01-06,-0.361363,0.696996,-0.500446,-0.936646,four


In [38]:
df1[df1['E'].isin(['two', 'four'])]

Unnamed: 0,A,B,C,D,E
2013-01-03,1.747487,0.488112,0.09436,0.850443,two
2013-01-04,2.021094,-0.728123,-1.014076,0.771039,two
2013-01-06,-0.361363,0.696996,-0.500446,-0.936646,four


## Setting 

In [39]:
s1 = pd.Series([1,2,3,4,5,6],index = pd.date_range('20130101',periods = 6))
s1

2013-01-01    1
2013-01-02    2
2013-01-03    3
2013-01-04    4
2013-01-05    5
2013-01-06    6
Freq: D, dtype: int64

In [40]:
df['F'] = s1
df

Unnamed: 0,A,B,C,D,F
2013-01-01,1.170761,1.391132,-0.937193,0.283856,1
2013-01-02,-0.714319,-0.4254,-0.309822,-0.034145,2
2013-01-03,1.747487,0.488112,0.09436,0.850443,3
2013-01-04,2.021094,-0.728123,-1.014076,0.771039,4
2013-01-05,-0.278288,-0.327722,2.422124,0.278236,5
2013-01-06,-0.361363,0.696996,-0.500446,-0.936646,6


In [41]:
df.at[dates[0], "A"] = 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,1.391132,-0.937193,0.283856,1
2013-01-02,-0.714319,-0.4254,-0.309822,-0.034145,2
2013-01-03,1.747487,0.488112,0.09436,0.850443,3
2013-01-04,2.021094,-0.728123,-1.014076,0.771039,4
2013-01-05,-0.278288,-0.327722,2.422124,0.278236,5
2013-01-06,-0.361363,0.696996,-0.500446,-0.936646,6


In [42]:
df.iat[0,1]= 0
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.937193,0.283856,1
2013-01-02,-0.714319,-0.4254,-0.309822,-0.034145,2
2013-01-03,1.747487,0.488112,0.09436,0.850443,3
2013-01-04,2.021094,-0.728123,-1.014076,0.771039,4
2013-01-05,-0.278288,-0.327722,2.422124,0.278236,5
2013-01-06,-0.361363,0.696996,-0.500446,-0.936646,6


In [43]:
df.loc[:,"D"] = np.array([5]*len(df))
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.937193,5,1
2013-01-02,-0.714319,-0.4254,-0.309822,5,2
2013-01-03,1.747487,0.488112,0.09436,5,3
2013-01-04,2.021094,-0.728123,-1.014076,5,4
2013-01-05,-0.278288,-0.327722,2.422124,5,5
2013-01-06,-0.361363,0.696996,-0.500446,5,6


In [44]:
df1 = df.copy()
df1[df1 < 0] = -df1
df1

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,0.937193,5,1
2013-01-02,0.714319,0.4254,0.309822,5,2
2013-01-03,1.747487,0.488112,0.09436,5,3
2013-01-04,2.021094,0.728123,1.014076,5,4
2013-01-05,0.278288,0.327722,2.422124,5,5
2013-01-06,0.361363,0.696996,0.500446,5,6


## Missing data

np.nan values are not included in computations by default

In [48]:
df

Unnamed: 0,A,B,C,D,F
2013-01-01,0.0,0.0,-0.937193,5,1
2013-01-02,-0.714319,-0.4254,-0.309822,5,2
2013-01-03,1.747487,0.488112,0.09436,5,3
2013-01-04,2.021094,-0.728123,-1.014076,5,4
2013-01-05,-0.278288,-0.327722,2.422124,5,5
2013-01-06,-0.361363,0.696996,-0.500446,5,6


In [52]:
df1 = df.reindex(index=dates[0:4], columns = list(df.columns) + ["E"])
df1.loc[dates[0]: dates[1], "E"] = 1

df1[0]['F']


KeyError: 0