# Pandas: A Python Library for Data Analysis

Notebook Author: Matthew Kearns

### The reference material for this notebook can be found in the pandas.pydata.org tutorial: http://pandas.pydata.org/pandas-docs/stable/10min.html

Notebook Contents:

    - Object creation
    - Viewing data
    - Selection
        - Getting
        - Selection by label
        - Selection by position
        - Boolean indexing
        - Setting
    - Missing data
    - Operations
        - Stats
        - Apply
        - Histogramming
        - String methods
    - Merge
        - Concat
        - Join
        - Append
    - Grouping
    - Reshaping
        - Stack
        - Pivot tables
    - Time series
    - Categoricals
    - Plotting
    - Getting data in/out
        - CSV
        - HDF5
        - Excel
    - Gotchas


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Object creation

In [2]:
# creating a Series by passing a list of values
squares = pd.Series([(x+1)**2 for x in range(10)])
squares

0      1
1      4
2      9
3     16
4     25
5     36
6     49
7     64
8     81
9    100
dtype: int64

In [3]:
# creating a data frame by passing a NumPy array and datetime index
dates = pd.date_range('20180101', periods=365)

In [4]:
dates

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08',
               '2018-01-09', '2018-01-10',
               ...
               '2018-12-22', '2018-12-23', '2018-12-24', '2018-12-25',
               '2018-12-26', '2018-12-27', '2018-12-28', '2018-12-29',
               '2018-12-30', '2018-12-31'],
              dtype='datetime64[ns]', length=365, freq='D')

In [5]:
df = pd.DataFrame(np.random.randn(365, 5), index=dates, columns=list('ABCDE'))

In [7]:
df.head()

Unnamed: 0,A,B,C,D,E
2018-01-01,1.904261,0.668598,0.633009,0.804681,0.01053
2018-01-02,-0.136654,-0.405667,0.660185,-0.536618,-0.173108
2018-01-03,1.172189,0.819352,0.010252,0.973735,1.618968
2018-01-04,-0.290405,-0.484918,0.887774,0.685885,-0.046122
2018-01-05,0.591977,-0.935458,-0.772485,0.420939,1.194874


In [8]:
# creating a data frame with heterogeneous columns
df = pd.DataFrame({'A': pd.Timestamp('20180607'),
                   'B': np.array([1, 2, 3]), 
                   'C': pd.Categorical(['Monday', 'Tuesday', 'Wednesday']), 
                   'D': 'foo'})
df.dtypes

A    datetime64[ns]
B             int32
C          category
D            object
dtype: object

### Viewing data

In [17]:
# viewing the top and bottom rows of a data frame
dates = pd.date_range('20180101', periods=7)

df = pd.DataFrame(np.random.randn(7, 5), index=dates, columns=list('ABCDE'))

print('\ndf.head():\n', df.head())
print('\ndf.tail():\n', df.tail())


df.head():
                    A         B         C         D         E
2018-01-01  0.147916 -0.037493  0.060506 -0.381192 -1.888366
2018-01-02 -0.134965  0.029424 -0.954688  0.082766  0.693828
2018-01-03 -1.394146 -0.596004  1.962957 -0.149231 -0.046938
2018-01-04  0.815478  0.178928  0.018747  0.767455 -1.076740
2018-01-05 -0.398978  1.463726 -0.995175  0.165350 -2.179177

df.tail():
                    A         B         C         D         E
2018-01-03 -1.394146 -0.596004  1.962957 -0.149231 -0.046938
2018-01-04  0.815478  0.178928  0.018747  0.767455 -1.076740
2018-01-05 -0.398978  1.463726 -0.995175  0.165350 -2.179177
2018-01-06 -1.188608  0.401013  0.086184  1.060617 -1.700875
2018-01-07 -2.278527 -0.413172  0.803698  1.847506 -0.627094


In [18]:
# displaying the index, columns, and underlying NumPy data
df.index

DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
               '2018-01-05', '2018-01-06', '2018-01-07'],
              dtype='datetime64[ns]', freq='D')

In [19]:
df.columns

Index(['A', 'B', 'C', 'D', 'E'], dtype='object')

In [20]:
df.values

array([[ 0.1479163 , -0.03749284,  0.06050551, -0.38119195, -1.88836615],
       [-0.13496508,  0.02942439, -0.95468821,  0.0827655 ,  0.69382753],
       [-1.39414635, -0.59600413,  1.96295726, -0.14923061, -0.04693848],
       [ 0.81547785,  0.17892775,  0.01874695,  0.7674545 , -1.07674005],
       [-0.39897775,  1.46372571, -0.99517481,  0.16534961, -2.17917698],
       [-1.18860772,  0.40101304,  0.08618361,  1.06061689, -1.70087527],
       [-2.27852672, -0.41317181,  0.80369759,  1.84750646, -0.62709396]])

In [21]:
# quick statistics summary of the data
df.describe()

Unnamed: 0,A,B,C,D,E
count,7.0,7.0,7.0,7.0,7.0
mean,-0.633118,0.146632,0.140318,0.484753,-0.975052
std,1.049327,0.672435,1.022623,0.783586,1.047922
min,-2.278527,-0.596004,-0.995175,-0.381192,-2.179177
25%,-1.291377,-0.225332,-0.467971,-0.033233,-1.794621
50%,-0.398978,0.029424,0.060506,0.16535,-1.07674
75%,0.006476,0.28997,0.444941,0.914036,-0.337016
max,0.815478,1.463726,1.962957,1.847506,0.693828


In [22]:
# the transpose of the data
df.T

Unnamed: 0,2018-01-01 00:00:00,2018-01-02 00:00:00,2018-01-03 00:00:00,2018-01-04 00:00:00,2018-01-05 00:00:00,2018-01-06 00:00:00,2018-01-07 00:00:00
A,0.147916,-0.134965,-1.394146,0.815478,-0.398978,-1.188608,-2.278527
B,-0.037493,0.029424,-0.596004,0.178928,1.463726,0.401013,-0.413172
C,0.060506,-0.954688,1.962957,0.018747,-0.995175,0.086184,0.803698
D,-0.381192,0.082766,-0.149231,0.767455,0.16535,1.060617,1.847506
E,-1.888366,0.693828,-0.046938,-1.07674,-2.179177,-1.700875,-0.627094


In [23]:
# sorting by axis
df.sort_index(axis=1, ascending=False)

Unnamed: 0,E,D,C,B,A
2018-01-01,-1.888366,-0.381192,0.060506,-0.037493,0.147916
2018-01-02,0.693828,0.082766,-0.954688,0.029424,-0.134965
2018-01-03,-0.046938,-0.149231,1.962957,-0.596004,-1.394146
2018-01-04,-1.07674,0.767455,0.018747,0.178928,0.815478
2018-01-05,-2.179177,0.16535,-0.995175,1.463726,-0.398978
2018-01-06,-1.700875,1.060617,0.086184,0.401013,-1.188608
2018-01-07,-0.627094,1.847506,0.803698,-0.413172,-2.278527


In [24]:
# sorting by values
df.sort_values(by='A')

Unnamed: 0,A,B,C,D,E
2018-01-07,-2.278527,-0.413172,0.803698,1.847506,-0.627094
2018-01-03,-1.394146,-0.596004,1.962957,-0.149231,-0.046938
2018-01-06,-1.188608,0.401013,0.086184,1.060617,-1.700875
2018-01-05,-0.398978,1.463726,-0.995175,0.16535,-2.179177
2018-01-02,-0.134965,0.029424,-0.954688,0.082766,0.693828
2018-01-01,0.147916,-0.037493,0.060506,-0.381192,-1.888366
2018-01-04,0.815478,0.178928,0.018747,0.767455,-1.07674


### Selection

###### Getting