# Using Data Packages with Pandas
See the [tableschema-pandas-py](https://github.com/frictionlessdata/tableschema-pandas-py) repository for more information.

In [1]:
from tableschema import Storage
from datapackage import Package
import pandas as pd

In [2]:
# load resources from a data package as Pandas data frames by using datapackage.push_datapackage function:
url = 'https://raw.githubusercontent.com/frictionlessdata/example-data-packages/master/cpi/datapackage.json'

storage = Storage.connect('pandas')
package = Package(url)
package.save(storage=storage)

True

In [3]:
# Storage works as a container for Pandas data frames
# learn more about storage here: https://github.com/frictionlessdata/tableschema-py#storage
storage.buckets

['cpi']

In [4]:
type(storage['cpi'])

pandas.core.frame.DataFrame

In [5]:
# you can now use Pandas functions to work with your data package-turned-data frame
storage['cpi'].head()

Unnamed: 0,Country Name,Country Code,Year,CPI
0,Afghanistan,AFG,2004,63.131893
1,Afghanistan,AFG,2005,71.140974
2,Afghanistan,AFG,2006,76.302178
3,Afghanistan,AFG,2007,82.774807
4,Afghanistan,AFG,2008,108.0666


In [6]:
# for example, let's look at the first value in the 'Year' column
storage['cpi']['Year'][0]

2004

In [7]:
# We can get the data types per column
storage['cpi'].dtypes

Country Name     object
Country Code     object
Year              int64
CPI             float64
dtype: object

In [8]:
# we can get the column names
storage['cpi'].columns

Index(['Country Name', 'Country Code', 'Year', 'CPI'], dtype='object')

In [9]:
# we can sort the data
storage['cpi'].sort_index(axis=0, ascending=False)

Unnamed: 0,Country Name,Country Code,Year,CPI
6935,Zambia,ZMB,2014,130.821971
6934,Zambia,ZMB,2013,121.342732
6933,Zambia,ZMB,2012,113.428087
6932,Zambia,ZMB,2011,106.429397
6931,Zambia,ZMB,2010,100.000000
...,...,...,...,...
4,Afghanistan,AFG,2008,108.066600
3,Afghanistan,AFG,2007,82.774807
2,Afghanistan,AFG,2006,76.302178
1,Afghanistan,AFG,2005,71.140974


In [10]:
# we can set new values
storage['cpi'].at[0,'Year'] = 2002

In [11]:
# viewing the dataframe

storage['cpi']

Unnamed: 0,Country Name,Country Code,Year,CPI
0,Afghanistan,AFG,2002,63.131893
1,Afghanistan,AFG,2005,71.140974
2,Afghanistan,AFG,2006,76.302178
3,Afghanistan,AFG,2007,82.774807
4,Afghanistan,AFG,2008,108.066600
...,...,...,...,...
6931,Zambia,ZMB,2010,100.000000
6932,Zambia,ZMB,2011,106.429397
6933,Zambia,ZMB,2012,113.428087
6934,Zambia,ZMB,2013,121.342732


In [12]:
# now let's pull this data frame back into a data package:

# note, you can use a local file instead of a URL here
packageURL = 'https://raw.githubusercontent.com/frictionlessdata/examples/master/cpi/datapackage.json'

In [13]:
newDatapackage = Package(packageURL, 'country_list', 'pandas', tables={
...     'data': storage['cpi'],
... })

In [14]:
newDatapackage.descriptor['resources']

[{'path': 'data/cpi.csv',
  'name': 'cpi',
  'profile': 'tabular-data-resource',
  'schema': {'fields': [{'name': 'Country Name',
     'type': 'string',
     'format': 'default'},
    {'name': 'Country Code', 'type': 'string', 'format': 'default'},
    {'name': 'Year', 'type': 'year', 'format': 'default'},
    {'name': 'CPI',
     'description': 'CPI (where 2005=100)',
     'type': 'number',
     'format': 'default'}],
   'missingValues': ['']}}]