## Using Pandas DataFrame Structures

In [48]:
import numpy as np
import pandas as pd
from pandas import DataFrame

In [49]:
# Column-oriented sample data: each key becomes a column name and each list
# supplies that column's values, one entry per row.
data = dict(
    state=['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    year=[2000, 2001, 2002, 2001, 2002, 2003],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
)

# Build the DataFrame straight from the dict; rows get the default integer index.
df = pd.DataFrame(data=data)

# Quick inspection helpers. Only the final expression of a cell is rendered,
# so the first three calls are evaluated but their results are not shown.
df.head(n=3)   # first three rows
df.tail(n=2)   # last two rows
df.count()     # number of non-null entries per column
df.describe()  # summary statistics of the numeric columns (this one is rendered)

Unnamed: 0,year,pop
count,6.0,6.0
mean,2001.5,2.55
std,1.048809,0.836062
min,2000.0,1.5
25%,2001.0,1.875
50%,2001.5,2.65
75%,2002.0,3.125
max,2003.0,3.6


In [50]:
# Rebuild the frame with an explicit column order and string row labels.
# 'debt' is not a key of `data`, so that column is created holding only NaN.
df2 = DataFrame(
    data=data,
    index=['oldest', 'old', 'middle', 'recent', 'penultimate', 'latest'],
    columns=['year', 'pop', 'state', 'debt'],
)
df2['year']  # a single column comes back as a Series (evaluated, not rendered)
df2

Unnamed: 0,year,pop,state,debt
oldest,2000,1.5,Ohio,
old,2001,1.7,Ohio,
middle,2002,3.6,Ohio,
recent,2001,2.4,Nevada,
penultimate,2002,2.9,Nevada,
latest,2003,3.2,Nevada,


### Accessing members of a DataFrame

In [51]:
df2.loc['middle'] # .loc selects by index label: this returns the row labelled 'middle' as a Series

year     2002
pop       3.6
state    Ohio
debt      NaN
Name: middle, dtype: object

In [52]:
# We can populate a whole column. Bracket indexing is used for the assignment:
# attribute-style access (df2.debt = ...) only works when the column already
# exists and its name does not clash with a DataFrame attribute or method.
# Each assignment below overwrites the previous one; only the last survives.
df2['debt'] = 26.5               # a scalar is broadcast to every row
df2['debt'] = df2['pop'] * 1000  # a Series computed from another column
df2['debt'] = np.arange(len(df2), dtype=float)  # one float per row: 0.0, 1.0, ...
df2

Unnamed: 0,year,pop,state,debt
oldest,2000,1.5,Ohio,0.0
old,2001,1.7,Ohio,1.0
middle,2002,3.6,Ohio,2.0
recent,2001,2.4,Nevada,3.0
penultimate,2002,2.9,Nevada,4.0
latest,2003,3.2,Nevada,5.0


In [53]:
# Assigning a Series to a column aligns it on the index labels.
vals = pd.Series([-1.2, -1.5, -1.7], index=['oldest', 'latest', 'recent'])
vals  # evaluated but not rendered (only the cell's last expression is shown)
# NB: the assignment replaces the WHOLE debt column, not just the three
# labelled rows; rows whose label is missing from vals become NaN.
df2['debt'] = vals  # bracket assignment, values matched to rows by label
df2['debt']  # CAUTION - the values the other rows held before have been lost


oldest        -1.2
old            NaN
middle         NaN
recent        -1.7
penultimate    NaN
latest        -1.5
Name: debt, dtype: float64

In [54]:
# New columns can be derived from existing ones.
df2['Eastern'] = df2['state'].eq('Ohio')  # boolean flag: True for the Ohio rows
df2  # evaluated but not rendered (only a cell's final expression is shown)
# mini-challenge: derive the debt per population (where it is a number!)
# and store the result in a new column called 'ratio'.
# careful - DataFrame.pop is a method, so the column must be read as df2['pop']
df2['ratio'] = df2['debt'].div(df2['pop'])  # a NaN debt gives a NaN ratio
df2

Unnamed: 0,year,pop,state,debt,Eastern,ratio
oldest,2000,1.5,Ohio,-1.2,True,-0.8
old,2001,1.7,Ohio,,True,
middle,2002,3.6,Ohio,,True,
recent,2001,2.4,Nevada,-1.7,False,-0.708333
penultimate,2002,2.9,Nevada,,False,
latest,2003,3.2,Nevada,-1.5,False,-0.46875


In [62]:
# Missing values (NaN) can be replaced with a value of our choosing.
# Careful: fillna hands back a new DataFrame, so the change only persists
# if the result is captured by assigning it to a name.
df2 = df2.fillna(value=0)
df2

Unnamed: 0,year,pop,state,debt,Eastern,ratio
oldest,2000,1.5,Ohio,-1.2,True,-0.8
old,2001,1.7,Ohio,0.0,True,0.0
middle,2002,3.6,Ohio,0.0,True,0.0
recent,2001,2.4,Nevada,-1.7,False,-0.708333
penultimate,2002,2.9,Nevada,0.0,False,0.0
latest,2003,3.2,Nevada,-1.5,False,-0.46875


In [65]:
# A DataFrame can be transposed: row labels become columns and vice versa.
df2.transpose()  # not persistent - the result is displayed, df2 is not reassigned

Unnamed: 0,oldest,old,middle,recent,penultimate,latest
year,2000,2001,2002,2001,2002,2003
pop,1.5,1.7,3.6,2.4,2.9,3.2
state,Ohio,Ohio,Ohio,Nevada,Nevada,Nevada
debt,-1.2,0.0,0.0,-1.7,0.0,-1.5
Eastern,True,True,True,False,False,False
ratio,-0.8,0.0,0.0,-0.708333,0.0,-0.46875
