# 03.01 Introducing pandas objects

In [2]:
import pandas as pd 
import numpy as np

In numpy we've looked at array and ndarray as the multi-dimensional data structure; In Pandas, we'll have similar data structures: the Series, DataFrame and Index

In [9]:
data1= np.array([0.25,0.5,0.75, 1.0])
type(data1)
data1

array([0.25, 0.5 , 0.75, 1.  ])

In [10]:
data= pd.Series([0.25,0.5,0.75, 1.0])
type(data)
data #Series wraps both sequence of value and sequence of indices

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

In [11]:
data.values #values are simply a familiar array

array([0.25, 0.5 , 0.75, 1.  ])

The essential difference is the presence of the index: numpy array has implicitly defined integer index while pandas series has explicitly defined index associated with the values

### DataFrame as a generalized NumPy array

In [13]:
population_dict = {
                    'California': 38332521,
                   'Texas': 26448193,
                   'New York': 19651127,
                   'Florida': 19552860,
                   'Illinois': 12882135    
}

In [14]:
population = pd.Series(population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

In [15]:
area_dict = {'California': 423967, 'Texas': 695662, 'New York': 141297,
             'Florida': 170312, 'Illinois': 149995}
area = pd.Series(area_dict)

In [16]:
states= pd.DataFrame({'population': population,
                     'area':area})

In [17]:
states

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


### DataFrame as specialized dictionary

In [19]:
states['area']

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

In [20]:
ind = pd.Index([2, 3, 5, 7, 11])

# 03.02 Data Indexing and Selection

In [21]:
import pandas as pd
data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

In [22]:
data['b']

0.5

In [23]:
data['a':'c']

a    0.25
b    0.50
c    0.75
dtype: float64

In [25]:
data[0:3]

a    0.25
b    0.50
c    0.75
dtype: float64

In [28]:
data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
data

1    a
3    b
5    c
dtype: object

In [29]:
data.loc[1]

'a'

In [30]:
data.iloc[1]

'b'

In [31]:
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135


In [33]:
data.area is data['area']

True

In [36]:
data['density']= data['pop']/data['area']

In [38]:
data.loc[data['density']>100, ['pop','density']]

Unnamed: 0,pop,density
New York,19651127,139.076746
Florida,19552860,114.806121


# 03.03 Operation on data in pandas

In [3]:
rng = np.random.RandomState(42)

In [7]:
df = pd.DataFrame(rng.randint(0,10,(3,4)),
                 columns=['A','B','C','D'])
df

Unnamed: 0,A,B,C,D
0,7,2,5,4
1,1,7,5,1
2,4,0,9,5


In [8]:
#ufunc
np.sin(df*np.pi/4)

Unnamed: 0,A,B,C,D
0,-0.7071068,1.0,-0.707107,1.224647e-16
1,0.7071068,-0.707107,-0.707107,0.7071068
2,1.224647e-16,0.0,0.707107,-0.7071068


In [9]:
area = pd.Series({'Alaska': 1723337, 'Texas': 695662,
                  'California': 423967}, name='area')
population = pd.Series({'California': 38332521, 'Texas': 26448193,
                        'New York': 19651127}, name='population')

In [11]:
area.index & population.index

Index(['Texas', 'California'], dtype='object')

In [14]:
# index alignment in dataframe
A = pd.DataFrame(rng.randint(0, 20, (2, 2)),
                 columns=list('AB'))
A


B = pd.DataFrame(rng.randint(0, 10, (3, 3)),
                 columns=list('BAD'))
B

Unnamed: 0,B,A,D
0,6,7,2
1,0,3,1
2,7,3,1


In [15]:
A+B

Unnamed: 0,A,B,D
0,15.0,7.0,
1,22.0,14.0,
2,,,


In [16]:
fill = A.stack().mean()
A.add(B, fill_value=fill)

Unnamed: 0,A,B,D
0,15.0,7.0,12.5
1,22.0,14.0,11.5
2,13.5,17.5,11.5


In [31]:
#test if i can use ufunc to check data difference

C= pd.DataFrame(np.array([[1,2,3],[4,5,6],[7,8,9]]), columns=['A','B','C'])

D= pd.DataFrame(np.array([[1,2,3],[4,5,6],[8,8,9]]), columns=['A','B','C'])

E= pd.DataFrame(C-D)

(C-D).sum(axis=0)

A   -1
B    0
C    0
dtype: int64

In [32]:
vals1 = np.array([1, None, 3, 4])
vals1

array([1, None, 3, 4], dtype=object)

In [34]:
vals1 = np.array([1, 'NaN', 3, 4])
vals1.sum()

TypeError: cannot perform reduce with flexible type

In [36]:
vals1 = np.array([1, np.nan, 3, 4])
vals1.sum()

TypeError: _sum() got an unexpected keyword argument 'skipna'

# 03.04 Null values

In [39]:
data = pd.Series([1, np.nan, 'hello', None])

In [42]:
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [43]:
data.notnull()

0     True
1    False
2     True
3    False
dtype: bool

In [44]:
data.dropna() #drop all rows which any null value is present

0        1
2    hello
dtype: object

In [45]:
data.fillna(0)

0        1
1        0
2    hello
3        0
dtype: object

In [47]:
data.fillna(method='ffill')

0        1
1        1
2    hello
3    hello
dtype: object

# 03.05 Hierarchical Indexing

While pandas does provide Panel and Panel4D objects that natively handle three dimensional and four dimensional data, a far more common pattern in practice is to make sue of hierarchical indexing to incorporate multiple index levels with a single index

In [56]:
index = [('California', 2000), ('California', 2010),
         ('New York', 2000), ('New York', 2010),
         ('Texas', 2000), ('Texas', 2010)]
populations = [33871648, 37253956,
               18976457, 19378102,
               20851820, 25145561]
#tuple based indexing is essentially a rudimentary multi index, MultiIndex 
#contains multiple levels of indexing
pop = pd.Series(populations, index=index)
index = pd.MultiIndex.from_tuples(index, names=('state','year'))
index

MultiIndex([('California', 2000),
            ('California', 2010),
            (  'New York', 2000),
            (  'New York', 2010),
            (     'Texas', 2000),
            (     'Texas', 2010)],
           names=['state', 'year'])

In [57]:
pop=pop.reindex(index)

In [64]:
pop[:,2010]

state
California    37253956
New York      19378102
Texas         25145561
dtype: int64

In [65]:
pop_df = pop.unstack()

state       year
California  2000    33871648
            2010    37253956
New York    2000    18976457
            2010    19378102
Texas       2000    20851820
            2010    25145561
dtype: int64

In [69]:
import os
data_path='C:/Users/miaoxi/Documents/LearnPython/PythonDataScienceHandbook-master/PythonDataScienceHandbook-master/notebooks/data'

population=pd.read_csv(os.path.join(data_path,'state-population.csv'))

In [70]:
population

Unnamed: 0,state/region,ages,year,population
0,AL,under18,2012,1117489.0
1,AL,total,2012,4817528.0
2,AL,under18,2010,1130966.0
3,AL,total,2010,4785570.0
4,AL,under18,2011,1125763.0
...,...,...,...,...
2539,USA,total,2010,309326295.0
2540,USA,under18,2011,73902222.0
2541,USA,total,2011,311582564.0
2542,USA,under18,2012,73708179.0


In [81]:
population.loc[population['state/region']=='USA']
#same as 
population.loc[population['state/region']=='USA',]
#same as
population.loc[population['state/region']=='USA',:]

Unnamed: 0,state/region,ages,year,population
2496,USA,under18,1990,64218512.0
2497,USA,total,1990,249622814.0
2498,USA,total,1991,252980942.0
2499,USA,under18,1991,65313018.0
2500,USA,under18,1992,66509177.0
2501,USA,total,1992,256514231.0
2502,USA,total,1993,259918595.0
2503,USA,under18,1993,67594938.0
2504,USA,under18,1994,68640936.0
2505,USA,total,1994,263125826.0


In [93]:
population1= population.set_index(['state/region','year']).sort_index(1,2)
population1

Unnamed: 0_level_0,Unnamed: 1_level_0,ages,population
state/region,year,Unnamed: 2_level_1,Unnamed: 3_level_1
AL,2012,under18,1117489.0
AL,2012,total,4817528.0
AL,2010,under18,1130966.0
AL,2010,total,4785570.0
AL,2011,under18,1125763.0
...,...,...,...
USA,2010,total,309326295.0
USA,2011,under18,73902222.0
USA,2011,total,311582564.0
USA,2012,under18,73708179.0


In [99]:
population.loc[(population['state/region']=='USA')&
               (population['year']==2012),]

Unnamed: 0,state/region,ages,year,population
2542,USA,under18,2012,73708179.0
2543,USA,total,2012,313873685.0


In [109]:
population1.loc['USA'].unstack()

            year
ages        1990        under18
            1990          total
            1991          total
            1991        under18
            1992        under18
                       ...     
population  2010    3.09326e+08
            2011    7.39022e+07
            2011    3.11583e+08
            2012    7.37082e+07
            2012    3.13874e+08
Length: 96, dtype: object

In [115]:
population1['2010']

KeyError: '2010'

In [107]:
population.loc[(population['state/region']=='USA')&
               (population.year==2012),]

Unnamed: 0,state/region,ages,year,population
2542,USA,under18,2012,73708179.0
2543,USA,total,2012,313873685.0
