In [1]:
import pandas as pd
import numpy as np

"""
Begun 8/30/2016
Python for Data Analysis - Chapter 7 - Data Wrangling: Clean, Transform, Merge, Reshape

Things to review:

---
Things learned here:


pd.merge(df1, df2) - like a SQL Join - Best used for columns
    Examples
        pd.merge(df1, df2, on='key')
        pd.merge(df1, df2, left_on = 'left_df1_key', right_on = 'right_df2_key')
        pd.merge(df1, df2, how='outer')
    Keys can be lists of column names for joining on multiple columns
    When joining column-by-column, indexes on the passed DataFrame objects are discarded
    ** See Table 7-1 for the merge() function arguments.  
        suffixes for matching column names = ('_left_suffix', '_right_suff')  # A tuple, not a list
        Index Merge - right_index = True or left_index = True if the right or left df should be joined on its index

    MultiIndex merge - Pass a list of columns or specify right_index and left_index

Joining on Indexes - Better done with a Join
    left_df.join(right_df, on='key')
        # Joins are left joins by default; pass how= (e.g. how='outer') for other join types
    left_df.join([right_df, another_df])  
        # A simple way to do index-on-index merges
    Join supports most of the parameters of merge

pd.concat - like a SQL Union - "stacks together objects along an axis"
    pd.concat([df1, df2], axis=0, keys=[...])
    Passed parameters can be Series or DataFrames
        axis = 0 v. axis = 1
        inner
        outer
        more than 2 DFs
        names
    Create a hierarchical index from concatenated data
    ** See Table 7-2 for the concat() function arguments.  

combine_first - "enables splicing together overlapping data to fill in missing values in one object with values from another."
    Like a "coalesce" to pull the first non-null value at that index for the provided column of the Series or DataFrame


Reshaping and pivoting
    stack - columns --> rows
    unstack - rows --> columns
        Think alphabetical order.  [stack / unstack] --> [col-->row / row --> col]
        
    df.stack() - move innermost cols to innermost rows.  "make taller not wider"
    df.unstack() - move innermost rows to innermost cols.  "make wider not taller"
        - Unstack takes a union of all of the values in the row level it's moving and makes them cols
        - Some other row levels may be missing those inner row keys, in which case they get NaN in the resulting (row, col) cell
        
    NaN & Stack
        - When df.stack() will result in NaNs, they will automatically be dropped by default
        - df.stack(dropna=False) will preserve them in the taller DF
    
    df.stack(level='level_name') or df.unstack(level='level_name') will perform the same operation on a specific level
    
    Also works with series equivalently.

Pivoting 
    Pivot is a shortcut for creating a hierarchical index and then reshaping
    It turns "long" data into "wide" data.
    df.set_index(['col1','col2']).unstack('col2')['value'] === df.pivot(index='col1', columns='col2', values='value')
        # The 'values' parameter is optional (pass the arguments by keyword)
    
Data Cleaning and Transformation
    Series.map(obj)
    Series.map(lambda x: obj(x))
    


"""

# Important snippets:

# Concat with DataFrames -- label each input via `keys` and name the resulting column levels via `names`
df1 = pd.DataFrame(np.arange(6).reshape(3, 2),
                   index=list('abc'),
                   columns=['one', 'two'])
df2 = pd.DataFrame(np.arange(4).reshape(2, 2) + 5,
                   index=['a', 'c'],
                   columns=['three', 'four'])
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'], names=['upper', 'lower'])

upper,level1,level1,level2,level2
lower,one,two,three,four
a,0,1,5.0,6.0
b,2,3,,
c,4,5,7.0,8.0


In [None]:
# Merge -- first input frame: a 'key' column with repeated values plus a running counter

df1 = pd.DataFrame({'key': list('bbacaab'),
                    'data1': range(7)})
df1

In [None]:
# Second input frame for the merge examples: one row per key value
df2 = pd.DataFrame({'key': list('abd'),
                    'data2': range(3)})
df2

In [None]:
# With no `on=` argument, merge joins on the column names the two frames have in common
df1.merge(df2)
    # -> joined on the "key" column
    # Keys c and d do not appear in the result because neither is present in both frames

In [None]:
# Naming the join column explicitly is good practice; the result is identical to the implicit form
df1.merge(df2, on='key')

In [None]:
# No shared key name: state the join column of each DataFrame explicitly
df3 = pd.DataFrame({'lkey': list('bbacaab'),
                    'data1': range(7)})
df4 = pd.DataFrame({'rkey': list('abd'),
                    'data2': range(3)})
df3.merge(df4, left_on='lkey', right_on='rkey')
    # By default this is an inner join
    # Keys c and d do not appear in the result because neither is present in both frames

In [None]:
# Outer join: keeps the keys of both frames, i.e. the combination of a left join and a right join
df1.merge(df2, how='outer')

In [None]:
# Many-to-many merges form a Cartesian product of the rows sharing each key
df1 = pd.DataFrame({'key': list('bbacab'),
                    'data1': range(6)})
df2 = pd.DataFrame({'key': list('ababd'),
                    'data2': range(5)})
df1.merge(df2, on='key', how='outer')
    # The outer join keeps a row even when its key is missing from the other frame

In [None]:
# A many-to-many inner join only keeps keys present in both frames -- it is still a Cartesian product per key
df1.merge(df2, on='key', how='inner')

In [None]:
# Joining on multiple column names: pass the key columns as a list
left = pd.DataFrame({
    'key1': ['foo', 'foo', 'bar'],
    'key2': ['one', 'two', 'one'],
    'lval': [1, 2, 3],
})
right = pd.DataFrame({
    'key1': ['foo', 'foo', 'bar', 'bar'],
    'key2': ['one', 'one', 'one', 'two'],
    'rval': [4, 5, 6, 7],
})
left.merge(right, on=['key1', 'key2'], how='outer')

In [None]:
## Merging on Index - Inner Join (Default)
left1 = pd.DataFrame({'key': list('abaabc'),
                      'value': range(6)})
right1 = pd.DataFrame({'group_val': [3.5, 7]}, index=list('ab'))
left1.merge(right1, left_on='key', right_index=True)
    # left_on names the key column of the left frame; right_index=True matches it against the right frame's index

In [None]:
# Merging on Index - Outer Join: same key/index pairing, keeping unmatched rows as well
left1.merge(right1, left_on='key', right_index=True, how='outer')

In [None]:
# Merging on Index - MultiIndex: the left frame carries the keys as columns,
# the right frame carries them as a two-level row index
left_mi = pd.DataFrame({
    'key1': ['Ohio'] * 3 + ['Nevada'] * 2,
    'key2': [2000, 2001, 2002, 2001, 2002],
    'data': np.arange(5, dtype=float),
})
right_mi = pd.DataFrame(
    np.arange(12).reshape(6, 2),
    index=[['Nevada'] * 2 + ['Ohio'] * 4,
           [2001, 2000, 2000, 2000, 2001, 2002]],
    columns=['event1', 'event2'],
)
left_mi

In [None]:
# Display the right-hand frame, whose rows are labelled by a two-level index
right_mi

In [None]:
# Several key columns must be passed as a list; they are matched against the levels of the right frame's index
left_mi.merge(right_mi, left_on=['key1', 'key2'], right_index=True)

# Note that the new dataframe uses the index of the left dataframe

In [None]:
# Merging on the index of both frames requires nothing special - just set left_index=True and right_index=True
left2 = pd.DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                     index=list('ace'),
                     columns=['Ohio', 'Nevada'])
right2 = pd.DataFrame([[7., 8.], [9., 10.], [11., 12.], [13., 14.]],
                      index=list('bcde'),
                      columns=['Missouri', 'Alabama'])
left2.merge(right2, left_index=True, right_index=True, how='outer')

In [None]:
## DataFrame.join is the shorthand for index-on-index merges
left2.join(other=right2, how='outer')

In [None]:
## join can also match a column of the calling frame against the index of the passed frame
left1.join(other=right1, on='key')
    # 'key' is a column of left1 whose values are looked up in the index of right1

In [None]:
# For simple index-on-index merges, join can be fast & simple -- a third frame to join with
another = pd.DataFrame(
    [[7.0, 8.0], [9.0, 10.0], [11.0, 12.0], [16.0, 17.0]],
    index=list('acef'),
    columns=['New York', 'Oregon'],
)
another

In [None]:
# Default join type spelled out: keep the calling frame's index (left join)
left2.join(right2, how='left')

In [None]:
# Joining several frames at once: pass the other frames as a list
left2.join(other=[right2, another])

In [None]:
# Same multi-frame join, keeping the union of all the indexes
left2.join(other=[right2, another], how='outer')

In [None]:
### Concatenation -- a.k.a. Stacking or Binding
# 3x4 integer array used by the NumPy concatenation examples
arr = np.arange(12).reshape(3, 4)
arr

In [None]:
np.concatenate((arr, arr), axis=0)
    # axis=0 is the default: the second array is appended below the first,
    # i.e. the append adds rows.

In [None]:
np.concatenate((arr, arr), axis=1)
    # axis=1: the second array is appended to the right of the first, i.e. the append adds columns.

In [None]:
# Concatenation of Series - the three inputs share no index labels
s1 = pd.Series([0, 1], index=list('ab'))
s2 = pd.Series([2, 3, 4], index=list('cde'))
s3 = pd.Series([5, 6], index=list('fg'))
pd.concat([s1, s2, s3], axis=0)
    # axis=0 is the default: the Series are stacked end to end

In [None]:
# Concatenation of multiple Series on axis=1
pd.concat([s1, s2, s3], axis=1, join='outer')
    # Each Series becomes a column. The indexes don't overlap, so the result index is their union
    # and every column is NaN outside its own labels -- hence the "waterfall" appearance.

In [None]:
# Concatenation along axes - more examples: s1 scaled by 5, followed by s3
s4 = pd.concat([s1.mul(5), s3], axis=0)
s4

In [None]:
# Concatenation with some index values overlapping - default (outer) behaviour
pd.concat([s1, s4], axis=1, join='outer')
    # By default concat keeps every value of its inputs: the result index is the union of the input indexes
    # It does NOT build a Cartesian product; rows are lined up on matching index labels, like an outer join on the index

In [None]:
# Concatenation with some index values overlapping - inner
pd.concat(objs=[s1, s4], join='inner', axis=1)
    # join='inner' keeps only the index labels present in every input

In [None]:
# Concatenation onto an explicitly chosen set of row labels - the labels don't have to exist in the inputs
# `join_axes` was removed from pd.concat in pandas 1.0. Reindexing the concatenated result
# selects and orders the same labels; labels missing from the inputs become all-NaN rows.
pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e'])

In [None]:
# Build a hierarchical index (MultiIndex) while concatenating
result = pd.concat(objs=[s1, s1, s3], axis=0, keys=['one', 'two', 'three'])
result
    # Each key becomes the outer index level for the Series passed in the same position.

In [None]:
### Stacking, Unstacking
# Mnemonic: "stack" sorts before "unstack", and "cols -> rows" sorts before "rows -> cols"
#   Stacking:   column index --> row index
#   Unstacking: row index --> column index

# series.unstack() moves the innermost row-index level (level=-1, the default) into the columns
result.unstack(level=-1)

In [None]:
# When combining along axis=1, the keys become the column headers of the resulting DataFrame
pd.concat(objs=[s1, s2, s3], keys=['one', 'two', 'three'], axis=1)

In [None]:
# The same works with DataFrames -- here the inputs are passed as a list and labelled with `keys`
df1 = pd.DataFrame(np.arange(6).reshape(3, 2),
                   index=list('abc'),
                   columns=['one', 'two'])
df2 = pd.DataFrame(np.arange(4).reshape(2, 2) + 5,
                   index=['a', 'c'],
                   columns=['three', 'four'])
pd.concat([df1, df2], keys=['level1', 'level2'], axis=1)

In [None]:
# Passing a dict instead of a list: the dict keys play the role of `keys`, giving the same output
pd.concat(dict(level1=df1, level2=df2), axis=1)

In [None]:
# `names` labels the levels of the hierarchical column index created by `keys`
pd.concat(
    [df1, df2],
    axis=1,
    keys=['level1', 'level2'],
    names=['upper', 'lower'],
)

In [None]:
# To concatenate while ignoring the input indexes, pass ignore_index=True; the result index is reset
# The random data comes from locally seeded generators so the cell shows the same
# numbers on every run and does not depend on (or disturb) NumPy's global random state.
df1 = pd.DataFrame(np.random.RandomState(0).randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = pd.DataFrame(np.random.RandomState(1).randn(2, 3), columns=['b', 'd', 'a'])
pd.concat([df1, df2], ignore_index=True)
    # Otherwise the rows would keep their original index values (0,1,2) and (0,1)

In [None]:
# combine_first -- combine objects index-by-index with a "coalesce"-style operation
# `a` has gaps (NaN) that the combine_first examples fill in
a = pd.Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
              index=list('fedcba'))
a

In [None]:
# Second Series on the same labels as `a`: 0.0, 1.0, ... with the last element blanked out
b = pd.Series(np.arange(len(a), dtype=np.float64),
              index=['f','e','d','c','b','a'])
# Use .iloc for the positional write: plain b[-1] on a string-labelled index relies on
# pandas' deprecated integer-position fallback.
b.iloc[-1] = np.nan
b

In [None]:
# combine_first example - Series: take values from `a`, falling back to `b` where `a` is NaN
a.combine_first(other=b)

In [None]:
# combine_first example - DataFrame
df1 = pd.DataFrame({
    'a': [1., np.nan, 5., np.nan],
    'b': [np.nan, 2., np.nan, 6.],
    'c': range(2, 18, 4),
})
df2 = pd.DataFrame({
    'a': [5., 4., np.nan, 3., 7.],
    'b': [np.nan, 3., 4., 6., 8.],
})
# df1 has rows 0-3 and df2 has rows 0-4 (default integer indexes)
df1.combine_first(df2)

In [None]:
##### Reshape and Pivot Operations
# Small frame with named row ('state') and column ('number') indexes
data = pd.DataFrame(
    np.arange(6).reshape(2, 3),
    index=pd.Index(['Ohio', 'Colorado'], name='state'),
    columns=pd.Index(['one', 'two', 'three'], name='number'),
)
data

In [None]:
# Two-column frame built from `result` and a copy of it shifted by 5
pd.DataFrame(
    {'left': result, 'right': result.add(5)},
    columns=pd.Index(['left', 'right']),
)
    # Passing the plain list columns=['left', 'right'] instead of pd.Index(...) is equivalent

In [None]:
# df.stack() -- pivots the columns into rows (innermost column level, level=-1, by default).
# "stack" = "make it taller, not wider"
result = data.stack(level=-1)
result

In [None]:
# series.unstack() -- pivots the innermost row-index level into the columns. It is the inverse operation to stack().
# "unstack" = "make it wider, not taller"
result.unstack(level=-1)

In [None]:
# unstack(level='level_name') moves the named level instead of the innermost one (stack accepts `level` the same way)
result.unstack('state')

In [None]:
# Two Series with partially overlapping labels, stacked under the outer keys 'one' and 'two'
s1 = pd.Series([0, 1, 2, 3], index=list('abcd'))
s2 = pd.Series([4, 5, 6], index=list('cde'))
data2 = pd.concat(objs=[s1, s2], keys=['one', 'two'])
    # s1's rows get the outer key 'one', s2's rows the outer key 'two'
data2

In [None]:
data2.unstack(level=-1)
    # The columns are the union of the inner labels of both groups
    # Note that the dtype became float: the missing (row, col) cells are filled with NaN

In [None]:
# df.stack() automatically drops NaNs by default
# NOTE(review): this default depends on the pandas version (newer stack implementations keep NaN rows) -- confirm against the installed version
data2.unstack().stack()

In [None]:
# df.stack(dropna=False) will keep them
# NOTE(review): `dropna` is deprecated in recent pandas releases -- confirm it is still accepted by the installed version
data2.unstack().stack(dropna=False)

In [None]:
#### Long and Wide formats

# Data is imported in a "wide" format
# NOTE(review): the relative path assumes the notebook is run from the directory that contains ch07/ -- confirm
data = pd.read_csv('ch07/macrodata.csv')
# Build a PeriodIndex named 'date' from the file's `year` and `quarter` columns
# NOTE(review): constructing PeriodIndex from year=/quarter= keywords is deprecated in recent pandas -- confirm for the installed version
periods = pd.PeriodIndex(year = data.year, quarter = data.quarter, name='date')
# Rebuild the frame with only the realgdp / infl / unemp columns (column axis named 'item'),
# indexed by the timestamps returned by periods.to_timestamp('D', 'end')
data = pd.DataFrame(data.to_records(), columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'), index=periods.to_timestamp('D','end'))
data
# The transformations keep the data in wide format: they only take a subset of the columns and use the dates as the index.

In [None]:
# Turn it into a stacked ("long") format with one (date, item, value) row per observation
# This is more flexible when the set of items changes, but makes it harder to sum across items
ldata = (
    data.stack()
        .reset_index()
        .rename(columns={0: 'value'})
)
ldata.head(10)

In [None]:
# Turn long data into wide data with df.pivot()
# The arguments are passed by keyword: pandas 2.0 made pivot's parameters keyword-only,
# so the positional form pivot('date', 'item', 'value') raises a TypeError there.
pivoted = ldata.pivot(index='date', columns='item', values='value')
# Equivalent: ldata.pivot(index='date', columns='item')['value']
pivoted.head()

# pivot(index=first, columns=second, values=value)
# --> distinct values in first are used as row labels
# --> distinct values in second are used as column labels
# --> if values is passed, the result is equivalent to ldata.pivot(index='date', columns='item')['value']
# Leaving it out puts "value" as a hierarchical index level on top of the item columns

In [None]:
# Pivot with 2 data columns makes it obvious we're creating a hierarchical index
# The second value column is drawn from a locally seeded generator so the cell is reproducible.
ldata['value2'] = np.random.RandomState(0).randn(len(ldata))
# pivot's parameters are keyword-only from pandas 2.0 on, so index/columns are named explicitly
pivoted = ldata.pivot(index='date', columns='item')
pivoted.head()

# Leaving out `values` keeps value and value2 as a hierarchical index level above the item columns

In [None]:
# Adding the third criterion (`values`) drills down to one element of the hierarchical column index
# Keyword arguments are required: pivot's parameters are keyword-only from pandas 2.0 on
pivoted = ldata.pivot(index='date', columns='item', values='value')
pivoted.head()


In [None]:
# pivot is just a shortcut for set_index followed by a reshape with unstack:
# set a (date, item) MultiIndex on the rows, then move the inner 'item' level into the columns
(
    ldata.set_index(['date', 'item'])
         .unstack('item')['value']
         .head()
)

In [None]:
### Data Filtering, Cleaning, and Transformations

# Small frame containing repeated (k1, k2) rows for the duplicate-handling examples
data = pd.DataFrame({
    'k1': ['one', 'one', 'one', 'two', 'two', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})
data

In [None]:
# df.duplicated() flags each row that repeats a row found higher up
# With keep='first' (the default) the first occurrence of a repeated row is not flagged
data.duplicated(keep='first')

In [None]:
# Drop duplicates either with a Boolean mask or with data.drop_duplicates()
data[~data.duplicated()], data.drop_duplicates()
# Identical output!

In [None]:
# By default, duplicated() / drop_duplicates() treat a row as a duplicate only if all of its column values match
# To look for duplicates in a single column only, pass that column as the subset:

data['v1'] = range(7)
data.drop_duplicates(subset=['k1'])  # Filter on the k1 column only

In [None]:
# df.drop_duplicates(['k1', 'k2'], keep='last') keeps the last row of each duplicate group instead of the first
data.drop_duplicates(subset=['k1', 'k2'], keep='last')
    # This matters because the remaining columns may differ within a group, so we may want the first or the last of their values

In [None]:
## Mapping -- perform some operation based on the values in an array, Series, or column.

# Food names (with inconsistent capitalisation) and their weights in ounces
data = pd.DataFrame({
    'food': ['bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
             'Bacon', 'pastrami', 'honey ham', 'nova lox'],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})
data

In [None]:
# Map for element-wise transformations and data cleaning -- e.g. add a derived column
# Lookup table from (lower-case) food name to animal; each key is listed exactly once
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}
# Normalise the case first (str.lower is passed as a function object, not called),
# then translate each food through the dict
data['animal'] = data['food'].map(str.lower).map(meat_to_animal)
data

In [None]:
# A lambda is equivalent to passing the function name itself to map
# Note the missing () after the functions in the cell above: the function object is passed, not called
# With a lambda you spell out the per-element call yourself; with map(obj) you hand the object to map,
# and that object may be a function or another kind of object, such as a dict.
# Here a single lambda does the lower-casing and the dict lookup in one step
data['food'].map(lambda food: meat_to_animal[food.lower()])

In [None]:
# Replacing Values in a Series
# Each replace() call returns a new Series and leaves `data` unchanged; in a notebook
# only the result of the last call is displayed.
data = pd.Series([1., -999., 2., -999., -1000, 3.])
data.replace(-999, np.nan) # Replace -999 with np.nan
data.replace([-999, -1000], np.nan) # Replace different values with np.nan
data.replace([-999, -1000], [np.nan, 0]) # Use different replacement values
data.replace({-999: np.nan, -1000: 0}) # Pass the 1-to-1 replacements as a dict if you want