# Pandas `merge`, `join`, `concatenate`, and `reshaping` <a class="tocSkip">

In [None]:
from pandas import DataFrame
from pandas import Series
import pandas as pd
from numpy.random import randint
from numpy.random import randn
from numpy.random import randint
import numpy as np

## Merging Data

This is basically a join operation on Pandas DataFrames.

- The working DataFrame

In [None]:
# Left-hand table for the merge demos: keys 'a' and 'b' repeat, 'c' occurs once.
df1 = DataFrame({
    'key1': ['b', 'a', 'c', 'a', 'a', 'b'],
    'data1': randint(low=1, high=11, size=6),   # random ints in [1, 10]
})
print(df1)

# Right-hand table: one row per key; 'd' has no partner in df1, 'b' is absent.
df2 = DataFrame({
    'key1': ['a', 'c', 'd'],
    'data2': randint(low=1, high=11, size=3),   # random ints in [1, 10]
})
print(df2)

- `merge` with inner join

In [None]:
# Inner join on 'key1': only keys found in both df1 and df2 survive.
df_inner1 = df1.merge(df2, how='inner', on='key1')
df_inner1

- `merge` with outer join

In [None]:
# Outer join on 'key1': keys from either table are kept; gaps become NaN.
df_outer1 = df1.merge(df2, how='outer', on='key1')
df_outer1

- `merge` with left join

In [None]:
# Left join on 'key1': every row of df1 is kept; unmatched df2 columns become NaN.
df_left1 = df1.merge(df2, how='left', on='key1')
df_left1

- `merge` with right join

In [None]:
# Right join on 'key1': every row of df2 is kept; unmatched df1 columns become NaN.
df_right1 = df1.merge(df2, how='right', on='key1')
df_right1

- `merge` with multiple keys

- The key columns may have different names in the two DataFrames; use `left_on` and `right_on` to name the key on each side

In [None]:
# Two tables whose join keys live in differently named columns
# ('key4' on the left, 'key5' on the right).
df4 = DataFrame({'key4': list('bacaab'),
                 'data4': randint(1, 11, 6)})
# Show the frame that was just built (previously this printed df1 by mistake).
print(df4)
df5 = DataFrame({'key5': list('acd'),
                 'data5': randint(1, 11, 3)})
# Show the frame that was just built (previously this printed df2 by mistake).
print(df5)

In [None]:
# Outer join where the key column is 'key4' on the left and 'key5' on the right.
df6 = df4.merge(df5, how='outer', left_on='key4', right_on='key5')
print(df6)

- Many to many join
    - Many to many means the same key multiple times on both DataFrames
    - This creates a Cartesian product
   

In [None]:
# Left table: 'a' appears three times, 'b' and 'c' twice each.
df7 = DataFrame({
    'key1': ['b', 'a', 'c', 'c', 'a', 'a', 'b'],
    'data1': randint(low=1, high=11, size=7),
})
print(df7)

# Right table: 'a' and 'c' appear twice each, 'd' once.
df8 = DataFrame({
    'key1': ['a', 'c', 'd', 'a', 'c'],
    'data2': randint(low=1, high=11, size=5),
})
print(df8)

In [None]:
# Many-to-many outer join: each left row pairs with every right row sharing its key.
df9 = df7.merge(df8, how='outer', on='key1')
print(df9)

# Same result with fully identical rows collapsed.
print('\nTo see non-duplicates')
print(df9.drop_duplicates())

- **NOTICE:** There are six entries for `a` and four for `c` just as there should be for a Cartesian product

## Merging on Index

- The index can be used as the key by adding `left_index` and `right_index`

In [None]:
# Row-level data whose 'key1' column will be matched against another frame's index.
df10 = DataFrame({
    'key1': ['b', 'a', 'c', 'c', 'a', 'a', 'b'],
    'data1': randint(low=1, high=11, size=7),
})
print(df10)

# Group-level data keyed by its index labels ('a' and 'b'); there is no row for 'c'.
df11 = DataFrame({'group_data': randint(low=1, high=11, size=2)}, index=['a', 'b'])
print(df11)

In [None]:
# Match df10's 'key1' column against df11's index labels (default inner join).
df12 = df10.merge(df11, left_on='key1', right_index=True)
print(df12)

### Concatenating Along an Axis

#### Series No Overlap

In [None]:
# Three Series whose index labels do not overlap.
s1 = Series({'a': 1, 'b': 2})
s2 = Series({'c': 3, 'd': 4, 'e': 5})
s3 = Series({'f': 6, 'g': 7})

# Stack them end to end along the row axis.
s4 = pd.concat([s1, s2, s3], axis='index')

# One Series per block of output, separated by newlines.
print('\n'.join(str(series) for series in (s1, s2, s3, s4)))

In [None]:
# Place the three Series side by side as columns; labels missing from a Series give NaN.
df13 = pd.concat([s1, s2, s3], join='outer', axis='columns')
print(df13)

- With `axis = 1` the Series are concatenated side by side, each one becoming a column
- Each Series is converted to an (x, 1) DataFrame (x rows, one column)
- The DataFrames are expanded to share the same row index, with NaN for missing data
- The DataFrames are concatenated
- `join='outer'` is the default

#### Specifying Column Headers

In [None]:
# Same column-wise concatenation, with `keys` supplying the column headers.
column_names = ['one', 'two', 'three']
df14 = pd.concat([s1, s2, s3], keys=column_names, join='outer', axis='columns')
print(df14)

#### Series Inner Join

In [None]:
# Rebuild s4 from s1 and s3 only, stacked along the row axis (the default axis).
s4 = pd.concat(objs=[s1, s3], axis=0)

In [None]:
# Show s4 so its index labels can be compared with those of s1.
print(s4)

In [None]:
# Show s1 so its index labels can be compared with those of s4.
print(s1)

In [None]:
# Column-wise concatenation keeping only index labels present in both s4 and s1.
df15 = pd.concat([s4, s1], join='inner', axis='columns')
print(df15)

- `left` and `right` are not viable in `concat` for Series
- You used to be able to do this with `join_axes`
- Below is a "left" join
```
df16 = pd.concat([s4, s1], axis = 1, join_axes = [['a', 'b', 'f', 'g']])
print(df16)
```
- However, `join_axes` was deprecated in pandas 0.25 and removed in pandas 1.0. Use `.reindex()` or `.reindex_like()` instead.

In [None]:
# Give s1 the same index as s4; labels s1 lacks are filled with NaN.
s5 = s1.reindex_like(other=s4)
print(s5)

# Both inputs now share s4's index, so the inner join keeps every s4 label,
# which is the effect of a "left" join.
df16 = pd.concat([s4, s5], join='inner', axis='columns')
print(df16)

#### DataFrame Example

In [None]:
# 3x2 frame indexed a, b, c.
df17 = DataFrame(
    data=randint(1, 11, size=(3, 2)),
    index=['a', 'b', 'c'],
    columns=['one', 'two'],
)
print(df17)

# 2x2 frame indexed a, c (no row for 'b').
df18 = DataFrame(
    data=randint(1, 11, size=(2, 2)),
    index=['a', 'c'],
    columns=['three', 'four'],
)
print(df18)

# Side-by-side concatenation aligned on the index; row 'b' gets NaN from df18.
df19 = pd.concat([df17, df18], axis='columns')
print(df19)

In [None]:
# Stack df18's rows under df17's; columns that one frame lacks are filled with NaN.
df20 = pd.concat([df17, df18], axis='index')
print(df20)

## Data Transformation

### Removing Duplicates

In [None]:
# Rebuild the many-to-many outer join of df7 and df8 as input for de-duplication.
df9 = df7.merge(df8, how='outer', on='key1')
print(df9)

In [None]:
# drop_duplicates() returns a new frame with fully repeated rows removed;
# df9 itself is not modified.
print('\nTo see non-duplicates')
print(df9.drop_duplicates())

### Using a Function Mapping

- Transform DataFrame, Series based on data in DataFrame, Series and outside data

In [None]:
# Food names deliberately differ only by capitalisation.
food_intake = DataFrame(
    data={
        'food': ['bacon', 'Bacon', 'pastrami', 'Pastrami'],
        'ounces': [2, 4, 3, 6],
    }
)
print(food_intake)

In [None]:
# Lookup table from (lower-case) food name to the animal it comes from.
animal_meat = dict(pastrami='cow', bacon='pig')
print(animal_meat)

#### Method One

In [None]:
# Step 1: apply str.lower to each food name so it matches the lower-case dict keys.
print(food_intake['food'].map(str.lower))

In [None]:
# Step 2: map the lower-cased names through the animal_meat dict;
# Series.map with a dict yields NaN for any name that is not a key.
print(food_intake['food'].map(str.lower).map(animal_meat))

In [None]:
# Step 3: store the looked-up animal as a new 'animal' column (modifies food_intake).
food_intake['animal'] = food_intake['food'].map(str.lower).map(animal_meat)
print(food_intake)

#### Method Two with Lambda

In [None]:
# Same result in a single pass: lower-case each name and index the dict directly.
# NOTE: unlike mapping with the dict itself, animal_meat[...] raises KeyError
# for a food name that is not in the dict instead of producing NaN.
food_intake['animal'] = food_intake['food'].map(lambda x: animal_meat[x.lower()])
print(food_intake)

### Replacing Values

In [None]:
# Small frame with missing values scattered over both columns.
df21 = DataFrame(
    data={
        'A': [np.nan, 10, 20, np.nan],
        'B': [1, 2, np.nan, 4],
    }
)
print(df21)

In [None]:
# New frame in which every NaN has been swapped for 0; df21 is left untouched.
df22 = df21.replace(to_replace=np.nan, value=0)
print(df22)

### Renaming Axis Indexes

In [None]:
# str.split() already returns a list of the whitespace-separated names,
# so no comprehension is needed to build it.
columns = 'dollars pounds lek guilder peso'.split()
columns

In [None]:
# 4x5 frame of random ints in [1, 9] with lower-case row and column labels.
df23 = DataFrame(
    data=randint(1, 10, size=(4, 5)),
    index=['one', 'two', 'three', 'four'],
    columns=['dollars', 'pounds', 'lek', 'guilder', 'peso'],
)
print(df23)

In [None]:
# Upper-case every row label. NOTE: despite its name, df24 holds the mapped
# Index returned by Index.map, not a DataFrame; df23 itself is not changed.
df24 = df23.index.map(str.upper)
print(df24)

In [None]:
# Title-case the column labels on a copy; df23 keeps its original labels.
df25 = df23.rename(mapper=str.title, axis='columns')
print(df25)

In [None]:
# With inplace=True the column labels of df23 itself are title-cased and
# rename() returns None, so the result is not assigned to a new name.
df23.rename(columns = str.title, inplace = True)
print(df23)

## Binning

- Statistical data binning is a way to group a number of more or less continuous values into a smaller number of "bins"

In [None]:
# 20 million random ages in [13, 76] (randint's upper bound is exclusive).
# Convert with astype instead of looping over the array in Python: the values
# drawn are the same, but the conversion stays vectorised.
ages = randint(13, 77, 20000000).astype(np.int32)
print(ages[:10])    # first ten ages
print(ages[-10:])   # last ten ages (a -11:-1 slice would drop the final element)

In [None]:
# Edges of the age brackets; consecutive edges delimit one bin.
bin_points = [18, 25, 35, 50, 65, 77]

# Assign every age to its bracket; ages outside all brackets become NaN.
bins = pd.cut(x=ages, bins=bin_points)
bins

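Note the ISO standard notation for intervals: a square bracket \[ or \] marks an endpoint that is included, and a parenthesis ( or ) marks an endpoint that is excluded. By default `pd.cut` produces right-closed bins such as (18, 25\], which exclude the left edge and include the right edge.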

In [None]:
# Count how many ages fall in each bin. The top-level pd.value_counts() is
# deprecated; wrapping the values in a Series and calling .value_counts()
# is the supported equivalent.
pd.Series(bins).value_counts()

In [None]:
# One label per bin, in the same order as the bin edges.
group_names = ['too_young', 'some_what_old', 'old', 'very_old', 'too_old']
print(group_names)

In [None]:
# Same binning as before, but each bin is reported by its label instead of its interval.
bin_with_names = pd.cut(x=ages, bins=bin_points, labels=group_names)
print(bin_with_names)

In [None]:
# Count how many ages fall under each label. The top-level pd.value_counts()
# is deprecated; use the Series method instead.
pd.Series(bin_with_names).value_counts()

# End of Notebook <a class="tocSkip">