In [1]:
import pandas as pd
import numpy as np

In [2]:
def make_df(cols, idx):
    """Quickly build a small labelled DataFrame for demonstrations.

    Each cell holds the column label followed by the row label, so the
    column ``'A'`` at row ``0`` contains the string ``'A0'``.

    Parameters
    ----------
    cols : iterable
        Column labels; a string such as ``'ABC'`` yields one column per
        character.
    idx : iterable
        Row labels. It is traversed once per column and then passed on as
        the index of the resulting frame.

    Returns
    -------
    pandas.DataFrame
        Frame with one string entry per (row label, column label) pair.
    """
    columns = {}
    for label in cols:
        columns[label] = [str(label) + str(row) for row in idx]
    return pd.DataFrame(data=columns, index=idx)

# example dataframe: columns A, B, C with row labels 0, 1, 2
make_df('ABC', range(3))

Unnamed: 0,A,B,C
0,A0,B0,C0
1,A1,B1,C1
2,A2,B2,C2


In [3]:
class display(object):
    """Display HTML representation of multiple objects.

    Each positional argument is a *string* containing a Python expression
    (for example ``'df1'`` or ``'pd.concat([df1, df2])'``). When the
    instance is rendered, every expression is evaluated and shown under
    its own source text, so inputs and results can be compared easily.

    Parameters
    ----------
    *args : str
        Expressions to evaluate and display, in order.

    Notes
    -----
    The expressions are run with ``eval``, which executes arbitrary code.
    Only pass expression strings written by the notebook author, never
    text that comes from an untrusted source.
    """
    # HTML snippet for one entry: {0} is the expression text, {1} its rendering.
    template = """<div style="float: left; padding: 10px;">
    <p style='font-family:"Courier New", Courier, monospace'>{0}</p>{1}
    </div>"""

    def __init__(self, *args):
        self.args = args

    @staticmethod
    def _evaluate(expr):
        """Evaluate ``expr`` against the module-level namespace only."""
        # Passing globals() explicitly keeps this class's own local names
        # (loop variables, ``self``) from shadowing a user variable with
        # the same name, e.g. a DataFrame called ``a``.
        return eval(expr, globals())

    def _repr_html_(self):
        """Return the HTML for all expressions, one template block each."""
        # Imported locally so the class does not depend on a module-level
        # name that another notebook cell could rebind.
        from html import escape

        blocks = []
        for expr in self.args:
            obj = self._evaluate(expr)
            to_html = getattr(obj, '_repr_html_', None)
            body = to_html() if callable(to_html) else None
            if body is None:
                # No rich representation available (e.g. a Series or a
                # plain Python object): fall back to the escaped text repr.
                body = '<pre>{0}</pre>'.format(escape(repr(obj)))
            # Escape <, > and & so the expression text cannot break the markup.
            blocks.append(self.template.format(escape(expr, quote=False), body))
        return '\n'.join(blocks)

    def __repr__(self):
        """Return the plain-text form: each expression followed by its repr."""
        return '\n\n'.join(expr + '\n' + repr(self._evaluate(expr))
                           for expr in self.args)
    

# Recall: Concatenation of NumPy Arrays

In [4]:
# three equal-length Python lists to experiment with
x, y, z = [1, 2, 3], [4, 5, 6], [7, 8, 9]

# Question: How would you combine them horizontally? Vertically?

# Simple Concatenation with `pd.concat`

In [5]:
# Signature of pd.concat in Pandas v0.18, shown for reference only.
# It is kept as a comment because `objs` is a placeholder rather than a
# defined variable: executing the call as written raises
# "NameError: name 'objs' is not defined" and stops a Run All.
# Later pandas releases changed this signature (e.g. `join_axes` was
# removed in pandas 1.0), so check the documentation of your version.
#
# pd.concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False,
#           keys=None, levels=None, names=None, verify_integrity=False, copy=True)

In [6]:
# two Series whose integer indices do not overlap
ser1 = pd.Series(data=list('ABC'), index=[1, 2, 3])
ser2 = pd.Series(data=list('DEF'), index=[4, 5, 6])
pd.concat([ser1, ser2])

1    A
2    B
3    C
4    D
5    E
6    F
dtype: object

In [7]:
# two frames with the same columns (A, B) but different row labels
df1, df2 = make_df('AB', [1,2]), make_df('AB', [3,4])
display('df1', 'df2', 'pd.concat([df1, df2])')

# Question: What is the default axis?

Unnamed: 0,A,B
1,A1,B1
2,A2,B2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4


In [8]:
# two frames with the same row labels (0, 1) but different columns
df3, df4 = make_df('AB', [0,1]), make_df('CD', [0,1])

# Question: How do you concatenate them horizontally?


## Duplicate indices

In [9]:
x, y = make_df('AB', [0,1]), make_df('AB', [2,3])
# make duplicate indices: y takes over x's row labels but keeps its own values
y.index = x.index
display('x', 'y', 'pd.concat([x,y])')

# Question: Is this what we wanted?

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
0,A2,B2
1,A3,B3

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
0,A2,B2
1,A3,B3


### Catching the repeats as an error

In [10]:
# ask pd.concat to verify that the row labels of the result do not overlap;
# the resulting ValueError is caught and printed so the notebook keeps running
try:
    pd.concat([x, y], verify_integrity=True)
except ValueError as err:
    print("ValueError:", err)

ValueError: Indexes have overlapping values: Int64Index([0, 1], dtype='int64')


### Ignoring the index

In [11]:
# with ignore_index=True the original row labels are discarded and the
# concatenated result gets a new integer index (0 to 3 here)
display('x', 'y', 'pd.concat([x,y], ignore_index=True)')

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
0,A2,B2
1,A3,B3

Unnamed: 0,A,B
0,A0,B0
1,A1,B1
2,A2,B2
3,A3,B3


### Adding MultiIndex keys

In [12]:
# rebind y to a frame with row labels 1, 2, 3 (x has 0, 1, so label 1 is shared)
y = make_df('AB', [1,2,3])

In [13]:
# keys=['x','y'] tags the rows of each input, producing a two-level index
# whose outer level records which frame a row came from
display('x', 'y', "pd.concat([x,y], keys=['x','y'])")

Unnamed: 0,A,B
0,A0,B0
1,A1,B1

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3

Unnamed: 0,Unnamed: 1,A,B
x,0,A0,B0
x,1,A1,B1
y,1,A1,B1
y,2,A2,B2
y,3,A3,B3


## Concatenation with joins

Dealing with different sets of column names

In [14]:
# two frames whose column sets only partly overlap (B and C are shared)
df5, df6 = make_df('ABC', [1,2]), make_df('BCD', [3,4])
display('df5', 'df6', 'pd.concat([df5, df6])')

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,A,B,C,D
1,A1,B1,C1,
2,A2,B2,C2,
3,,B3,C3,D3
4,,B4,C4,D4


By default, entries for which no data is available are filled with NA values. To change this, we can set the `join` and `join_axes` parameters of `pd.concat` (note that `join_axes` belongs to the pandas v0.18 signature and was removed in pandas 1.0). By default, the join is a *union* of the input columns (`join='outer'`), but we can change this to an *intersection* of the columns using `join='inner'`:

In [15]:
# Compare with the previous output: what is the difference?
display('df5', 'df6', "pd.concat([df5, df6], join='inner')")

Unnamed: 0,A,B,C
1,A1,B1,C1
2,A2,B2,C2

Unnamed: 0,B,C,D
3,B3,C3,D3
4,B4,C4,D4

Unnamed: 0,B,C
1,B1,C1
2,B2,C2
3,B3,C3
4,B4,C4


## The `append()` method

In [18]:
# Question: Is this in place, i.e. does it modify df1?
# NOTE(review): `DataFrame.append` was deprecated in pandas 1.4 and removed
# in pandas 2.0; on newer versions use `pd.concat([df1, df2])` instead --
# confirm which pandas version this notebook targets.
display('df1', 'df2', 'df1.append(df2)')

Unnamed: 0,A,B
1,A1,B1
2,A2,B2

Unnamed: 0,A,B
3,A3,B3
4,A4,B4

Unnamed: 0,A,B
1,A1,B1
2,A2,B2
3,A3,B3
4,A4,B4
