In [1]:
#some generic imports

%matplotlib inline 
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import numpy as np
import pandas as pd
from scipy import stats
from scipy import special


#### Function application

To apply your own or another library’s functions to pandas objects, you should be aware of the methods below. The appropriate method to use depends on whether your function expects to operate on an entire `DataFrame` or `Series`, row- or column-wise, or elementwise.

 - Tablewise Function Application: `pipe()`
 - Row or Column-wise Function Application: `apply()`
 - Aggregation API: `agg()` and `transform()`
 - Applying Elementwise Functions: `applymap()`


#### `pipe()`

`DataFrame.pipe(func, *args, **kwargs)`

Use `.pipe` when chaining together functions that expect Series, DataFrames or GroupBy objects. Instead of writing

    func(g(h(df), arg1=a), arg2=b, arg3=c)  

You can write
```
(df.pipe(h)
.pipe(g, arg1=a)
.pipe(func, arg2=b, arg3=c))  
```

If you have a function that takes the data as (say) the second argument, pass a tuple indicating which keyword expects the data. For example, suppose `func` takes its data as `arg2`:

```
(df.pipe(h)
   .pipe(g, arg1=a)
   .pipe((func, 'arg2'), arg1=a, arg3=c))  
```

In [3]:
#interesting example from Pandas docs

import statsmodels.formula.api as sm

bb = pd.read_csv("data/baseball.csv", index_col="id")

In [4]:
bb.sample(5)

Unnamed: 0_level_0,year,stint,team,lg,g,ab,r,h,X2b,X3b,...,rbi,sb,cs,bb,so,ibb,hbp,sh,sf,gidp
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
caminke01,2001,2,ATL,NL,64,171,12,38,9,0,...,16.0,0.0,1.0,21,44.0,1.0,0.0,0.0,1.0,2.0
davalvi01,1964,1,CLE,AL,150,577,64,156,26,2,...,51.0,21.0,11.0,34,77.0,2.0,1.0,4.0,6.0,4.0
ansonca01,1877,1,CHN,NL,59,255,52,86,19,1,...,32.0,,,9,3.0,,,,,
freehbi01,1964,1,DET,AL,144,520,69,156,14,8,...,80.0,5.0,1.0,36,68.0,3.0,8.0,1.0,7.0,8.0
mitchke01,1988,1,SFN,NL,148,505,60,127,25,7,...,80.0,5.0,5.0,48,85.0,7.0,5.0,1.0,7.0,9.0


In [6]:
# Fit an OLS model inside a single method chain; each step receives the frame
# produced by the previous one.
(bb.query("h > 0")  # keep only rows with h > 0 so that np.log(h) below is finite
    .assign(ln_h=lambda df: np.log(df.h))
    # Tuple form of pipe: the frame is handed to sm.ols as its `data=` keyword,
    # while the formula string is forwarded as the first positional argument.
    .pipe((sm.ols, "data"), "hr ~ ln_h + year + g + C(lg)")
    .fit()
    .summary())

0,1,2,3
Dep. Variable:,hr,R-squared:,0.458
Model:,OLS,Adj. R-squared:,0.458
Method:,Least Squares,F-statistic:,1926.0
Date:,"Fri, 11 Feb 2022",Prob (F-statistic):,0.0
Time:,12:10:58,Log-Likelihood:,-60863.0
No. Observations:,18236,AIC:,121700.0
Df Residuals:,18227,BIC:,121800.0
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-132.5821,3.090,-42.906,0.000,-138.639,-126.525
C(lg)[T.AL],-0.7862,0.541,-1.454,0.146,-1.846,0.274
C(lg)[T.FL],-1.0497,1.266,-0.829,0.407,-3.530,1.431
C(lg)[T.NL],-1.1781,0.539,-2.187,0.029,-2.234,-0.122
C(lg)[T.PL],0.1840,1.313,0.140,0.889,-2.390,2.758
C(lg)[T.UA],2.4496,2.628,0.932,0.351,-2.701,7.600
ln_h,0.4191,0.071,5.886,0.000,0.280,0.559
year,0.0663,0.002,41.513,0.000,0.063,0.069
g,0.1028,0.002,48.636,0.000,0.099,0.107

0,1,2,3
Omnibus:,6196.996,Durbin-Watson:,1.907
Prob(Omnibus):,0.0,Jarque-Bera (JB):,30705.738
Skew:,1.574,Prob(JB):,0.0
Kurtosis:,8.523,Cond. No.,120000.0


#### Row or column-wise function application

Arbitrary functions can be applied along the axes of a DataFrame using the `apply()` method, which, like the descriptive statistics methods, takes an optional axis argument:

In [8]:
# Draw the demo integers (0-9) from a seeded generator so that this frame, and
# every result derived from it in the cells below, is identical on each
# Restart & Run All.
df = pd.DataFrame(
    np.random.default_rng(0).integers(10, size=(5, 3)),
    columns=list("abc"),
)
df

Unnamed: 0,a,b,c
0,1,1,2
1,1,4,3
2,2,0,1
3,8,7,0
4,8,5,9


In [11]:
df.apply('mean') #The apply() method will also dispatch on a string method name.

a    4.0
b    3.4
c    3.0
dtype: float64

In [9]:
df.apply(np.mean, axis = 1)

0    1.333333
1    2.666667
2    1.000000
3    5.000000
4    7.333333
dtype: float64

In [10]:

df.apply(lambda x: x.max() - x.min())

a    7
b    7
c    9
dtype: int64

In [12]:
# 1000 daily rows of standard-normal noise. The generator is seeded so the
# frame is the same after every kernel restart.
tsdf = pd.DataFrame(
    np.random.default_rng(1).standard_normal((1000, 3)),
    columns=["A", "B", "C"],
    index=pd.date_range("1/1/2000", periods=1000),
)

# random_state pins which rows are previewed, so the displayed sample is
# reproducible as well.
tsdf.sample(10, random_state=1)

Unnamed: 0,A,B,C
2000-06-29,0.71975,-0.318276,-0.340285
2002-03-04,0.947739,0.256004,-0.256581
2000-08-07,1.13251,0.181683,1.74162
2000-03-31,1.266796,-0.23436,-0.056377
2002-09-04,-1.28578,-0.357419,-1.005903
2002-02-26,0.42947,0.435401,-1.634053
2001-02-01,-2.352943,0.925379,0.325769
2001-05-14,-0.485298,-1.514045,-0.211075
2001-05-04,-0.1986,-1.726664,0.580908
2002-08-13,-0.461348,-0.347306,1.208834


In [13]:
tsdf.apply(lambda x: x.idxmax())

A   2000-07-02
B   2001-10-23
C   2001-06-18
dtype: datetime64[ns]

In [17]:
tsdf[tsdf['A'] == tsdf['A'].max()]

Unnamed: 0,A,B,C
2000-07-02,2.806975,-0.648051,-0.255058


You may also pass additional arguments and keyword arguments to the `apply()` method. For instance, consider the following function you would like to apply:

In [22]:
def diffs(x, y=5):
    """Return ``x`` shifted down by ``y``.

    Parameters
    ----------
    x : any object supporting ``x - y``
        The value (for example a scalar or a pandas Series) to subtract from.
    y : optional
        The amount to subtract; defaults to 5.

    Returns
    -------
    The result of ``x - y``.
    """
    shifted = x - y
    return shifted

In [24]:
df.apply(diffs, y = 6)

Unnamed: 0,a,b,c
0,-5,-5,-4
1,-5,-2,-3
2,-4,-6,-5
3,2,1,-6
4,2,-1,3


#### Aggregation API

The aggregation API allows one to express possibly multiple aggregation operations in a single concise way. This API is similar across pandas objects, see `groupby` API, the `window` API, and the `resample` API. The entry point for aggregation is `DataFrame.aggregate()`, or the alias `DataFrame.agg()`.

`agg()` accepts any of the following:

 - a function such as `np.mean`
 - a string naming a method, such as `'sum'`
 - a list of functions and/or strings, such as `['sum', 'mean']`
 - a dict mapping column names to functions, such as `{'A': 'sum', 'B':'mean'}`
 
 

In [28]:
df.agg(np.mean)

a    4.0
b    3.4
c    3.0
dtype: float64

In [29]:
df.agg('sum')

a    20
b    17
c    15
dtype: int64

In [30]:
df.agg(['mean','sum'])

Unnamed: 0,a,b,c
mean,4.0,3.4,3.0
sum,20.0,17.0,15.0


In [32]:
df.agg({'a':'sum','b':'mean'})

a    20.0
b     3.4
dtype: float64

In [33]:
df.agg(["sum", lambda x: x.mean()]) 

Unnamed: 0,a,b,c
sum,20.0,17.0,15.0
<lambda>,4.0,3.4,3.0


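When presented with mixed dtypes that cannot be aggregated, `.agg` will only take the valid aggregations. In the output below, for example, `sum` has no meaning for the datetime column `D`, so that entry is shown as `NaT`.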


In [34]:
mdf = pd.DataFrame(
    {
        "A": [1, 2, 3],
        "B": [1.0, 2.0, 3.0],
        "C": ["foo", "bar", "baz"],
        "D": pd.date_range("20130101", periods=3),
    })

mdf.agg(["min", "sum"])


Unnamed: 0,A,B,C,D
min,1,1.0,bar,2013-01-01
sum,6,6.0,foobarbaz,NaT


#### Custom describe

With `.agg()` it is possible to easily create a custom describe function, similar to the built-in `describe` function.

In [39]:
# Rebind `tsdf` to a small 10-row frame used by the remaining examples. The
# generator is seeded so the values are the same after every kernel restart.
tsdf = pd.DataFrame(
    np.random.default_rng(2).standard_normal((10, 3)),
    columns=["A", "B", "C"],
    index=pd.date_range("1/1/2000", periods=10),
)

# Blank out rows 3-6 so the examples below show how missing values are handled.
tsdf.iloc[3:7] = np.nan

# random_state pins which rows are previewed.
tsdf.sample(3, random_state=2)

Unnamed: 0,A,B,C
2000-01-06,,,
2000-01-09,0.577796,-1.473343,-0.614003
2000-01-10,-1.02804,-0.911955,0.396333


In [36]:
from functools import partial

# Series.quantile with q pre-bound, so each quartile can be passed to agg()
# alongside the string reducers in the list below.
q_25 = partial(pd.Series.quantile, q=0.25)
# The name assigned here is what labels the corresponding row of the result
# (see the "25%" / "75%" rows in the output).
q_25.__name__ = "25%"

q_75 = partial(pd.Series.quantile, q=0.75)
q_75.__name__ = "75%"

# Same statistics, in the same order, as the rows shown in the output table.
tsdf.agg(["count", "mean", "std", "min", q_25, "median", q_75, "max"])

Unnamed: 0,A,B,C
count,6.0,6.0,6.0
mean,0.216294,-0.281645,0.305016
std,0.802575,0.877109,0.854326
min,-1.032122,-1.194835,-0.725437
25%,-0.246627,-0.819721,-0.419717
median,0.410125,-0.511378,0.487467
75%,0.783343,0.028711,0.796062
max,1.067431,1.241485,1.403783


#### Transform API

The `transform()` method returns an object that is indexed the same (same size) as the original. This API allows you to provide multiple operations at the same time rather than one-by-one. Its API is quite similar to the `.agg` API.

In [40]:
tsdf.transform(np.abs)  # equivalent to tsdf.transform('abs')

Unnamed: 0,A,B,C
2000-01-01,1.091125,0.014696,0.847385
2000-01-02,1.791508,1.29278,0.798121
2000-01-03,0.155854,0.636252,1.337663
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.223031,0.075673,1.005082
2000-01-09,0.577796,1.473343,0.614003
2000-01-10,1.02804,0.911955,0.396333


In [44]:
tsdf.transform(lambda x: x.abs())

Unnamed: 0,A,B,C
2000-01-01,1.091125,0.014696,0.847385
2000-01-02,1.791508,1.29278,0.798121
2000-01-03,0.155854,0.636252,1.337663
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.223031,0.075673,1.005082
2000-01-09,0.577796,1.473343,0.614003
2000-01-10,1.02804,0.911955,0.396333


In [45]:
tsdf["A"].transform(np.abs)

2000-01-01    1.091125
2000-01-02    1.791508
2000-01-03    0.155854
2000-01-04         NaN
2000-01-05         NaN
2000-01-06         NaN
2000-01-07         NaN
2000-01-08    1.223031
2000-01-09    0.577796
2000-01-10    1.028040
Freq: D, Name: A, dtype: float64

#### Transform with multiple functions

Passing multiple functions will yield a column MultiIndexed DataFrame. The first level will be the original frame column names; the second level will be the names of the transforming functions.

In [46]:
tsdf.transform([np.abs, lambda x: x + 1])

Unnamed: 0_level_0,A,A,B,B,C,C
Unnamed: 0_level_1,absolute,<lambda>,absolute,<lambda>,absolute,<lambda>
2000-01-01,1.091125,-0.091125,0.014696,0.985304,0.847385,0.152615
2000-01-02,1.791508,-0.791508,1.29278,2.29278,0.798121,1.798121
2000-01-03,0.155854,1.155854,0.636252,0.363748,1.337663,2.337663
2000-01-04,,,,,,
2000-01-05,,,,,,
2000-01-06,,,,,,
2000-01-07,,,,,,
2000-01-08,1.223031,-0.223031,0.075673,1.075673,1.005082,2.005082
2000-01-09,0.577796,1.577796,1.473343,-0.473343,0.614003,0.385997
2000-01-10,1.02804,-0.02804,0.911955,0.088045,0.396333,1.396333


Passing multiple functions to a Series will yield a DataFrame. The resulting column names will be the transforming functions.

In [47]:
tsdf["A"].transform([np.abs, lambda x: x + 1])

Unnamed: 0,absolute,<lambda>
2000-01-01,1.091125,-0.091125
2000-01-02,1.791508,-0.791508
2000-01-03,0.155854,1.155854
2000-01-04,,
2000-01-05,,
2000-01-06,,
2000-01-07,,
2000-01-08,1.223031,-0.223031
2000-01-09,0.577796,1.577796
2000-01-10,1.02804,-0.02804


#### Transforming with a dict

Passing a dict of functions will allow selective transforming per column.

In [48]:
tsdf.transform({"A": np.abs, "B": lambda x: x + 1})

Unnamed: 0,A,B
2000-01-01,1.091125,0.985304
2000-01-02,1.791508,2.29278
2000-01-03,0.155854,0.363748
2000-01-04,,
2000-01-05,,
2000-01-06,,
2000-01-07,,
2000-01-08,1.223031,1.075673
2000-01-09,0.577796,-0.473343
2000-01-10,1.02804,0.088045


Passing a dict of lists will generate a MultiIndexed DataFrame with these selective transforms.

In [49]:
tsdf.transform({"A": np.abs, "B": [lambda x: x + 1, "sqrt"]})

  result = getattr(ufunc, method)(*inputs, **kwargs)


Unnamed: 0_level_0,A,B,B
Unnamed: 0_level_1,absolute,<lambda>,sqrt
2000-01-01,1.091125,0.985304,
2000-01-02,1.791508,2.29278,1.137005
2000-01-03,0.155854,0.363748,
2000-01-04,,,
2000-01-05,,,
2000-01-06,,,
2000-01-07,,,
2000-01-08,1.223031,1.075673,0.275087
2000-01-09,0.577796,-0.473343,
2000-01-10,1.02804,0.088045,


#### Applying elementwise functions

Since not all functions can be vectorized (accept NumPy arrays and return another array or value), the methods `applymap()` on DataFrame and analogously `map()` on Series accept any Python function taking a single value and returning a single value. (From pandas 2.1 onwards, `DataFrame.applymap()` is deprecated in favour of the equivalent `DataFrame.map()`.) For example:

In [51]:
def f(x):
    """Return the number of characters in the string form of ``x``.

    ``x`` is converted with ``str()`` first, so any single value is accepted
    (a missing float value, for instance, is measured as the text ``"nan"``).
    """
    text = str(x)
    return len(text)

In [54]:
tsdf["A"].map(f)

2000-01-01    19
2000-01-02    18
2000-01-03    19
2000-01-04     3
2000-01-05     3
2000-01-06     3
2000-01-07     3
2000-01-08    19
2000-01-09    18
2000-01-10    19
Freq: D, Name: A, dtype: int64

In [55]:
# DataFrame.applymap() was renamed to DataFrame.map() in pandas 2.1 and the old
# name is deprecated there. Use the new spelling when this pandas provides it,
# and fall back to applymap() on older versions.
tsdf.map(f) if hasattr(pd.DataFrame, "map") else tsdf.applymap(f)

Unnamed: 0,A,B,C
2000-01-01,19,21,19
2000-01-02,18,18,18
2000-01-03,19,19,18
2000-01-04,3,3,3
2000-01-05,3,3,3
2000-01-06,3,3,3
2000-01-07,3,3,3
2000-01-08,19,19,18
2000-01-09,18,19,19
2000-01-10,19,19,19


`Series.map()` has an additional feature; it can be used to easily “link” or “map” values defined by a secondary series. This is closely related to merging/joining functionality:

In [56]:
s = pd.Series(
    ["six", "seven", "six", "seven", "six"], index=["a", "b", "c", "d", "e"])

t = pd.Series({"six": 6.0, "seven": 7.0})

s.map(t)

a    6.0
b    7.0
c    6.0
d    7.0
e    6.0
dtype: float64

#### Concatenation

Like its sibling function on ndarrays, `numpy.concatenate`, `pandas.concat` takes a list or dict of homogeneously-typed objects and concatenates them with some configurable handling of “what to do with the other axes”:
```
pd.concat(
    objs,
    axis=0,
    join="outer",
    ignore_index=False,
    keys=None,
    levels=None,
    names=None,
    verify_integrity=False,
    copy=True,)
```    

In [58]:
df1 = pd.DataFrame({"A": ["A0", "A1", "A2", "A3"], "B": ["B0", "B1", "B2", "B3"], 
                    "C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"],}, index=[0, 1, 2, 3],)

df2 = pd.DataFrame({"A": ["A4", "A5", "A6", "A7"],"B": ["B4", "B5", "B6", "B7"], 
                    "C": ["C4", "C5", "C6", "C7"], "D": ["D4", "D5", "D6", "D7"],}, index=[4, 5, 6, 7],)

df3 = pd.DataFrame({"A": ["A8", "A9", "A10", "A11"], "B": ["B8", "B9", "B10", "B11"], 
                    "C": ["C8", "C9", "C10", "C11"], "D": ["D8", "D9", "D10", "D11"],},index=[8, 9, 10, 11],)

frames = [df1, df2, df3]

result = pd.concat(frames)
result

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7
8,A8,B8,C8,D8
9,A9,B9,C9,D9


In [59]:
result = pd.concat(frames, keys=["x", "y", "z"])
result


Unnamed: 0,Unnamed: 1,A,B,C,D
x,0,A0,B0,C0,D0
x,1,A1,B1,C1,D1
x,2,A2,B2,C2,D2
x,3,A3,B3,C3,D3
y,4,A4,B4,C4,D4
y,5,A5,B5,C5,D5
y,6,A6,B6,C6,D6
y,7,A7,B7,C7,D7
z,8,A8,B8,C8,D8
z,9,A9,B9,C9,D9


The resulting object has a hierarchical index. This means that we can now select out each chunk by key:

In [60]:
result.loc["y"]

Unnamed: 0,A,B,C,D
4,A4,B4,C4,D4
5,A5,B5,C5,D5
6,A6,B6,C6,D6
7,A7,B7,C7,D7


#### Set logic on the other axes

When gluing together multiple DataFrames, you have a choice of how to handle the other axes (other than the one being concatenated). This can be done in the following two ways:

 - Take the union of them all, `join='outer'`. This is the default option as it results in zero information loss.
 - Take the intersection, `join='inner'`.

Here is an example of each of these methods. First, the default `join='outer'` behavior:

In [62]:
df4 = pd.DataFrame({"B": ["B2", "B3", "B6", "B7"], "D": ["D2", "D3", "D6", "D7"], 
                    "F": ["F2", "F3", "F6", "F7"],}, index=[2, 3, 6, 7],)

result = pd.concat([df1, df4], axis=1)
result

Unnamed: 0,A,B,C,D,B.1,D.1,F
0,A0,B0,C0,D0,,,
1,A1,B1,C1,D1,,,
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3
6,,,,,B6,D6,F6
7,,,,,B7,D7,F7


In [65]:
result = pd.concat([df1, df4], join = 'inner', axis=1)
result

Unnamed: 0,A,B,C,D,B.1,D.1,F
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3


Lastly, suppose we just wanted to reuse the exact index from the original DataFrame:

In [67]:
result = pd.concat([df1, df4], axis=1).reindex(df1.index)
result

Unnamed: 0,A,B,C,D,B.1,D.1,F
0,A0,B0,C0,D0,,,
1,A1,B1,C1,D1,,,
2,A2,B2,C2,D2,B2,D2,F2
3,A3,B3,C3,D3,B3,D3,F3


#### Ignoring indexes on the concatenation axis

For DataFrame objects which don’t have a meaningful index, you may wish to append them and ignore the fact that they may have overlapping indexes. To do this, use the `ignore_index` argument:

In [68]:
result = pd.concat([df1, df4], ignore_index=True, sort=False)

In [69]:
result


Unnamed: 0,A,B,C,D,F
0,A0,B0,C0,D0,
1,A1,B1,C1,D1,
2,A2,B2,C2,D2,
3,A3,B3,C3,D3,
4,,B2,,D2,F2
5,,B3,,D3,F3
6,,B6,,D6,F6
7,,B7,,D7,F7


#### Concatenating with mixed ndims

You can concatenate a mix of Series and DataFrame objects. The Series will be transformed to DataFrame with the column name as the name of the Series.

In [72]:
s1 = pd.Series(["X0", "X1", "X2", "X3"], name="X")

pd.concat([df1, s1], axis=1)

Unnamed: 0,A,B,C,D,X
0,A0,B0,C0,D0,X0
1,A1,B1,C1,D1,X1
2,A2,B2,C2,D2,X2
3,A3,B3,C3,D3,X3


In [75]:
s2 = pd.Series(["X0", "X1", "X2", "X3"]) #notice the lack of Series 'name'

pd.concat([df1, s2], axis=1) #notice the column name in this case

Unnamed: 0,A,B,C,D,0
0,A0,B0,C0,D0,X0
1,A1,B1,C1,D1,X1
2,A2,B2,C2,D2,X2
3,A3,B3,C3,D3,X3


In [76]:
# Passing `ignore_index=True` drops the labels along the concatenation axis.
# With axis=1 that axis is the columns, so every column name (including the
# Series name) is replaced by 0..n-1. The earlier `ignore_index=True` example
# kept its column names only because it concatenated along axis=0, where the
# row index is what gets reset.

pd.concat([df1, s1], axis=1, ignore_index=True)

Unnamed: 0,0,1,2,3,4
0,A0,B0,C0,D0,X0
1,A1,B1,C1,D1,X1
2,A2,B2,C2,D2,X2
3,A3,B3,C3,D3,X3


A fairly common use of the `keys` argument is to override the column names when creating a new DataFrame based on existing Series. Notice how the default behaviour is to let the resulting DataFrame inherit each parent Series’ name, when it exists.

In [77]:
s3 = pd.Series([0, 1, 2, 3], name="foo")
s4 = pd.Series([0, 1, 2, 3])
s5 = pd.Series([0, 1, 4, 5])

pd.concat([s3, s4, s5], axis=1)

Unnamed: 0,foo,0,1
0,0,0,0
1,1,1,1
2,2,2,4
3,3,3,5


In [78]:
#keys argument version

pd.concat([s3, s4, s5], axis=1, keys=["red", "blue", "yellow"])

Unnamed: 0,red,blue,yellow
0,0,0,0
1,1,1,1
2,2,2,4
3,3,3,5


You can also pass a dict to `concat` in which case the dict keys will be used for the keys argument (unless other keys are specified):

In [79]:
pieces = {"x": df1, "y": df2, "z": df3}
pd.concat(pieces)

Unnamed: 0,Unnamed: 1,A,B,C,D
x,0,A0,B0,C0,D0
x,1,A1,B1,C1,D1
x,2,A2,B2,C2,D2
x,3,A3,B3,C3,D3
y,4,A4,B4,C4,D4
y,5,A5,B5,C5,D5
y,6,A6,B6,C6,D6
y,7,A7,B7,C7,D7
z,8,A8,B8,C8,D8
z,9,A9,B9,C9,D9


In [81]:
result = pd.concat(pieces, keys=["z", "y"])
result

Unnamed: 0,Unnamed: 1,A,B,C,D
z,8,A8,B8,C8,D8
z,9,A9,B9,C9,D9
z,10,A10,B10,C10,D10
z,11,A11,B11,C11,D11
y,4,A4,B4,C4,D4
y,5,A5,B5,C5,D5
y,6,A6,B6,C6,D6
y,7,A7,B7,C7,D7


The MultiIndex created has levels that are constructed from the passed keys and the index of the DataFrame pieces:

In [82]:
result.index.levels

FrozenList([['z', 'y'], [4, 5, 6, 7, 8, 9, 10, 11]])

#### Appending rows to a DataFrame

If you have a series that you want to append as a single row to a DataFrame, you can convert the row into a DataFrame and use `concat`

In [83]:
s2 = pd.Series(["X0", "X1", "X2", "X3"], index=["A", "B", "C", "D"])

pd.concat([df1, s2.to_frame().T], ignore_index=True)

Unnamed: 0,A,B,C,D
0,A0,B0,C0,D0
1,A1,B1,C1,D1
2,A2,B2,C2,D2
3,A3,B3,C3,D3
4,X0,X1,X2,X3


In [85]:
s2.to_frame().T 

Unnamed: 0,A,B,C,D
0,X0,X1,X2,X3


#### Database-style DataFrame or named Series joining/merging

pandas provides a single function, `merge()`, as the entry point for all standard database join operations between `DataFrame` or named `Series` objects:

```
pd.merge(
    left,
    right,
    how="inner",
    on=None,
    left_on=None,
    right_on=None,
    left_index=False,
    right_index=False,
    sort=True,
    suffixes=("_x", "_y"),
    copy=True,
    indicator=False,
    validate=None,)
```    

In [87]:
left = pd.DataFrame({"key": ["K0", "K1", "K2", "K3"], "A": ["A0", "A1", "A2", "A3"], "B": ["B0", "B1", "B2", "B3"],})

right = pd.DataFrame({"key": ["K0", "K1", "K2", "K3"], "C": ["C0", "C1", "C2", "C3"], "D": ["D0", "D1", "D2", "D3"],})

left, right

(  key   A   B
 0  K0  A0  B0
 1  K1  A1  B1
 2  K2  A2  B2
 3  K3  A3  B3,
   key   C   D
 0  K0  C0  D0
 1  K1  C1  D1
 2  K2  C2  D2
 3  K3  C3  D3)

In [89]:
pd.merge(left, right) # with no `on`, the join keys default to the columns common to both frames (here just "key")

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,C2,D2
3,K3,A3,B3,C3,D3


In [88]:
pd.merge(left, right, on="key") #'key' col must be present in both dataframes

Unnamed: 0,key,A,B,C,D
0,K0,A0,B0,C0,D0
1,K1,A1,B1,C1,D1
2,K2,A2,B2,C2,D2
3,K3,A3,B3,C3,D3


In [None]:
# 'A' exists only in `left`, so it cannot be used as a shared join key.
# Catch and show the error so the notebook still runs top to bottom.
try:
    pd.merge(left, right, on="A")
except KeyError as err:
    print(f"KeyError: {err}")

In [21]:
data1 = pd.DataFrame({
    'Name': ['Sam', 'Peter', 'Lisa', 'Ryan', 'Martha', 'Bruce'],
    'Dept': ['Acct', 'Acct', 'Admin', 'IT', 'IT', 'HR'],
    'Area': ['Texas', 'San Diego', 'Washington', 'San Fra.', 'Ney York', 'Los Angles']})

data2 = pd.DataFrame({
    'Name': ['Sam', 'Peter', 'Lisa', 'Ryan', 'Martha', 'Bruce', 'Matt'],
    'Dept': ['Acct', 'Acct', 'Admin', 'IT', 'IT', 'HR', 'Legal'],
    'Area': ['Texas', 'San Diego', 'Washington', 'San Fra.', 'Ney York', 'Los Angles', 'Omaha'],
    'Grade':[1,2,3,4,4,2,1]

})

pd.merge(data1, data2)

Unnamed: 0,Name,Dept,Area,Grade
0,Sam,Acct,Texas,1
1,Peter,Acct,San Diego,2
2,Lisa,Admin,Washington,3
3,Ryan,IT,San Fra.,4
4,Martha,IT,Ney York,4
5,Bruce,HR,Los Angles,2


In [23]:
pd.merge(data1, data2, how = 'outer')

Unnamed: 0,Name,Dept,Area,Grade
0,Sam,Acct,Texas,1
1,Peter,Acct,San Diego,2
2,Lisa,Admin,Washington,3
3,Ryan,IT,San Fra.,4
4,Martha,IT,Ney York,4
5,Bruce,HR,Los Angles,2
6,Matt,Legal,Omaha,1


In [30]:
pd.merge(data1, data2, on = 'Name', how = 'left')

Unnamed: 0,Name,Dept_x,Area_x,Dept_y,Area_y,Grade
0,Sam,Acct,Texas,Acct,Texas,1
1,Peter,Acct,San Diego,Acct,San Diego,2
2,Lisa,Admin,Washington,Admin,Washington,3
3,Ryan,IT,San Fra.,IT,San Fra.,4
4,Martha,IT,Ney York,IT,Ney York,4
5,Bruce,HR,Los Angles,HR,Los Angles,2


In [35]:
pd.merge(data1,data2, left_on = 'Name', right_on = 'Dept', how = 'outer')

Unnamed: 0,Name_x,Dept_x,Area_x,Name_y,Dept_y,Area_y,Grade
0,Sam,Acct,Texas,,,,
1,Peter,Acct,San Diego,,,,
2,Lisa,Admin,Washington,,,,
3,Ryan,IT,San Fra.,,,,
4,Martha,IT,Ney York,,,,
5,Bruce,HR,Los Angles,,,,
6,,,,Sam,Acct,Texas,1.0
7,,,,Peter,Acct,San Diego,2.0
8,,,,Lisa,Admin,Washington,3.0
9,,,,Ryan,IT,San Fra.,4.0


**TODO:** Redo the `pd.merge` section in line with the *Python Data Science Handbook*.

**TODO:** Add the other combining methods, such as `df.join`.