In [117]:
import pandas as pd

import numpy as np

import itertools as it

In [118]:
# creating hierachial index for a series

letters = ["a", "b", "c"]
numbers = [1, 2, 3]

combo = list(it.product(letters, numbers))

#combo_dict = dict(enumerate(list(combo)))

#combo.MultiIndex(levels = ("letter", "number"))
letters_long = [a for a, b in combo]
numbers_long = [b for a, b in combo]

#[letters_long, numbers_long]
dic = {"letter": letters_long, "number": numbers_long}
s = pd.Series(np.random.randint(0, 11, len(letters_long)), index=[letters_long, numbers_long], name="rando")
s.index.names = "letters", "numbers"

df = s.unstack(level=0)
df.columns = pd.MultiIndex.from_arrays([["Ohio", "Ohio", "Colorado"],
                          ["Green", "Red", "Green"]],
                          names=["state", "color"])



df.groupby(level = "color", axis=1).mean()

df = df.set_index(("Ohio", "Red")) # we can turn one of the columns into an index!

df.reset_index() # we can reverse this... and turn the index back into columns of the dataframe


  df.groupby(level = "color", axis=1).mean()


state,Ohio,Ohio,Colorado
color,Red,Green,Green
0,1,6,4
1,8,1,7
2,3,3,5


In [119]:
# combining dataframes

df1 = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "a", "b"],
    "data": pd.Series(range(7), dtype="Int64")})

df2 = pd.DataFrame({"key": ["a", "b", "d"],
    "data": pd.Series(range(3), dtype="Int64")})

pd.merge(df1, df2, how = "left", on = "key", indicator=True)



Unnamed: 0,key,data_x,data_y,_merge
0,b,0,1.0,both
1,b,1,1.0,both
2,a,2,0.0,both
3,c,3,,left_only
4,a,4,0.0,both
5,a,5,0.0,both
6,b,6,1.0,both


In [120]:
df1 = pd.DataFrame({"key": ["b", "b", "a", "c", "a", "a", "b"],
    "data": pd.Series(range(7), dtype="Int64")})

df2 = pd.DataFrame({"key": ["a", "b", "d"],
    "data": pd.Series(range(3), dtype="Int64")})

df2 = df2.set_index("key")

pd.merge(df1, df2, how = "left", left_on= "key", right_index=True, suffixes= ("_1", "_2"))

Unnamed: 0,key,data_1,data_2
0,b,0,1.0
1,b,1,1.0
2,a,2,0.0
3,c,3,
4,a,4,0.0
5,a,5,0.0
6,b,6,1.0


In [121]:
# concatenation of Series'
s1 = pd.Series(["a", "b", "c"], index = np.arange(1, 4), name = "series1")
s2 = pd.Series(["d", "e", "f"], index = range(3), name = "series2")

print(pd.concat([s1, s2], axis=1, join="inner"))
print("\n")
print(pd.concat([s1, s2], axis = 0))

nu_s = pd.concat([s1, s2, s2], keys = ["alpha", "beta", "gamma"])
print(nu_s.index)

nu_s.unstack(level=0)


  series1 series2
1       a       e
2       b       f


1    a
2    b
3    c
0    d
1    e
2    f
dtype: object
MultiIndex([('alpha', 1),
            ('alpha', 2),
            ('alpha', 3),
            ( 'beta', 0),
            ( 'beta', 1),
            ( 'beta', 2),
            ('gamma', 0),
            ('gamma', 1),
            ('gamma', 2)],
           )


Unnamed: 0,alpha,beta,gamma
0,,d,d
1,a,e,e
2,b,f,f
3,c,,


In [122]:
# concatenation of dataframes
df3 = df2.reset_index()
pd.concat([df3, df3], ignore_index=True, names=["alpha", "beta"]) # when indexes have no meaning, can ignore to reset



Unnamed: 0,key,data
0,a,0
1,b,1
2,d,2
3,a,0
4,b,1
5,d,2


In [None]:
# using combine_first() to blend two dataframes into one

df1_temp = df1.set_index(keys = "key")
df2_temp = df2

df1_temp["data"] = np.where(df1_temp.index == "a", np.nan, df1_temp["data"])

df1_temp.combine_first(df2_temp) # the combine_first essentially acts like a patching for values with the same column/row index lookups

Unnamed: 0_level_0,data
key,Unnamed: 1_level_1
a,0.0
a,0.0
a,0.0
b,0.0
b,1.0
b,6.0
c,3.0
d,2.0


In [197]:
# pivot() is equivalent to dplyr::pivot_wider()

# melt() is equivalent to dplyr::pivot_longer()

# stack() is also similar to dplyr::pivot_longer(), although this can be used for transforming a DataFrame() to a Series()
# unstack() is similar to dplyr::pivot_wider() but can convert Series() to DataFrame()

df1.pivot(columns="key", values="data")
dd = df2.assign(value = np.random.random_integers(0, 10, 3))
dd = dd.pivot(columns="data", values="value").reset_index()

rename_map = dict(enumerate(["alpha", "beta", "gamma"]))
dd = dd.rename(columns=rename_map)

dd = dd.melt(id_vars="key", value_vars=["alpha", "beta", "gamma"], value_name="magic_value")
dd = dd[dd["magic_value"].notna()]

dd


  dd = df2.assign(value = np.random.random_integers(0, 10, 3))


Unnamed: 0,key,data,magic_value
0,a,alpha,9.0
4,b,beta,10.0
8,d,gamma,8.0
