# 01. Importing Libraries

In [1]:
import pandas as pd
import numpy as np
import os

# 02. Create data to experiment on

In [9]:
# January 2020 purchase data: every list holds one value per customer, in the same customer order
data1 = dict(
    customer_id=[6732, 767, 890, 635],
    month=['Jan-20'] * 4,
    purchased_meat=[0, 13, 3, 4],
    purchased_alcohol=[1, 2, 10, 0],
    purchased_snacks=[10, 5, 1, 7],
)

In [13]:
# February 2020 purchase data: same customers and same keys as the January dictionary
data2 = dict(
    customer_id=[6732, 767, 890, 635],
    month=['Feb-20'] * 4,
    purchased_meat=[0, 10, 5, 3],
    purchased_alcohol=[2, 4, 14, 0],
    purchased_snacks=[15, 3, 2, 6],
)

In [21]:
# Turn each month's dictionary into its own dataframe.
# The row labels 0-3 are passed explicitly and are identical for both frames.
df = pd.DataFrame(data=data1, index=[0, 1, 2, 3])
df_1 = pd.DataFrame(data=data2, index=[0, 1, 2, 3])

In [22]:
# Display the January 2020 dataframe
df

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Jan-20,0,1,10
1,767,Jan-20,13,2,5
2,890,Jan-20,3,10,1
3,635,Jan-20,4,0,7


In [23]:
# Display the February 2020 dataframe
df_1

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Feb-20,0,2,15
1,767,Feb-20,10,4,3
2,890,Feb-20,5,14,2
3,635,Feb-20,3,0,6


# 03. Concatenating Data
- Is suitable for rows or columns of the same length.
- Will place dataframes on top of each other by default (`axis = 0`, long format). Use `axis = 1` to place them side by side instead (wide format).
- Requires a list of dataframes as its main argument (this is why the `frames` list is created first in the example below).

In [30]:
# Collect the dataframes in a list, which is what pd.concat() takes as its first argument
frames = [df, df_1]

In [26]:
# Stack the dataframes on top of each other (axis=0 is the default, spelled out here);
# each frame keeps its own row labels, so 0-3 appear twice in the result
df_concat = pd.concat(frames, axis=0)

In [27]:
# Display the long-format (row-wise) concatenation
df_concat

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Jan-20,0,1,10
1,767,Jan-20,13,2,5
2,890,Jan-20,3,10,1
3,635,Jan-20,4,0,7
0,6732,Feb-20,0,2,15
1,767,Feb-20,10,4,3
2,890,Feb-20,5,14,2
3,635,Feb-20,3,0,6


In [28]:
# Place the dataframes side by side instead, aligned on their row labels (wide format)
df_concat_wide = pd.concat(frames, axis='columns')

In [29]:
# Display the wide-format (column-wise) concatenation
df_concat_wide

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,customer_id.1,month.1,purchased_meat.1,purchased_alcohol.1,purchased_snacks.1
0,6732,Jan-20,0,1,10,6732,Feb-20,0,2,15
1,767,Jan-20,13,2,5,767,Feb-20,10,4,3
2,890,Jan-20,3,10,1,890,Feb-20,5,14,2
3,635,Jan-20,4,0,7,635,Feb-20,3,0,6


# 04. Appending Data
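- Is used to create long-format dataframes. It is meant for dataframes with the same columns; when the columns differ, the result contains every column and the missing values are filled with `NaN` (see the `df_2` test below).
- Is a dataframe method called as `df.append(df_new_rows)` (different from the syntax for `pd.concat()`). Note that `DataFrame.append()` was deprecated in pandas 1.4 and removed in pandas 2.0; `pd.concat([df, df_new_rows])` produces the same result.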

In [31]:
# DataFrame.append() was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat() stacks the rows the same way and keeps each frame's original row labels.
df_appended = pd.concat([df, df_1])
df_appended


Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Jan-20,0,1,10
1,767,Jan-20,13,2,5
2,890,Jan-20,3,10,1
3,635,Jan-20,4,0,7
0,6732,Feb-20,0,2,15
1,767,Feb-20,10,4,3
2,890,Feb-20,5,14,2
3,635,Feb-20,3,0,6


In [32]:
# Create data with a different number of columns than df
data3 = dict(
    customer_id=[6732, 767, 890, 635],
    month=['Jan-20'] * 4,
    days_purchased_on=[0, 13, 3, 4],
)

In [37]:
# Build the dataframe from data3 with explicit row labels 0-3, then display it
df_2 = pd.DataFrame(data=data3, index=[0, 1, 2, 3])
df_2

Unnamed: 0,customer_id,month,days_purchased_on
0,6732,Jan-20,0
1,767,Jan-20,13
2,890,Jan-20,3
3,635,Jan-20,4


In [40]:
# Next, stack df_2 under df and check the result.
# pd.concat() replaces DataFrame.append(), which was deprecated in pandas 1.4 and removed
# in pandas 2.0. Like append, it keeps the union of the columns and fills the cells a
# frame has no value for with NaN.
df_appended_test = pd.concat([df, df_2])
df_appended_test


Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on
0,6732,Jan-20,0.0,1.0,10.0,
1,767,Jan-20,13.0,2.0,5.0,
2,890,Jan-20,3.0,10.0,1.0,
3,635,Jan-20,4.0,0.0,7.0,
0,6732,Jan-20,,,,0.0
1,767,Jan-20,,,,13.0
2,890,Jan-20,,,,3.0
3,635,Jan-20,,,,4.0


# 05. Merging Data

In [49]:
# Merge df with df_2 using customer_id and month as keys (an inner join, the default);
# indicator=True adds a _merge column flagging which frame(s) each row was found in
df_merged = df.merge(right=df_2, how='inner', on=['customer_id', 'month'], indicator=True)
df_merged

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on,_merge
0,6732,Jan-20,0,1,10,0,both
1,767,Jan-20,13,2,5,13,both
2,890,Jan-20,3,10,1,3,both
3,635,Jan-20,4,0,7,4,both


In [43]:
# Display df, the left-hand input of the merge
df

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks
0,6732,Jan-20,0,1,10
1,767,Jan-20,13,2,5
2,890,Jan-20,3,10,1
3,635,Jan-20,4,0,7


In [44]:
# Display df_2, the right-hand input of the merge
df_2

Unnamed: 0,customer_id,month,days_purchased_on
0,6732,Jan-20,0
1,767,Jan-20,13
2,890,Jan-20,3
3,635,Jan-20,4


In [51]:
# Count how many merged rows are flagged as both, left_only or right_only
merge_flag = df_merged['_merge']
merge_flag.value_counts()

both          4
left_only     0
right_only    0
Name: _merge, dtype: int64

In [52]:
# Preview a merge without assigning it: the result is only displayed, so neither a new
# dataframe is created nor an existing one overwritten
pd.merge(left=df, right=df_2, on=['customer_id', 'month'], indicator=True)

Unnamed: 0,customer_id,month,purchased_meat,purchased_alcohol,purchased_snacks,days_purchased_on,_merge
0,6732,Jan-20,0,1,10,0,both
1,767,Jan-20,13,2,5,13,both
2,890,Jan-20,3,10,1,3,both
3,635,Jan-20,4,0,7,4,both
