In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

from numpy.random import randn
from numpy.random import randint

# Summary

- Merge
- Merge on Index
- Concatenate
- Combining DF
- Reshaping
- Pivoting
- Duplicates in DFs
- Mapping
- Replace
- Rename Index
- Binning
- Outliers
- Permutation

# Development

### Merge - which is the JOIN - left, right, inner, outer

In [3]:
# Merge rows as keys
dframe1 = DataFrame({'key':['X','X','Y','Y','Z','Z'],'data_set_1':np.arange(6)})
print(dframe1)

dframe2 = DataFrame({'key':['Q','Y','Z'],'data_set_2':[100,200,300]})
print(dframe2) 

  key  data_set_1
0   X           0
1   X           1
2   Y           2
3   Y           3
4   Z           4
5   Z           5
  key  data_set_2
0   Q         100
1   Y         200
2   Z         300


In [4]:
# Merge is a MANY to ONE method 
print(pd.merge(dframe1,dframe2))
# Merge on a specific column use 'ON='
print(pd.merge(dframe1,dframe2,on='key'))
print(pd.merge(dframe1,dframe2,on='key',how='outer'))

  key  data_set_1  data_set_2
0   Y           2         200
1   Y           3         200
2   Z           4         300
3   Z           5         300
  key  data_set_1  data_set_2
0   Y           2         200
1   Y           3         200
2   Z           4         300
3   Z           5         300
  key  data_set_1  data_set_2
0   X         0.0         NaN
1   X         1.0         NaN
2   Y         2.0       200.0
3   Y         3.0       200.0
4   Z         4.0       300.0
5   Z         5.0       300.0
6   Q         NaN       100.0


In [5]:
# MERGE WITH MULTIPLE KEYS
# We can also merge with multiple keys!

# Dframe on left
df_left = DataFrame({'key1': ['Monte', 'Trevi', 'Signo'],
                  'key2': ['Centro', 'Centro', 'Centro'],
                  'left_data': [10,20,30]})

print(df_left)
#Dframe on right
df_right = DataFrame({'key1': ['Monte', 'Monte', 'Trevi', 'Trevi'],
                   'key2': ['Centro', 'Peri', 'Centro', 'Peri'],
                   'right_data': [10,50,20,70]})
print(df_right)

#Merge
print('Merge on Key one and Key two OUTER')
print(pd.merge(df_left, df_right, on=['key1', 'key2'], how='outer'))

# print('---')
# print('Merge only on one Key with suffixes')
# print(pd.merge(df_left, df_right, on=['key1'], suffixes=('_lefty','_righty')))

print('---')
print('Merge on two keys INNER')
print(pd.merge(df_left, df_right, on=['key1', 'key2'], how='inner'))
print('---')
print('Merge on key1 OUTER')
print(pd.merge(df_left, df_right, on=['key1'], how='outer'))
print('---')
print('Merge on key2 OUTER')
print(pd.merge(df_left, df_right, on=['key2'], how='outer'))
print('---')

print('Merge on key1 INNER')
print(pd.merge(df_left, df_right, on=['key1'], how='inner'))
print('---')
print('Merge on key2 INNER')

print(pd.merge(df_left, df_right, on=['key2'], how='inner'))

## Inner-Outer make sense if there are NON combo matches


    key1    key2  left_data
0  Monte  Centro         10
1  Trevi  Centro         20
2  Signo  Centro         30
    key1    key2  right_data
0  Monte  Centro          10
1  Monte    Peri          50
2  Trevi  Centro          20
3  Trevi    Peri          70
Merge on Key one and Key two OUTER
    key1    key2  left_data  right_data
0  Monte  Centro       10.0        10.0
1  Trevi  Centro       20.0        20.0
2  Signo  Centro       30.0         NaN
3  Monte    Peri        NaN        50.0
4  Trevi    Peri        NaN        70.0
---
Merge on two keys INNER
    key1    key2  left_data  right_data
0  Monte  Centro         10          10
1  Trevi  Centro         20          20
---
Merge on key1 OUTER
    key1  key2_x  left_data  key2_y  right_data
0  Monte  Centro         10  Centro        10.0
1  Monte  Centro         10    Peri        50.0
2  Trevi  Centro         20  Centro        20.0
3  Trevi  Centro         20    Peri        70.0
4  Signo  Centro         30     NaN         NaN
---
Merg

### Merge on Index

In [25]:
# Dframe on left
df_left = DataFrame({'key1': ['Monte', 'Trevi', 'Signo'],
                  'key2': ['Centro', 'Centro', 'Centro'],
                  'left_data': [10,20,30]})

print(df_left)
#Dframe on right
df_right = DataFrame({'key1': ['Monte', 'Monte', 'Trevi', 'Trevi'],
                   'key2': ['Centro', 'Peri', 'Centro', 'Peri'],
                   'right_data': [10,50,20,70]})

df_right2 = df_right.copy()
print(df_right)
df_right2 = df_right2.set_index('key1')
print(df_right2)

print(pd.merge(df_left, df_right, on='key1', how='inner'))

print(pd.merge(df_left, df_right2, on='key1', right_index=True, how='outer'))

    key1    key2  left_data
0  Monte  Centro         10
1  Trevi  Centro         20
2  Signo  Centro         30
    key1    key2  right_data
0  Monte  Centro          10
1  Monte    Peri          50
2  Trevi  Centro          20
3  Trevi    Peri          70
         key2  right_data
key1                     
Monte  Centro          10
Monte    Peri          50
Trevi  Centro          20
Trevi    Peri          70
    key1  key2_x  left_data  key2_y  right_data
0  Monte  Centro         10  Centro          10
1  Monte  Centro         10    Peri          50
2  Trevi  Centro         20  Centro          20
3  Trevi  Centro         20    Peri          70
    key1  key2_x  left_data  key2_y  right_data
0  Monte  Centro         10  Centro        10.0
0  Monte  Centro         10    Peri        50.0
1  Trevi  Centro         20  Centro        20.0
1  Trevi  Centro         20    Peri        70.0
2  Signo  Centro         30     NaN         NaN


In [31]:
# More complex example

#Now let's try something a little more complicated, remember hierarchal index?
df_left_hr = DataFrame({'key1': ['SF','SF','SF','LA','LA'],
                   'key2': [10, 20, 30, 40, 50],
                   'data_set': [1000, 2000, 3000, 4000, 5000]})
df_right_hr = DataFrame(np.arange(10).reshape((5, 2)),
                   index=[['LA','LA','SF','SF','SF'],
                          [40, 10, 10, 10, 20]],
                   columns=['col_1', 'col_2'])

print(df_left_hr)
print(df_right_hr)

print(pd.merge(df_left_hr, df_right_hr, left_on=['key1', 'key2'], right_index=True))

  key1  key2  data_set
0   SF    10      1000
1   SF    20      2000
2   SF    30      3000
3   LA    40      4000
4   LA    50      5000
       col_1  col_2
LA 40      0      1
   10      2      3
SF 10      4      5
   10      6      7
   20      8      9
  key1  key2  data_set  col_1  col_2
0   SF    10      1000      4      5
0   SF    10      1000      6      7
1   SF    20      2000      8      9
3   LA    40      4000      0      1


### Concatenate

In [43]:
# Concatenate matrix on different axis
arr1 = np.arange(9).reshape(3,3)

# This is how to concatenate two matri`xes
print(np.concatenate([arr1,arr1],axis=1))

print(np.concatenate([arr1,arr1],axis=0))

[[0 1 2 0 1 2]
 [3 4 5 3 4 5]
 [6 7 8 6 7 8]]
[[0 1 2]
 [3 4 5]
 [6 7 8]
 [0 1 2]
 [3 4 5]
 [6 7 8]]


In [5]:
# Concatenate two series
ser1 = Series([0,1,2], index =['A','B','C'])
ser2 = Series([10,20], index =['A','D'])

# Concatenate with markers
print(pd.concat([ser1,ser2], keys=['ser1', 'ser2'], axis=0))
# Concatenate the two series to obtain a dataframe
print(pd.concat([ser1,ser2],  axis=1))
print(pd.concat([ser1,ser2], keys=['ser1', 'ser2'],  axis=1))

ser1  A     0
      B     1
      C     2
ser2  A    10
      D    20
dtype: int64
     0     1
A  0.0  10.0
B  1.0   NaN
C  2.0   NaN
D  NaN  20.0
   ser1  ser2
A   0.0  10.0
B   1.0   NaN
C   2.0   NaN
D   NaN  20.0


In [6]:
# Concatenate two dataframes
dframe1 = DataFrame(np.random.randn(4,3), columns=['X','Y','Z'])
print(dframe1)
dframe2 = DataFrame(np.random.randn(3,3), columns=['X','Y','Q'])
print(dframe2)
# ignore index is to get a new progressive index
print(pd.concat([dframe1,dframe2], ignore_index=True))

print(pd.concat([dframe1,dframe2], ignore_index=False))

          X         Y         Z
0 -0.987562  0.247725 -1.635256
1 -1.365617  0.989942  1.260238
2 -0.185671 -0.125520  0.613887
3  0.891886  0.083698 -1.877206
          X         Y         Q
0 -0.613227  0.486446 -0.098066
1  1.378422  0.417419  0.552609
2  0.959936 -0.316783  0.121617
          X         Y         Z         Q
0 -0.987562  0.247725 -1.635256       NaN
1 -1.365617  0.989942  1.260238       NaN
2 -0.185671 -0.125520  0.613887       NaN
3  0.891886  0.083698 -1.877206       NaN
4 -0.613227  0.486446       NaN -0.098066
5  1.378422  0.417419       NaN  0.552609
6  0.959936 -0.316783       NaN  0.121617
          X         Y         Z         Q
0 -0.987562  0.247725 -1.635256       NaN
1 -1.365617  0.989942  1.260238       NaN
2 -0.185671 -0.125520  0.613887       NaN
3  0.891886  0.083698 -1.877206       NaN
0 -0.613227  0.486446       NaN -0.098066
1  1.378422  0.417419       NaN  0.552609
2  0.959936 -0.316783       NaN  0.121617


### Combining DF

combine works on INDEXES

In [55]:
# Combining Series
ser1 = Series([1,2,3,np.nan,np.nan,6],
             index=['A','B','C','D','E','F'])

ser2 = Series([100,200,300,400,500,600],
             index=['A','B','C','D','E','F'])

print(Series(np.where(pd.isnull(ser1),ser2,ser1), index=ser1.index)
     )
print(ser1.combine_first(ser2)
     )

A      1.0
B      2.0
C      3.0
D    400.0
E    500.0
F      6.0
dtype: float64
A      1.0
B      2.0
C      3.0
D    400.0
E    500.0
F      6.0
dtype: float64


In [7]:
# Combining Data Frames

#! It fills the gaps in teh first Dataframe with the values in the second

df_left_hr = DataFrame({'key1': ['SF','SF','SF','LA','LA'],
                   'col_1': [10, 20, 30, np.nan, 50],
                   'col_2': [1000, 2000, 3000, 4000, 5000]})


df_right_hr = DataFrame({'key1': ['SF','LA','LA','LA','LA'],
                   'col_1': [10, np.nan, np.nan, np.nan, 50],
                   'col_2': [100, 200, 300, 400, 500]})



print(df_left_hr)
print(df_right_hr)
print(df_left_hr.combine_first(df_right_hr))



  key1  col_1  col_2
0   SF   10.0   1000
1   SF   20.0   2000
2   SF   30.0   3000
3   LA    NaN   4000
4   LA   50.0   5000
  key1  col_1  col_2
0   SF   10.0    100
1   LA    NaN    200
2   LA    NaN    300
3   LA    NaN    400
4   LA   50.0    500
  key1  col_1  col_2
0   SF   10.0   1000
1   SF   20.0   2000
2   SF   30.0   3000
3   LA    NaN   4000
4   LA   50.0   5000


### Reshaping

Basically turning from long to wide format.   
From Pandas I get a Series, if I unstuck I get back a Pandas.    
By unstacking for diferent columns I can retunr to wide format with a different column.   

In [8]:
#Let's see how stack and unstack work

# Create DataFrame
dframe1 = DataFrame(np.arange(8).reshape((2, 4)),
                 index=pd.Index(['LA', 'SF'], name='city'),
                 columns=pd.Index(['A', 'B', 'C','D'], name='letter'))
#Show
print(dframe1)

letter  A  B  C  D
city              
LA      0  1  2  3
SF      4  5  6  7


In [15]:
dframe_st = dframe1.stack()
print(dframe_st)
type(dframe_st)
print(dframe_st.unstack())
print(dframe_st.unstack('city'))

city  letter
LA    A         0
      B         1
      C         2
      D         3
SF    A         4
      B         5
      C         6
      D         7
dtype: int32
letter  A  B  C  D
city              
LA      0  1  2  3
SF      4  5  6  7
city    LA  SF
letter        
A        0   4
B        1   5
C        2   6
D        3   7


In [None]:
dframe_st = dframe1.stack()
dframe_st
type(dframe_st)

### Pivoting



In [2]:
# Lets create some data to play with:

# Note: It is not necessary to understand how this dataset was made to understand this Lecture.

#import pandas testing utility
import pandas.util.testing as tm; tm.N = 3

#Create a unpivoted function
def unpivot(frame):
    N, K = frame.shape
    
    data = {'value' : frame.values.ravel('F'),
            'variable' : np.asarray(frame.columns).repeat(N),
            'date' : np.tile(np.asarray(frame.index), K)}
    
    # Return the DataFrame
    return DataFrame(data, columns=['date', 'variable', 'value'])

#Set the DataFrame we'll be using
dframe = unpivot(tm.makeTimeDataFrame())

  


In [3]:
dframe

Unnamed: 0,date,variable,value
0,2000-01-03,A,-0.521677
1,2000-01-04,A,0.759259
2,2000-01-05,A,-0.085330
3,2000-01-06,A,0.721594
4,2000-01-07,A,-1.397962
...,...,...,...
115,2000-02-07,D,0.714316
116,2000-02-08,D,-0.219693
117,2000-02-09,D,1.649881
118,2000-02-10,D,-0.306279


In [4]:
# Now let's pivot the data

# First two value spassed are teh row and column indexes, then finally an optional fill value
dframe_piv = dframe.pivot('date','variable','value')

#Show
print(dframe_piv)

dframe_piv.to_csv('./in_out/out_test_piv1.csv')

variable           A         B         C         D
date                                              
2000-01-03 -0.521677 -0.711643  0.116085 -1.924481
2000-01-04  0.759259 -0.870730 -2.473561 -1.345243
2000-01-05 -0.085330 -1.308282  0.642622  0.606811
2000-01-06  0.721594  0.818925 -1.831232 -0.143633
2000-01-07 -1.397962  0.917652 -0.567233  0.279183
2000-01-10 -0.217547 -0.023237  0.582158 -0.910210
2000-01-11 -1.261291 -1.620958 -0.545936 -0.769461
2000-01-12 -0.120402  2.273398  0.198328  0.945378
2000-01-13 -1.197004 -0.436398 -0.075036  0.877579
2000-01-14 -0.334776 -0.133706  0.859413 -1.165871
2000-01-17 -0.671377  0.234078 -1.273001  0.686825
2000-01-18 -1.401051 -1.879911 -1.445234 -1.002718
2000-01-19  0.615876  0.237788 -0.395292  0.294778
2000-01-20  0.862061 -0.207441  1.464805 -0.506530
2000-01-21 -1.335949 -1.879542  1.721499  0.205185
2000-01-24 -0.362738 -1.376768 -0.404514  0.629297
2000-01-25 -0.468951 -0.601086  2.169211 -0.058185
2000-01-26 -0.266152 -1.597993 

In [11]:
# Now let's pivot the data
dframe2 = dframe.copy()
dframe2['emptycol'] = "pivot_field"
print(dframe2)

# First two value spassed are teh row and column indexes, then finally an optional fill value
dframe_piv_2 = dframe2.pivot(index=['date','variable'],
                            columns='emptycol',
                            values='value')

#Show
print(dframe_piv_2)

dframe_piv_2.to_csv('./in_out/out_test_piv2.csv')

          date variable     value     emptycol
0   2000-01-03        A -0.521677  pivot_field
1   2000-01-04        A  0.759259  pivot_field
2   2000-01-05        A -0.085330  pivot_field
3   2000-01-06        A  0.721594  pivot_field
4   2000-01-07        A -1.397962  pivot_field
..         ...      ...       ...          ...
115 2000-02-07        D  0.714316  pivot_field
116 2000-02-08        D -0.219693  pivot_field
117 2000-02-09        D  1.649881  pivot_field
118 2000-02-10        D -0.306279  pivot_field
119 2000-02-11        D  1.421848  pivot_field

[120 rows x 4 columns]
emptycol             pivot_field
date       variable             
2000-01-03 A           -0.521677
           B           -0.711643
           C            0.116085
           D           -1.924481
2000-01-04 A            0.759259
...                          ...
2000-02-10 D           -0.306279
2000-02-11 A           -0.111715
           B            0.002870
           C            0.054503
           D    

### Duplicates in DFs

In [12]:
#Lets get a dataframe with duplicates

dframe = DataFrame({'key1': ['A'] * 2 + ['B'] * 3,
                  'key2': [2, 2, 2, 3, 3]})

print(dframe)

  key1  key2
0    A     2
1    A     2
2    B     2
3    B     3
4    B     3


In [16]:
#We can use duplicated to find duplicates
print(dframe.duplicated())

# We can also drop duplicates like this:
print(dframe.drop_duplicates())

#You can filter which duplicates to drop by a single column
print(dframe.drop_duplicates(['key1']))

#By default the first value was taken for the duplicates, we can also take the last value instead
print(dframe.drop_duplicates(['key1'],keep='last'))


0    False
1     True
2    False
3    False
4     True
dtype: bool
  key1  key2
0    A     2
2    B     2
3    B     3
  key1  key2
0    A     2
2    B     2
  key1  key2
1    A     2
4    B     3


### Mapping

Add columns with values in a dictionary.   
Kind of a VLOOKUP   

In [17]:
# Let's create a dframe to work with (Highest elevation cities in USA)
dframe = DataFrame({'city':['Alma','Brian Head','Fox Park'],
                    'altitude':[3158,3000,2762]})

#Now let's say we wanted to add a column for the States, we can do that with a mapping.
state_map={'Alma':'Colorado','Brian Head':'Utah','Fox Park':'Wyoming'}

#Show
print(dframe)
print(state_map)

         city  altitude
0        Alma      3158
1  Brian Head      3000
2    Fox Park      2762
{'Alma': 'Colorado', 'Brian Head': 'Utah', 'Fox Park': 'Wyoming'}


In [19]:
# Now we can map that data to our current dframe
dframe['state'] = dframe['city'].map(state_map)

print(dframe)

         city  altitude     state
0        Alma      3158  Colorado
1  Brian Head      3000      Utah
2    Fox Park      2762   Wyoming


### Replace

In [21]:
# Lets make  Series
ser1 = Series([1,2,3,4,1,2,3,4])
#Show
print(ser1)


# Using replace we can select --> .replace(value to be replaced, new_value)
print(ser1.replace(1,np.nan))

#Can also input lists
print(ser1.replace([1,4],[100,400]))

#Can also input dictionary
print(ser1.replace({4:np.nan}))


0    1
1    2
2    3
3    4
4    1
5    2
6    3
7    4
dtype: int64
0    NaN
1    2.0
2    3.0
3    4.0
4    NaN
5    2.0
6    3.0
7    4.0
dtype: float64
0    100
1      2
2      3
3    400
4    100
5      2
6      3
7    400
dtype: int64
0    1.0
1    2.0
2    3.0
3    NaN
4    1.0
5    2.0
6    3.0
7    NaN
dtype: float64


### Lec 38 - Rename Index using MAPPING

In [22]:
# Making a DataFrame
dframe= DataFrame(np.arange(12).reshape((3, 4)),
                 index=['NY', 'LA', 'SF'],
                 columns=['A', 'B', 'C', 'D'])

#Show
print(dframe)

    A  B   C   D
NY  0  1   2   3
LA  4  5   6   7
SF  8  9  10  11


In [23]:
#Let's use map to lowercase the city initials
print(dframe.index.map(str.lower))

# If you want to assign this to the actual index, you can use index
# dframe.index = dframe.index.map(str.lower)


# Use rename if you want to create a transformed version
# WHITOUT modifying the original!

#str.title will capitalize the first letter, lowercasing the columns
dframe.rename(index=str.title, columns=str.lower)


# We can also use rename with dictionaries providing new values for indexes or columns!
# inplace=True actually edit the data
dframe.rename(index={'ny': 'NEW YORK'},
            columns={'A': 'ALPHA'}, inplace=True)

print(dframe)

Index(['ny', 'la', 'sf'], dtype='object')
    ALPHA  B   C   D
NY      0  1   2   3
LA      4  5   6   7
SF      8  9  10  11


### Binning

In [4]:
years = [1990,1991,1992,2008,2012,2015,1987,1969,2013,2008,1999]
# We can seperate these years by decade
decade_bins = [1960,1970,1980,1990,2000,2010,2020]

#Now we'll use cut to get somethign called a Category object
decade_cat = pd.cut(years,decade_bins)

print(type(decade_cat))
print(decade_cat)
# See the categories
print(type(decade_cat.categories))
print(decade_cat.categories)

<class 'pandas.core.arrays.categorical.Categorical'>
[(1980, 1990], (1990, 2000], (1990, 2000], (2000, 2010], (2010, 2020], ..., (1980, 1990], (1960, 1970], (2010, 2020], (2000, 2010], (1990, 2000]]
Length: 11
Categories (6, interval[int64]): [(1960, 1970] < (1970, 1980] < (1980, 1990] < (1990, 2000] < (2000, 2010] < (2010, 2020]]
<class 'pandas.core.indexes.interval.IntervalIndex'>
IntervalIndex([(1960, 1970], (1970, 1980], (1980, 1990], (1990, 2000], (2000, 2010], (2010, 2020]],
              closed='right',
              dtype='interval[int64]')


### Outliers

In [6]:
# Let's see how we would find outliers in a dataset

# First we'll seed the numpy generator
np.random.seed(12345)

#Next we'll create the dataframe
dframe = DataFrame(np.random.randn(1000,4))

dframe.head(5)

Unnamed: 0,0,1,2,3
0,-0.204708,0.478943,-0.519439,-0.55573
1,1.965781,1.393406,0.092908,0.281746
2,0.769023,1.246435,1.007189,-1.296221
3,0.274992,0.228913,1.352917,0.886429
4,-2.001637,-0.371843,1.669025,-0.43857


In [7]:
# Get statistics on the DF 
# As you can see there are values lower/greater than 3
dframe.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067684,0.067924,0.025598,-0.002298
std,0.998035,0.992106,1.006835,0.996794
min,-3.428254,-3.548824,-3.184377,-3.745356
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.366626,2.653656,3.260383,3.927528


In [11]:
col = dframe[0]
# Lookng at values > 3 in DF column 1 
col[np.abs(col)>3]

# Check at the entire DF - return any row with value greater than 3
dframe[(np.abs(dframe)>3).any(1)]

Unnamed: 0,0,1,2,3
5,-0.539741,0.476985,3.248944,-1.021228
97,-0.774363,0.552936,0.106061,3.927528
102,-0.655054,-0.56523,3.176873,0.959533
305,-2.315555,0.457246,-0.025907,-3.399312
324,0.050188,1.951312,3.260383,0.963301
400,0.146326,0.508391,-0.196713,-3.745356
499,-0.293333,-0.242459,-3.05699,1.918403
523,-3.428254,-0.296336,-0.439938,-0.867165
586,0.275144,1.179227,-3.184377,1.369891
808,-0.362528,-3.548824,1.553205,-2.186301


In [15]:
# Capping the DF
# any value whihc is greater/smaller than 3 or -3 is replaced 
# with the value 3 times the sign +/-.
dframe[np.abs(dframe)>3] = np.sign(dframe)*3

In [14]:
dframe.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.067623,0.068473,0.025153,-0.002081
std,0.995485,0.990253,1.003977,0.989736
min,-3.0,-3.0,-3.0,-3.0
25%,-0.77489,-0.591841,-0.641675,-0.644144
50%,-0.116401,0.101143,0.002073,-0.013611
75%,0.616366,0.780282,0.680391,0.654328
max,3.0,2.653656,3.0,3.0


### Permutation ?

In [16]:
# Let imagine a box with 3 marbles in it: labeled 1, 2, and 3
box = np.array([1,2,3])
print(box)

# Now lets create a random permuation WITH replacement using randint
shaker = np.random.randint(0, len(box), size=10)
print(shaker)

[1 2 3]
[0 2 2 2 0 2 0 2 0 2]


# SUMMARY - RECAP

- Merge
- Merge on Index
- Concatenate
- Combining DF
- Reshaping
- Pivoting
- Duplicates in DFs
- Mapping
- Replace
- Rename Index
- Binning
- Outliers
- Permutation