In [3]:
import pandas as pd
import numpy as np

* how to replace data in your DataFrame using `.replace()`
* how to change the data type with `.astype()`
* how to manage duplicated data with `.duplicated()` and `.drop_duplicates()`

In [7]:
df = pd.DataFrame(data={'Col1':np.random.randint(low=0,high=50,size=10),
                        'Col2':np.random.randn(10),
                        'Col3':['A','B','C','A','A','C','D','B','A','A'],
                        'Col4':np.random.choice([True, False],size=10),
                        'Col5':np.random.randint(low=1,high=4,size=10),
                        'Col6':['X','X','Y','Z','Z','Z','Y','X','Z','Z']},
                  index=pd.date_range(start='01/01/2021', periods=10, freq='M'))
df

  index=pd.date_range(start='01/01/2021', periods=10, freq='M'))


Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6
2021-01-31,22,-0.839653,A,True,1,X
2021-02-28,10,-0.940987,B,True,3,X
2021-03-31,14,-0.180007,C,False,3,Y
2021-04-30,8,0.277057,A,False,1,Z
2021-05-31,39,0.103807,A,False,1,Z
2021-06-30,4,-0.183913,C,False,3,Z
2021-07-31,2,1.878489,D,False,2,Y
2021-08-31,29,-0.6226,B,True,1,X
2021-09-30,27,-1.504553,A,False,2,Z
2021-10-31,2,0.965521,A,True,2,Z


## Replace values

In [9]:
# Replace values in your dataframe using .replace().
# Step - 1


map_dict = {'A': 'Nice', 'B': 'Super', 'C':'Amazing', 'D':'Incredible'}
map_dict

{'A': 'Nice', 'B': 'Super', 'C': 'Amazing', 'D': 'Incredible'}

In [11]:
# You can parse a Python dictionary that will map the values in the 'to_replace' argument
# Step - 2

df['Col3'].replace(to_replace=map_dict, inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Col3'].replace(to_replace=map_dict, inplace=True)


Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6
2021-01-31,22,-0.839653,Nice,True,1,X
2021-02-28,10,-0.940987,Super,True,3,X
2021-03-31,14,-0.180007,Amazing,False,3,Y
2021-04-30,8,0.277057,Nice,False,1,Z
2021-05-31,39,0.103807,Nice,False,1,Z
2021-06-30,4,-0.183913,Amazing,False,3,Z
2021-07-31,2,1.878489,Incredible,False,2,Y
2021-08-31,29,-0.6226,Super,True,1,X
2021-09-30,27,-1.504553,Nice,False,2,Z
2021-10-31,2,0.965521,Nice,True,2,Z


In [33]:
# Alternate option of Replacing

df['Col6'].replace(to_replace={'X': 1, 'Y': 20, 'Z':4000}, inplace=True)
df

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6
2021-01-31,34,-1.526566,A,True,3,1
2021-02-28,26,0.272796,B,False,2,1
2021-03-31,7,-0.351652,C,True,3,20
2021-04-30,24,-1.483216,A,False,3,4000
2021-05-31,38,-1.516202,A,True,3,4000
2021-06-30,2,0.054614,C,True,1,4000
2021-07-31,43,-0.031262,D,True,1,20
2021-08-31,26,-0.674283,B,True,3,1
2021-09-30,9,0.178446,A,False,3,4000
2021-10-31,24,0.963237,A,False,3,4000


In [51]:
df.dtypes

Col1      int64
Col2    float64
Col3     object
Col4       bool
Col5      int64
Col6      int64
dtype: object

## Change datatype - `.astype()`

In [64]:
df.head(3)

Unnamed: 0,Col1,Col2,Col3,Col4,Col5,Col6
2021-01-31,34,-1.526566,Nice,True,3,1
2021-02-28,26,0.272796,Super,False,2,1
2021-03-31,7,-0.351652,Amazing,True,3,20


In [74]:
df.dtypes

Col1      int64
Col2    float64
Col3     object
Col4       bool
Col5     object
Col6      int64
dtype: object

In [80]:
# Change needs to be saved in DataFrame, else it will not be changed

df = df.astype({'Col5':'object'})
df.dtypes

Col1      int64
Col2    float64
Col3     object
Col4       bool
Col5     object
Col6      int64
dtype: object

## Managing duplicated data

In [13]:
df1 = pd.DataFrame(data={'Name':['John','Marie','Joseph','John','Willian','Marie'],
                        'CustomerRate':[1,2,2,4,3,3]}
                  )
df1

Unnamed: 0,Name,CustomerRate
0,John,1
1,Marie,2
2,Joseph,2
3,John,4
4,Willian,3
5,Marie,3


### `DataFrame.duplicated(subset=None, keep='first')`
    - subset = 'Name to find duplicates in the column: Name, as an example
    - keep = {‘first’, ‘last’, False}, default ‘first’

It is decided for this DataFrame that each Name should be unique. For some reason, John and Marie are repeated, 
even though the CustomerRate is different.

However, in this case, it should have only 1 John and only 1 Marie.
- Use `.duplicated()` to grab rows which are repeated in the column you are interested in.
- Use the subset argument to only consider certain columns for identifying duplicates. By default, it uses all columns to check for 
duplication,but in this case, we are interested only in 'Name'.

In [19]:
# DataFrame.duplicated(subset=None, keep='first')
    # subset - column label or sequence of labels, optional
    # keep - {‘first’, ‘last’, False}, default ‘first’
df1['Name'].duplicated().sum()
#df1.duplicated(subset=['Name']).sum()

2

In [104]:
# To get the duplicated data in a dataframe

df1[df1.duplicated(subset=['Name'])]

Unnamed: 0,Name,CustomerRate
3,John,4
5,Marie,3


In [110]:
# You can drop duplicate rows with the method .drop_duplicates
# DataFrame.drop_duplicates(subset=None, *, keep='first', inplace=False, ignore_index=False)

df1.drop_duplicates(subset=['Name'], inplace=True)
df1

Unnamed: 0,Name,CustomerRate
0,John,1
1,Marie,2
2,Joseph,2
4,Willian,3
