In [1]:
import pandas as pd 
import numpy as np

In [2]:
# Small two-column example table. It contains the values that the cells
# further down replace or correct: 'D' and '?' in Sex, and 290 in Age.
df = pd.DataFrame(
    data={
        'Sex': ['M', 'F', 'F', 'M', 'D', '?'],
        'Age': [29, 30, 24, 290, 25, 67],
    },
)

In [3]:
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,M,290
4,D,25
5,?,67


In [6]:
# List the distinct codes present in 'Sex', in order of first appearance.
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [5]:
# Count how many rows carry each 'Sex' code.
df['Sex'].value_counts()

F    2
M    2
?    1
D    1
Name: Sex, dtype: int64

## Replace a single value


In [14]:

# Swap every 'D' for 'F'. replace() returns a new Series; the result is not
# assigned here, so df itself is left unchanged.
df['Sex'].replace('D','F')

0    M
1    F
2    F
3    M
4    F
5    ?
Name: Sex, dtype: object

## Replace multiple values using the __DICT__ format


In [10]:
# The dict maps each old value to its replacement ('D' -> 'F', '?' -> 'M'),
# so several values are handled in one call. The result is only displayed,
# not stored back into df.
df['Sex'].replace({'D':'F','?':'M'})

0    M
1    F
2    F
3    M
4    F
5    M
Name: Sex, dtype: object

### Replace values in multiple columns 

In [16]:
# Nested mapping: the outer keys pick the column, each inner dict maps
# old value -> new value within that column. replace() returns a new frame,
# which is stored in cleaned_data; df is not modified.
cleaned_data = df.replace(
    to_replace={
        'Sex': {'D': 'F', '?': 'M'},
        'Age': {290: 29},
    }
)

In [17]:
cleaned_data

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,M,29
4,F,25
5,M,67


In [22]:
# Boolean mask: show only the rows whose Age is above 120.
df[df['Age'] > 120]

Unnamed: 0,Sex,Age
3,M,290


In [23]:
# Build the "Age above 120" row mask once, then overwrite just those Age
# cells with a tenth of their value. This writes into df directly, and the
# true division (/) produces float values.
age_over_120 = df['Age'] > 120
df.loc[age_over_120, 'Age'] = df.loc[age_over_120, 'Age'] / 10

In [24]:
df

Unnamed: 0,Sex,Age
0,M,29.0
1,F,30.0
2,F,24.0
3,M,29.0
4,D,25.0
5,?,67.0


## How to identify __DUPLICATE__ values using the __duplicated()__ method

##### The __duplicated()__ method helps to identify duplicate values

In [25]:
# Series of country names labelled by person name. Built from a dict so each
# label sits next to its value; insertion order becomes the row order.
# 'United Kingdom' and 'Germany' occur more than once.
ambassadors = pd.Series({
    'Gerard Araud': 'France',
    'Kim Darroch': 'United Kingdom',
    'Peter Westmacott': 'United Kingdom',
    'Armando Varricchio': 'Italy',
    'Peter Witting': 'Germany',
    'Peter Ammon': 'Germany',
    'Klaus Scharioth': 'Germany',
})

In [26]:
ambassadors

Gerard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Witting                Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [29]:
ambassadors.duplicated()
# By default (keep='first') the first occurrence of each value is marked False
# and every later repeat of that value is marked True.

Gerard Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Witting         False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [31]:
ambassadors.duplicated(keep ='last')
# keep='last': the last occurrence of each value is marked False and every
# earlier repeat of that value is marked True.

Gerard Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Witting          True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

## How to remove __DUPLICATE__ values using the __drop_duplicates()__ method


#### __drop_duplicates()__ 

In [35]:
## Default behaviour: keep the first occurrence of each value, drop later repeats
ambassadors.drop_duplicates()

Gerard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Witting                Germany
dtype: object

In [36]:

## keep='last': retain the last occurrence of each value instead of the first

ambassadors.drop_duplicates(keep ='last')

Gerard Araud                  France
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Klaus Scharioth              Germany
dtype: object

In [40]:
# Player table written row by row as (Name, Post) records.
# Rows 0 and 2 are identical; row 4 repeats the name with a different Post.
players = pd.DataFrame(
    [
        ('Kobe Bryant', 'SG'),
        ('LeBron James', 'SF'),
        ('Kobe Bryant', 'SG'),
        ('Camelo Anthony', 'SF'),
        ('Kobe Bryant', 'SF'),
    ],
    columns=['Name', 'Post'],
)

In [41]:
players

Unnamed: 0,Name,Post
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Camelo Anthony,SF
4,Kobe Bryant,SF


In [45]:
# Compare rows on 'Name' only; with keep='last' every occurrence of a name
# except its final one is flagged True.
players.duplicated(subset = ['Name'], keep ='last')

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [46]:
# No subset is given, so rows must match in every column to count as
# duplicates; only index 2 (a full repeat of index 0) is dropped.
players.drop_duplicates()

Unnamed: 0,Name,Post
0,Kobe Bryant,SG
1,LeBron James,SF
3,Camelo Anthony,SF
4,Kobe Bryant,SF


In [47]:
# Store the frame with fully identical rows removed (first occurrence kept).
# NOTE(review): 'clearn' looks like a typo for 'clean'; the name is left as-is
# because later cells refer to it.
clearn_players_data = players.drop_duplicates()

In [48]:
# 'Kobe Bryant' still appears twice (with different Post values), so a
# Name-only check still flags the earlier occurrence.
clearn_players_data.duplicated(subset = ['Name'], keep ='last')

0     True
1    False
3    False
4    False
dtype: bool

In [49]:
clearn_players_data

Unnamed: 0,Name,Post
0,Kobe Bryant,SG
1,LeBron James,SF
3,Camelo Anthony,SF
4,Kobe Bryant,SF


# __String__ handling when cleaning data: __steps__

In [50]:
## step1: build the DataFrame

# A single column, 'Data', whose strings each pack four fields joined by '_'.
my_data = pd.DataFrame(
    [
        '1987_M_US_1',
        '1990_M_UK_1',
        '1992_F_US_1',
        '1970_M_IT_1',
        '1985_F_IT_2',
    ],
    columns=['Data'],
)

In [51]:

## step2:  display the DataFrame and check the format

my_data


Unnamed: 0,Data
0,1987_M_US_1
1,1990_M_UK_1
2,1992_F_US_1
3,1970_M_IT_1
4,1985_F_IT_2


#### The single column holds four values: __'Year, Sex, Country and number of children'__.
#### They are all packed into the same column and separated by underscores. pandas has
#### a convenient method named __split()__ that we can use in situations like this to fix the data format.

In [53]:
## step3: pick a transformation that suits the format -- the fields are
## separated by '_', so split on it (each row becomes a list of parts)

my_data['Data'].str.split('_')

0    [1987, M, US, 1]
1    [1990, M, UK, 1]
2    [1992, F, US, 1]
3    [1970, M, IT, 1]
4    [1985, F, IT, 2]
Name: Data, dtype: object

In [54]:
## step4: pass expand=True so each part gets its own column, then store the
## result (here back into the same variable)
## NOTE: my_data is overwritten, so the 'Data' column no longer exists
## afterwards; re-running this cell without re-running step1 raises a KeyError.

my_data = my_data['Data'].str.split('_', expand = True)

In [55]:
my_data

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990,M,UK,1
2,1992,F,US,1
3,1970,M,IT,1
4,1985,F,IT,2


In [56]:
## step5: name the columns

my_data.columns = ['Year', 'Sex', 'Country', 'No Children']

In [57]:
my_data

Unnamed: 0,Year,Sex,Country,No Children
0,1987,M,US,1
1,1990,M,UK,1
2,1992,F,US,1
3,1970,M,IT,1
4,1985,F,IT,2


In [58]:
# str.strip() removes leading/trailing whitespace from each value. The result
# is only displayed, not assigned back to my_data.
my_data['Country'].str.strip()

0    US
1    UK
2    US
3    IT
4    IT
Name: Country, dtype: object

## stopped 03:13:50