Cleaning Not-Null values
    - Sometimes data contains invalid values that are not simply missing values such as None or NaN

In [1]:
import numpy as np
import pandas as pd 

In [2]:
# Toy dataset: nothing is missing, but some entries are invalid.
#   - 'Sex' should only ever be 'M' or 'F', yet 'D' and '?' appear
#   - an 'Age' of 290 is not a plausible value
df = pd.DataFrame(
    [('M', 29), ('F', 30), ('F', 24), ('D', 290), ('?', 25)],
    columns=['Sex', 'Age'],
)

df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


Finding Unique Values 
    - After noticing invalid data, identify the offending values so they can be handled (replaced, removed, etc.)
    - When it comes to a "category" of a field like Sex -> only takes M or F
      - Start by analyzing the variety of values present 
        - unique() method

In [3]:
# List every distinct value that occurs in the 'Sex' column.
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [4]:
df['Sex'].value_counts()

# Only 'M' and 'F' are expected, so the 'D' and '?' entries are suspect;
# they can be corrected with the replace function.


Sex
F    2
M    1
D    1
?    1
Name: count, dtype: int64

In [5]:
# replace() returns a new Series in which every 'D' has become 'F'.
# The result is only displayed here, not assigned back to df.
df['Sex'].replace(to_replace='D', value='F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [6]:
# replace() also accepts an {old: new} mapping.
# Example: besides 'D' -> 'F', you were told a few 'N's may exist that should be 'M's.
# (No 'N' occurs in this sample, so that rule changes nothing here.)
df['Sex'].replace(to_replace={'D': 'F', 'N': 'M'})

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [7]:
# With several columns to fix, call replace() on the whole DataFrame and
# nest the mapping as {column: {old: new}}.
df.replace(to_replace={
    'Sex': {'D': 'F', 'N': 'M'},
    'Age': {290: 29},
})

# 'D', 'N' and 290 are swapped for their corrected values in the returned frame

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,29
4,?,25


In [8]:
# Goal: drop the stray trailing zero from implausible ages
# (for example 150 -> 15, 490 -> 49).

# Step 1: choose the cutoff for a "not possible" age,
# then use a boolean mask to select the offending rows.

df.loc[df['Age'].gt(100)]  # anything above 100 is not possible

Unnamed: 0,Sex,Age
3,D,290


In [9]:
# Step 2: divide the implausible ages by 10 (e.g. 290 -> 29).
# Floor division (//) keeps the values integers, matching the column's
# integer dtype; true division (/) would produce floats that then have to be
# cast back when assigned into the column.
too_old = df['Age'] > 100  # boolean mask, computed once and reused on both sides
df.loc[too_old, 'Age'] = df.loc[too_old, 'Age'] // 10

# every age above 100 has now been divided by 10

In [10]:
# Row 3 now has Age 29. 'Sex' still contains 'D' and '?' because the earlier
# replace() calls were only displayed, never assigned back to df.
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,29
4,?,25


Duplicates
    - Checking duplicates behaves differently between Series and DataFrames
    Two most important methods to deal with duplicates
        1) duplicated (that will tell you which values are duplicates)
        2) drop_duplicates (which will just get rid of duplicates)

In [13]:
# Series example: throwing a fancy party and inviting ambassadors from Europe.
# Only one ambassador per country can be invited.

# Original list. Issue: both the UK and Germany have duplicated ambassadors.
# Index = ambassador name, value = country represented.
# Index labels carry no surrounding whitespace so that label-based lookups
# (e.g. ambassadors['Klaus Scharioth']) work as expected.
ambassadors = pd.Series([
    'France',
    'United Kingdom',
    'United Kingdom',
    'Italy',
    'Germany',
    'Germany',
    'Germany',
], index=[
    'Gérard Araud',
    'Kim Darroch',
    'Peter Westmacott',
    'Armando Varricchio',
    'Peter Wittig',
    'Peter Ammon',
    'Klaus Scharioth',
])

In [14]:
# Index = ambassador name, value = country represented.
ambassadors

Gérard Araud                  France
Kim Darroch           United Kingdom
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
Peter Ammon                  Germany
Klaus Scharioth              Germany
dtype: object

In [15]:
# The two key tools for duplicates:
#   1) duplicated      -> boolean mask flagging which values are duplicates
#   2) drop_duplicates -> returns the data with the duplicates removed
# keep='first' is the default: the first occurrence of each value is not flagged.

ambassadors.duplicated(keep='first')

Gérard Araud          False
Kim Darroch           False
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig          False
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [16]:
# Case: duplicated() did not flag 'Kim Darroch' (the first United Kingdom entry)
# or 'Peter Wittig' (the first Germany entry) as duplicates.
# By default, the first occurrence of each value is treated as the non-duplicate.
# Change this behavior with the keep parameter:

ambassadors.duplicated(keep='last')

Gérard Araud          False
Kim Darroch            True
Peter Westmacott      False
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth       False
dtype: bool

In [17]:
# Case: with keep='last' the result is "flipped":
# 'Kim Darroch' and 'Peter Wittig' (the first ambassadors of their countries) are flagged as duplicates,
# while 'Peter Westmacott' and 'Klaus Scharioth' (the last ones) are not.
# To flag every member of a duplicated group, use keep=False:

ambassadors.duplicated(keep=False)

Gérard Araud          False
Kim Darroch            True
Peter Westmacott       True
Armando Varricchio    False
Peter Wittig           True
Peter Ammon            True
Klaus Scharioth        True
dtype: bool

In [18]:
# drop_duplicates is the companion method: it returns the Series without the
# duplicated values and accepts the same keep parameter.
# keep='first' is the default, so each country's first ambassador survives.

ambassadors.drop_duplicates(keep='first')

Gérard Araud                  France
Kim Darroch           United Kingdom
Armando Varricchio             Italy
Peter Wittig                 Germany
dtype: object

In [19]:
# keep='last': retain the last ambassador listed for each country.
ambassadors.drop_duplicates(keep='last')

Gérard Araud                  France
Peter Westmacott      United Kingdom
Armando Varricchio             Italy
Klaus Scharioth              Germany
dtype: object

In [20]:
# keep=False: drop every country that appears more than once.
ambassadors.drop_duplicates(keep=False)

Gérard Araud          France
Armando Varricchio     Italy
dtype: object

Duplicates in DataFrames
    - Happen at the "row" level 
    - Two rows with the same values are considered to be duplicates

In [21]:
# One (Name, Pos) record per row; 'Kobe Bryant' is listed three times,
# twice as 'SG' and once as 'SF'.
players = pd.DataFrame(
    [
        ('Kobe Bryant', 'SG'),
        ('LeBron James', 'SF'),
        ('Kobe Bryant', 'SG'),
        ('Carmelo Anthony', 'SF'),
        ('Kobe Bryant', 'SF'),
    ],
    columns=['Name', 'Pos'],
)

In [22]:
# 'Kobe Bryant' appears in rows 0, 2 and 4 (positions SG, SG and SF).
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [23]:
# From the previous DataFrame we can clearly see that Kobe is duplicated,
# but he appears with two different positions.
# duplicated() compares whole rows, so only row 2 (an exact copy of row 0) is flagged;
# row 4 (Kobe as 'SF') is reported as False.

players.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [24]:
# By default a row is a duplicate only if ALL of its column values match
# an earlier row. Restrict the comparison with the subset parameter:

players.duplicated(subset='Name')

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [25]:
# The same keep rules still apply; with keep='last' the final occurrence
# of each name is the one that is not flagged:

players.duplicated(subset=['Name'], keep='last')

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [26]:
# drop_duplicates takes the same parameters (subset, keep).
# With no arguments, only rows identical in every column are removed:

players.drop_duplicates()

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [27]:
# Compare on 'Name' only: keep the first row for each player.
players.drop_duplicates(subset=['Name'])

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF


In [28]:
# Compare on 'Name' only: keep the last row for each player.
players.drop_duplicates(subset=['Name'], keep='last')

Unnamed: 0,Name,Pos
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


Text Handling
    - Cleaning text values can be incredibly hard
          - Invalid text values

Splitting Columns

In [29]:
# result of a survey is loaded 

# Raw survey records: each entry packs several underscore-separated fields
# into a single string, with stray spaces and '?' marks left in.
df = pd.DataFrame(
    [
        '1987_M_US _1',
        '1990?_M_UK_1',
        '1992_F_US_2',
        '1970?_M_   IT_1',
        '1985_F_I  T_2',
    ],
    columns=['Data'],
)

In [30]:
# A single 'Data' column holding one underscore-separated string per row.
df

Unnamed: 0,Data
0,1987_M_US _1
1,1990?_M_UK_1
2,1992_F_US_2
3,1970?_M_ IT_1
4,1985_F_I T_2


In [31]:
# The single column actually encodes four values: year, sex, country and
# number of children, all packed together and separated by underscores.

# pandas offers str.split for exactly this situation; without expand,
# each row becomes a list of the pieces:
df['Data'].str.split(pat='_')

0       [1987, M, US , 1]
1       [1990?, M, UK, 1]
2        [1992, F, US, 2]
3    [1970?, M,    IT, 1]
4      [1985, F, I  T, 2]
Name: Data, dtype: object

In [32]:
# expand=True spreads the pieces into separate columns instead of lists.
df['Data'].str.split('_', expand=True)

Unnamed: 0,0,1,2,3
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [33]:
# Replace df with the expanded, one-field-per-column frame.
# NOTE(review): this overwrites df, so re-running this cell fails because the
# 'Data' column no longer exists; re-run the cell that builds df first.
df = df['Data'].str.split('_', expand=True)

In [34]:
# Give the four positional columns (0-3) descriptive names.
df.columns = ['Year', 'Sex', 'Country', 'No Children']

In [35]:
# Columns are now named, but the values are still the raw split strings
# (e.g. '1990?' in Year, stray spaces in Country).
df

Unnamed: 0,Year,Sex,Country,No Children
0,1987,M,US,1
1,1990?,M,UK,1
2,1992,F,US,2
3,1970?,M,IT,1
4,1985,F,I T,2


In [36]:
# Flag the years that carry a trailing '?'.
# str.contains() treats its first argument as a regular expression, so the
# literal '?' has to be escaped. The raw string (r'...') passes the backslash
# through untouched and avoids Python's invalid-escape-sequence warning for '\?'.
df['Year'].str.contains(r'\?')

0    False
1     True
2    False
3     True
4    False
Name: Year, dtype: bool

In [37]:
# contains takes a regex pattern as its first argument,
# which is why the '?' symbol had to be escaped: it has a special meaning in patterns.

# Regular letters don't need escaping:
df['Country'].str.contains('U')

0     True
1     True
2     True
3    False
4    False
Name: Country, dtype: bool

In [38]:
# Removing blank spaces (like in 'US ' or 'I  T'):
# strip (lstrip and rstrip also exist) removes leading/trailing whitespace only,
# so 'US ' and '   IT' are fixed here but the inner spaces of 'I  T' remain;
# replace (next cell) handles those.

df['Country'].str.strip()

0      US
1      UK
2      US
3      IT
4    I  T
Name: Country, dtype: object

In [39]:
# Remove every space, including the inner ones that strip() leaves behind.
# The pattern is a plain literal, so regex matching is explicitly switched off.
df['Country'].str.replace(' ', '', regex=False)

0    US
1    UK
2    US
3    IT
4    IT
Name: Country, dtype: object

In [49]:
# Both replace and contains understand regex patterns,
# which makes bulk clean-ups easy.
# Here: a 4-digit year followed by '?' is replaced by just the captured year.

df['Year'] = df['Year'].str.replace(
    r'(?P<year>\d{4})\?',
    lambda match: match.group('year'),
    regex=True,
)