In [2]:
import numpy as np
import pandas as pd

In [2]:
# Small demo frame; the 'D' and '?' codes in Sex and the 290 in Age are the
# values the cleaning cells that follow operate on.
df = pd.DataFrame(
    {
        'Sex': ['M', 'F', 'F', 'D', '?'],
        'Age': [29, 30, 24, 290, 25],
    }
)

In [3]:
# Show the raw frame before any cleaning is applied.
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,290
4,?,25


In [4]:
# List the distinct values in 'Sex' (in order of first appearance) to spot unexpected codes.
df['Sex'].unique()

array(['M', 'F', 'D', '?'], dtype=object)

In [5]:
# Swap every 'D' for 'F'. replace() returns a new Series; since the result is
# not assigned, df itself is left unchanged.
df['Sex'].replace('D', 'F')

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [7]:
# A dict maps several old values to new ones in a single call.
# NOTE(review): 'N' never occurs in this column, so that entry is a no-op and
# the '?' placeholder is left as-is -- confirm whether '?' was the intended key.
df['Sex'].replace({'D': 'F', 'N': 'M'})

0    M
1    F
2    F
3    F
4    ?
Name: Sex, dtype: object

In [9]:
# Nested-dict form: outer keys pick the column, inner dicts map old -> new values.
# The result is a new DataFrame; df is not modified because nothing is assigned.
# NOTE(review): 'N' does not occur in 'Sex', so only 'D' -> 'F' and 290 -> 29
# take effect and '?' stays untouched -- confirm that is intended.
df.replace({'Sex':{'D':'F', 'N':'M'},'Age':{290:29}})

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,F,29
4,?,25


In [10]:
# Boolean-mask filter: keep only the rows whose Age exceeds 100.
df[df['Age'] > 100]

Unnamed: 0,Sex,Age
3,D,290


In [11]:
# Scale ages above 100 down by a factor of ten (290 -> 29), in place.
# The augmented assignment evaluates the mask only once, and floor division
# keeps the values integral instead of routing them through the floats that
# true division would produce before writing back into the integer column.
df.loc[df['Age'] > 100, 'Age'] //= 10

In [12]:
# Age in row 3 now reads 29, but Sex is still 'D': the replace() calls above
# returned new objects that were never assigned back to df.
df

Unnamed: 0,Sex,Age
0,M,29
1,F,30
2,F,24
3,D,29
4,?,25


## Duplicates

In [4]:
# Countries keyed by person name; several countries occur more than once,
# which is what the duplicated() cells that follow inspect.
ambassaddors = pd.Series(
    data=['France', 'Italy', 'United Kingdom', 'United Kingdom', 'Germany', 'Germany', 'Germany'],
    index=['Gerald Ara', 'Kim Darro', 'Peter Wes', 'Armando Varri', 'Peter Witti', 'Peter Ammon', 'Klaus Sc'],
)

In [5]:
# Display the Series: person names form the index, countries are the values.
ambassaddors

Gerald Ara               France
Kim Darro                 Italy
Peter Wes        United Kingdom
Armando Varri    United Kingdom
Peter Witti             Germany
Peter Ammon             Germany
Klaus Sc                Germany
dtype: object

In [7]:
# Default keep='first': the first occurrence of each country is False and
# every later repeat is flagged True.
ambassaddors.duplicated()

Gerald Ara       False
Kim Darro        False
Peter Wes        False
Armando Varri     True
Peter Witti      False
Peter Ammon       True
Klaus Sc          True
dtype: bool

In [8]:
# keep='last': only the final occurrence of each value is left unflagged;
# all earlier occurrences are marked True.
ambassaddors.duplicated(keep='last')

Gerald Ara       False
Kim Darro        False
Peter Wes         True
Armando Varri    False
Peter Witti       True
Peter Ammon       True
Klaus Sc         False
dtype: bool

In [10]:
# keep=False: flag every member of a repeated group, sparing no occurrence.
ambassaddors.duplicated(keep=False)

Gerald Ara       False
Kim Darro        False
Peter Wes         True
Armando Varri     True
Peter Witti       True
Peter Ammon       True
Klaus Sc          True
dtype: bool

## Duplicates in DataFrames

In [11]:
# Rows 0 and 2 are exact copies; row 4 repeats the name with a different Pos,
# so whole-row and Name-only duplicate checks give different answers.
players = pd.DataFrame(
    {
        'Name': ['Kobe Bryant', 'LeBron James', 'Kobe Bryant', 'Carmelo Anthony', 'Kobe Bryant'],
        'Pos': ['SG', 'SF', 'SG', 'SF', 'SF'],
    }
)

In [12]:
# Display the frame: rows 0 and 2 match in both columns, row 4 matches on Name only.
players

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
2,Kobe Bryant,SG
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [16]:
# With no subset, a row counts as a duplicate only if it matches an earlier row
# in every column -- here just row 2 (a copy of row 0).
players.duplicated()

0    False
1    False
2     True
3    False
4    False
dtype: bool

In [17]:
# Compare on 'Name' only: rows 2 and 4 are flagged as repeats of row 0.
players.duplicated(subset=['Name'])

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [18]:
# Name-only comparison again, but the last occurrence (row 4) is the one left
# unflagged; the earlier rows 0 and 2 are marked instead.
players.duplicated(subset=['Name'], keep='last')

0     True
1    False
2     True
3    False
4    False
dtype: bool

In [20]:
# Drop rows that are identical in every column, keeping the first of each group.
# Returns a new frame (players is unchanged) and keeps the original index labels.
players.drop_duplicates()

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


In [21]:
# One row per Name, keeping the first occurrence of each.
players.drop_duplicates(subset=['Name'])

Unnamed: 0,Name,Pos
0,Kobe Bryant,SG
1,LeBron James,SF
3,Carmelo Anthony,SF


In [22]:
# One row per Name, this time keeping the last occurrence of each.
players.drop_duplicates(subset=['Name'], keep='last')

Unnamed: 0,Name,Pos
1,LeBron James,SF
3,Carmelo Anthony,SF
4,Kobe Bryant,SF


## Splitting Columns

In [31]:
# Each record packs four '_'-separated fields into a single string.
# The space in '1842_M_ IT_2' is kept as-is (presumably deliberate, so the
# whitespace-cleaning cells further down have something to fix).
# Note: this rebinds df, replacing the Sex/Age frame used earlier.
df = pd.DataFrame(
    {
        'Data': [
            '1931_M_UK_1',
            '1842_M_ IT_2',
            '1564_F_FR_1',
            '1453_M_TR_5',
            '1985_F_US_1',
        ],
    }
)

In [32]:
# Display the packed strings; note the stray space before 'IT' in row 1.
df

Unnamed: 0,Data
0,1931_M_UK_1
1,1842_M_ IT_2
2,1564_F_FR_1
3,1453_M_TR_5
4,1985_F_US_1


In [33]:
# Split each string on '_'; the result is a Series whose elements are Python lists.
df['Data'].str.split('_')

0     [1931, M, UK, 1]
1    [1842, M,  IT, 2]
2     [1564, F, FR, 1]
3     [1453, M, TR, 5]
4     [1985, F, US, 1]
Name: Data, dtype: object

In [34]:
# expand=True spreads the pieces into separate columns (labelled 0..3)
# instead of returning one list per row.
df['Data'].str.split('_', expand=True)

Unnamed: 0,0,1,2,3
0,1931,M,UK,1
1,1842,M,IT,2
2,1564,F,FR,1
3,1453,M,TR,5
4,1985,F,US,1


In [35]:
# Keep the expanded four-column frame under the same name.
# Caution: after this rebind df no longer has a 'Data' column, so re-running
# this cell without first re-running the cell that builds df raises KeyError.
df = df['Data'].str.split('_', expand=True)

In [37]:
# Replace the default positional labels (0-3) with descriptive column names.
df.columns = ['Year', 'Sex', 'Country', 'No Children']

In [38]:
# Every column holds strings after the split ('Year' and 'No Children' included),
# and Country in row 1 still carries its leading space (' IT').
df

Unnamed: 0,Year,Sex,Country,No Children
0,1931,M,UK,1
1,1842,M,IT,2
2,1564,F,FR,1
3,1453,M,TR,5
4,1985,F,US,1


In [39]:
# Check whether any Year value contains a literal '?'.
# The pattern is interpreted as a regex, so '?' has to be escaped. A raw
# string keeps the backslash intact; '\?' in a plain literal is an invalid
# escape sequence that Python only tolerates with a
# DeprecationWarning/SyntaxWarning.
df['Year'].str.contains(r'\?')

0    False
1    False
2    False
3    False
4    False
Name: Year, dtype: bool

In [40]:
# Substring test: True where Country contains an uppercase 'U' anywhere
# (matching is case-sensitive by default).
df['Country'].str.contains('U')

0     True
1    False
2    False
3    False
4     True
Name: Country, dtype: bool

In [41]:
# Trim leading/trailing whitespace (turns ' IT' into 'IT').
# The result is only displayed; df['Country'] itself is not updated.
df['Country'].str.strip()

0    UK
1    IT
2    FR
3    TR
4    US
Name: Country, dtype: object

In [43]:
# Remove every space character, interior ones included (unlike strip(), which
# only trims the ends). The result is not assigned back to df.
df['Country'].str.replace(' ', '')

0    UK
1    IT
2    FR
3    TR
4    US
Name: Country, dtype: object

In [44]:
# Drop a '?' that directly follows four digits, keeping only the digits
# captured in the named group 'year'.
# regex=True is passed explicitly: a callable replacement is only accepted in
# regex mode, and pandas >= 2.0 defaults to regex=False (older versions emit a
# FutureWarning when the default is relied upon).
df['Year'].str.replace(r'(?P<year>\d{4})\?', lambda m: m.group('year'), regex=True)

  df['Year'].str.replace(r'(?P<year>\d{4})\?', lambda m: m.group('year'))


0    1931
1    1842
2    1564
3    1453
4    1985
Name: Year, dtype: object