https://www.datacamp.com/courses/cleaning-data-in-python
# Question: for Line 148, why the number size column is different from its original data?

In [139]:
import pandas as pd
import re
import numpy as np

In [140]:
tips = pd.read_csv('datasets/tips.csv')
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_dollar
0,16.99,1.01,Female,No,Sun,Dinner,2,$16.99
1,10.34,1.66,Male,No,Sun,Dinner,3,$10.34
2,21.01,3.5,Male,No,Sun,Dinner,3,$21.01
3,23.68,3.31,Male,No,Sun,Dinner,2,$23.68
4,24.59,3.61,Female,No,Sun,Dinner,4,$24.59


In [141]:
tips.dtypes

total_bill      float64
tip             float64
sex              object
smoker           object
day              object
time             object
size              int64
total_dollar     object
dtype: object

## 1) Converting object to category: Series = Series.astype('category')
- can make the DataFrame smaller.


In [142]:
tips.sex = tips.sex.astype('category')

In [143]:
tips.dtypes

total_bill       float64
tip              float64
sex             category
smoker            object
day               object
time              object
size               int64
total_dollar      object
dtype: object

## 2) Converting numeric numbers to object: Series = Series.astype(str)

In [144]:
tips.size = tips.size.astype(str)

In [145]:
tips.dtypes

total_bill       float64
tip              float64
sex             category
smoker            object
day               object
time              object
size              object
total_dollar      object
dtype: object

## 3) Converting strings to numeric values
- errors='coerce': if we don't have this argument, then python will return an error, because it will not know how to turn the string, dash, into a numeric value.

In [146]:
tips.size = pd.to_numeric(tips.size, errors='coerce')

In [147]:
tips.dtypes

total_bill       float64
tip              float64
sex             category
smoker            object
day               object
time              object
size               int64
total_dollar      object
dtype: object

In [148]:
tips.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,total_dollar
0,16.99,1.01,Female,No,Sun,Dinner,1952,$16.99
1,10.34,1.66,Male,No,Sun,Dinner,1952,$10.34
2,21.01,3.5,Male,No,Sun,Dinner,1952,$21.01
3,23.68,3.31,Male,No,Sun,Dinner,1952,$23.68
4,24.59,3.61,Female,No,Sun,Dinner,1952,$24.59


# 2. Using regular expressions to clean strings
- import re

In [149]:
# 1) \d*: represents any digit, followed by an asterisk to match it 0 or more times. 12345678901
# 2) \$\d*: $12345678901
# 3) \$\d*\.\d*: the period matches any 1 character. $12345678901.42
# 4) \$\d*\.\d{2}: {2} is to match exactly 2 digits. $12345678901.24
# 5) ^\$\d*\.\d{2}$: The caret will tell the pattern to start the pattern match at the beginning of the value. 
# The dollar sign will tell the pattern to match at the end of the value. $12345678901.999.

In [150]:
pattern = re.compile('\$\d*\.\d{2}')
result = pattern.match('$17.89')

In [151]:
result

<re.Match object; span=(0, 6), match='$17.89'>

In [152]:
bool(result)

True

### Practice 2

In [153]:
phone = re.compile('\d{3}-\d{3}-\d{4}')
result1 = phone.match('123-456-7890')
result1

<re.Match object; span=(0, 12), match='123-456-7890'>

In [154]:
bool(result1)

True

#### re.findall( ): to extract multiple numbers

In [155]:
re.findall('\d+', 'the recipe calls for 6 strawberries and 2 bananas')

['6', '2']

In [156]:
bool(re.match(pattern='\d{3}-\d{3}-\d{4}', string='123-456-7890'))

True

In [157]:
bool(re.match(pattern='\$\d*\.\d{2}', string='$123.45'))

True

#### [A-Z]: to match any capital letter, followed by \w* to match an arbitrary number of alphanumeric characters.

In [158]:
bool(re.match(pattern='[A-Z]\w*', string='Australi23a'))

True

# 3. Using functions to clean data