# 9. DATA CLEANING

# 9.4. CHARACTER ENCODINGS 

# 9.4.1. COURS

## 9.4.1.1. What are encodings?

In [1]:
# modules we'll use
import pandas as pd
import numpy as np

# helpful character encoding module
import chardet

# seed NumPy's global random number generator so that any NumPy-based
# randomness gives the same results on every run
np.random.seed(0)

In [2]:
# start with a text string that contains a non-ASCII character (the euro sign)
before = "This is the euro symbol: €"

# check to see what datatype it is (text in Python is a `str`)
type(before)

str

In [4]:
# encode the string as UTF-8 bytes; errors="replace" asks the codec to
# substitute a placeholder for any character it cannot encode instead of raising
after = before.encode("utf-8", errors="replace")

# check the type: encoding a `str` produces a `bytes` object
type(after)

bytes

In [5]:
# take a look at what the bytes look like: the euro sign appears at the end
# as the three escaped bytes \xe2\x82\xac
after

b'This is the euro symbol: \xe2\x82\xac'

In [7]:
# decode the bytes with the same encoding they were written in (UTF-8)
# to get the original text string back
print(after.decode("utf-8"))

This is the euro symbol: €


In [8]:
# Try to decode our UTF-8 bytes with the ASCII codec. The euro sign was encoded
# as bytes outside the ASCII range, so decoding fails with a UnicodeDecodeError.
# The error is the point of this demonstration, so catch it and print it rather
# than letting an uncaught exception stop a "Restart & Run All".
try:
    print(after.decode("ascii"))
except UnicodeDecodeError as err:
    print(f"{type(err).__name__}: {err}")

UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 25: ordinal not in range(128)

In [9]:
# start with a string
before = "This is the euro symbol: €"

# encode it as ASCII, replacing characters that ASCII cannot represent
# (here the euro sign) instead of raising an error
after = before.encode("ascii", errors = "replace")

# decode the ASCII bytes back into a string
print(after.decode("ascii"))

# We've lost the original underlying byte string! The euro sign's bytes have
# been replaced with the byte for the unknown-character placeholder ("?") :(

This is the euro symbol: ?


## 9.4.1.2. Reading in files with encoding problems

In [11]:
# Try to read in a file that is not UTF-8 encoded. No `encoding` argument is
# passed, and the read fails with a UnicodeDecodeError from the 'utf-8' codec.
# That failure is the demonstration, so catch it and print it rather than
# letting an uncaught exception stop a "Restart & Run All".
# NOTE(review): this is an absolute, machine-specific path -- adjust it to
# wherever the dataset lives in your environment.
kickstarter_path = 'C:/Users/PC Maison/4-KAGGLE/KAGGLE_DEV/KAGGLE_COURS_6-FEATURE_ENGINEERING/kickstarter-projects/input/'
try:
    kickstarter_2016 = pd.read_csv(kickstarter_path + 'ks-projects-201612.csv')
except UnicodeDecodeError as err:
    print(f"{type(err).__name__}: {err}")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x99 in position 11: invalid start byte

In [15]:
# open the file in binary mode and let chardet guess the character encoding
# from the first ten thousand bytes
with open(kickstarter_path + 'ks-projects-201612.csv', 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))

# check what the character encoding might be (chardet reports a guess together
# with a confidence score, not a guarantee)
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}


In [17]:
# read the file again, this time with the encoding that chardet suggested
kickstarter_2016 = pd.read_csv(kickstarter_path + 'ks-projects-201612.csv',
                               encoding='Windows-1252')

# look at the first few lines to check that the text was decoded sensibly
kickstarter_2016.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 11:36:00,1000,2015-08-11 12:12:28,0,failed,0,GB,0,,,,
1,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:20:50,45000,2013-01-12 00:20:50,220,failed,3,US,220,,,,
2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 04:24:11,5000,2012-03-17 03:24:11,1,failed,1,US,1,,,,
3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 01:00:00,19500,2015-07-04 08:35:03,1283,canceled,14,US,1283,,,,
4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01 13:38:27,50000,2016-02-26 13:38:27,52375,successful,224,US,52375,,,,


## 9.4.1.3. Saving your files with UTF-8 encoding

In [18]:
# save our file (will be saved as UTF-8 by default!)
# NOTE(review): the output name says "201801" but the frame was loaded from
# 'ks-projects-201612.csv' -- confirm that this file name is intended
kickstarter_2016.to_csv("ks-projects-201801-utf8.csv")

# 9.4.2. EXERCICES

In [20]:
import pandas as pd
import numpy as np

In [21]:
# helpful character encoding module
import chardet

In [22]:
# seed NumPy's global random number generator so that any NumPy-based
# randomness gives the same results on every run
np.random.seed(0)

## 9.4.2.1. What are encodings?

In [23]:
# raw bytes of a sample entry (not yet decoded into text)
sample_entry = b'\xa7A\xa6n'
# show the raw bytes and confirm that the object is `bytes`, not `str`
print(sample_entry)
print('data type:', type(sample_entry))

b'\xa7A\xa6n'
data type: <class 'bytes'>


In [24]:
# decode the raw bytes into a string with the 'big5-tw' codec, then re-encode
# that string as UTF-8 bytes (errors="replace" applies to the encode step only)
new_entry = sample_entry.decode('big5-tw').encode("utf-8", errors="replace")

## 9.4.2.2. Reading in files with encoding problems

In [29]:
# Try to read the file without specifying an encoding. The read fails with a
# UnicodeDecodeError from the 'utf-8' codec because the file is not UTF-8.
# That failure motivates the next cells, so catch it and print it rather than
# letting an uncaught exception stop a "Restart & Run All".
# NOTE(review): this is an absolute, machine-specific path -- adjust it to
# wherever the dataset lives in your environment.
police_killings_path = 'C:/Users/PC Maison/4-KAGGLE/KAGGLE_DEV/KAGGLE_COURS_9-DATA_CLEANING/fatal-police-shootings-in-the-us/input/'
try:
    police_killings = pd.read_csv(police_killings_path + 'PoliceKillingsUS.csv')
except UnicodeDecodeError as err:
    print(f"{type(err).__name__}: {err}")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x96 in position 2: invalid start byte

In [27]:
# (Optional) Use this code cell for any additional work.
# open the file in binary mode and look at the first 200,000 bytes
# to guess the character encoding
with open(police_killings_path + 'PoliceKillingsUS.csv', 'rb') as rawdata:
    result = chardet.detect(rawdata.read(200000))

# check what the character encoding might be
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}


In [30]:
# load the DataFrame correctly by passing the encoding that chardet
# suggested (Windows-1252) instead of relying on the default
police_killings = pd.read_csv(police_killings_path + 'PoliceKillingsUS.csv',
                             encoding='Windows-1252')


## 9.4.2.3. Saving your files with UTF-8 encoding

In [31]:
# save the correctly decoded DataFrame to a CSV file in the current working
# directory (no encoding is passed, so the pandas default is used)
police_killings.to_csv("my_file.csv")