# CHARACTER ENCODING

In [1]:
# MODULES WE'LL USE
import pandas as pd 
import numpy as np

In [2]:
# helpful character encoding module
import chardet

In [4]:
# set sed for reproducibility
np.random.seed(0)

In [6]:
'''
  What are encodings?
  Character encodings are specific sets of rules for mapping from raw binary byte strings (that look like this: 0110100001101001) to characters that make up human-readable text (like "hi"). '''

'\n  What are encodings?\n  Character encodings are specific sets of rules for mapping from raw binary byte strings (that look like this: 0110100001101001) to characters that make up human-readable text (like "hi"). '

In [7]:
#ke" (said like mo-gee-bah-kay).example of mojibake: æ–‡å—åŒ–ã??
# "unknown" characters . example of unknown :  ����������

In [9]:
""" There are lots of different character encodings, but the main one you need to know is UTF-8.

    UTF-8 is the standard text encoding. All Python code is in UTF-8 and, ideally, all your data should be as well. It's when things aren't in UTF-8 that you run into trouble."""

" There are lots of different character encodings, but the main one you need to know is UTF-8.\n\n    UTF-8 is the standard text encoding. All Python code is in UTF-8 and, ideally, all your data should be as well. It's when things aren't in UTF-8 that you run into trouble."

In [26]:
# start with a string
before = 'this is the dolar symbol: €'
# press alt - 0128 to get '€'

In [27]:
before

'this is the dolar symbol: €'

In [28]:
type(before)

str

In [21]:
#'The other data is the bytes data type, which is a sequence of integers. 
# You can convert a string into bytes by specifying which encoding it's in':

In [29]:
# encode it to be a different encoding, replacing characters that raise errors
after = before.encode('utf-8', errors = 'replace')

In [30]:
type(after)

bytes

In [31]:
after

b'this is the dolar symbol: \xe2\x82\xac'

In [32]:
# when we convert our bytes back to a string with the correct encoding, 
# we can see that our text is all there correctly.

In [33]:
# convert it back to utf-8
print(after.decode("utf-8"))

this is the dolar symbol: €


In [34]:
# try to decode our bytes with the ascii encoding
print(after.decode('ascii'))

UnicodeDecodeError: 'ascii' codec can't decode byte 0xe2 in position 26: ordinal not in range(128)

In [35]:
# strings are UTF-8 by default in Python 3

In [36]:
'''The dangerous part about this is that there's not way to tell which character it should have been. That means we may have just made our data unusable!'''

"The dangerous part about this is that there's not way to tell which character it should have been. That means we may have just made our data unusable!"

In [37]:
# start with a string
before = 'This is the euro symbol: €'

In [38]:
# encode it to a different encoding, replacing characters that raise errors
after = before.encode('ascii', errors='replace')

In [39]:
# convert it back to utf-8
print(after.decode('ascii'))

This is the euro symbol: ?


In [40]:
# we have lost the original underlying byte string!
# it's been replaced with the above byte string for unknown character:

### READING IN FILES WITH ENCODING PROBLEMS

In [42]:
# try to read in a file not in utf-8
kickstarter_2016 = pd.read_csv('ks-projects-201612.csv')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x99 in position 11: invalid start byte

In [43]:
"""Notice that we get the same UnicodeDecodeError we got when we tried to decode UTF-8 bytes as if they were ASCII! This tells us that this file isn't actually UTF-8"""

"Notice that we get the same UnicodeDecodeError we got when we tried to decode UTF-8 bytes as if they were ASCII! This tells us that this file isn't actually UTF-8"

In [45]:
# look at the first ten thousand bytes to guess the character encoding
with open('ks-projects-201612.csv', 'rb') as rawdata:
    result = chardet.detect(rawdata.read(10000))

In [46]:
print(result)

{'encoding': 'Windows-1252', 'confidence': 0.73, 'language': ''}


In [47]:
# so chardet is 73% confidence that the right encoding is "windows-1252"

In [48]:
# read in the file with the encoding detected by chardet
kickstarter_2016 = pd.read_csv('ks-projects-201612.csv', encoding='Windows-1252')

kickstarter_2016.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 11:36:00,1000,2015-08-11 12:12:28,0,failed,0,GB,0,,,,
1,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:20:50,45000,2013-01-12 00:20:50,220,failed,3,US,220,,,,
2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 04:24:11,5000,2012-03-17 03:24:11,1,failed,1,US,1,,,,
3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 01:00:00,19500,2015-07-04 08:35:03,1283,canceled,14,US,1283,,,,
4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01 13:38:27,50000,2016-02-26 13:38:27,52375,successful,224,US,52375,,,,


In [49]:
# chardet was right , the file reads in with no problem

### saving your files with UTF-8 encoding

In [51]:
# save our file (will be saved as UTF-8 by default!)
kickstarter_2016.to_csv('ks-projects-201612-utf8.csv')

In [55]:
ks = pd.read_csv('ks-projects-201612.csv')
# here we got error

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x99 in position 11: invalid start byte

In [58]:
ks2= pd.read_csv('ks-projects-201612-utf8.csv')
ks2
# here we didnt get error

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0.1,Unnamed: 0,ID,name,category,main_category,currency,deadline,goal,launched,pledged,state,backers,country,usd pledged,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16
0,0,1000002330,The Songs of Adelaide & Abullah,Poetry,Publishing,GBP,2015-10-09 11:36:00,1000,2015-08-11 12:12:28,0,failed,0,GB,0,,,,
1,1,1000004038,Where is Hank?,Narrative Film,Film & Video,USD,2013-02-26 00:20:50,45000,2013-01-12 00:20:50,220,failed,3,US,220,,,,
2,2,1000007540,ToshiCapital Rekordz Needs Help to Complete Album,Music,Music,USD,2012-04-16 04:24:11,5000,2012-03-17 03:24:11,1,failed,1,US,1,,,,
3,3,1000011046,Community Film Project: The Art of Neighborhoo...,Film & Video,Film & Video,USD,2015-08-29 01:00:00,19500,2015-07-04 08:35:03,1283,canceled,14,US,1283,,,,
4,4,1000014025,Monarch Espresso Bar,Restaurants,Food,USD,2016-04-01 13:38:27,50000,2016-02-26 13:38:27,52375,successful,224,US,52375,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
323745,323745,999976400,ChknTruk Nationwide Charity Drive 2014 (Canceled),Documentary,Film & Video,USD,2014-10-17 02:35:30,50000,2014-09-17 02:35:30,25,canceled,1,US,25,,,,
323746,323746,999977640,The Tribe,Narrative Film,Film & Video,USD,2011-07-19 03:35:14,1500,2011-06-22 03:35:14,155,failed,5,US,155,,,,
323747,323747,999986353,Walls of Remedy- New lesbian Romantic Comedy f...,Narrative Film,Film & Video,USD,2010-08-16 05:59:00,15000,2010-07-01 19:40:30,20,failed,1,US,20,,,,
323748,323748,999987933,BioDefense Education Kit,Technology,Technology,USD,2016-02-13 02:00:00,15000,2016-01-13 18:13:53,200,failed,6,US,200,,,,
