In [138]:
# chr() turns a code point into its character and ord() does the reverse.
# The snake is spelled with the \N{SNAKE} name escape: ord() accepts exactly one
# character, and a wrongly-decoded emoji literal would be several characters.
print('Python')
print(chr(128013))
print(ord('\N{SNAKE}'))

Python
🐍
128013


In [None]:
# Unicode

# To understand Unicode, let's first look at ASCII
# ASCII
# ASCII dates from the 1960s and is one of the oldest character encodings still in use.
# In ASCII, every letter, digit, and symbol that mattered (a-z, A-Z, 0–9, +, -, /, ", ! etc.)
# was represented as a number between 32 and 127. Most computers used 8-bit bytes then.
# This meant each byte could store 2⁸ = 256 different values (0 to 255).
# So each byte (or unit of storage) had more than enough space to store the basic set of English characters

In [42]:
# How many distinct values fit in 1 byte (8 bits)?
# And in 2, 3, 4, 5 and 6 bytes?

for byte_count in range(1, 7):
    print(2 ** (8 * byte_count))  # byte_count bytes = 8 * byte_count bits

# One byte holds 2⁸ = 256 values (0 to 255); ASCII itself only needs 0 to 127

256
65536
16777216
4294967296
1099511627776
281474976710656


In [None]:
# In order to accommodate non-English characters,
# people started going a little crazy on how to use the numbers from 128 to 255 still available in a single byte.
# Different people would use different characters for the same numbers.
# Obviously, not only was it the wild wild west, but it quickly dawned that the extra available numbers
# could not even come close to representing the complete set of characters for some languages

In [None]:
# So Unicode was born
# Unicode was a brave attempt to create a single character set that could represent every character
# in every imaginable language.

In [None]:
# What is a Unicode
# Unicode (https://www.unicode.org/) is a specification that aims to list every character 
# used by human languages and give each character its own unique code.

# eg.
# For example, there’s a character for “Roman Numeral One”, ‘Ⅰ’, that’s separate from the uppercase letter ‘I’.
# They’ll usually look the same, but these are two different characters that have different meanings.

# https://docs.python.org/3/howto/unicode.html
# https://medium.com/@apiltamang/unicode-utf-8-and-ascii-encodings-made-easy-5bfbe3a1c45a

# The Unicode standard describes how characters are represented by code points. 
# A code point value is an integer in the range 0 to 0x10FFFF 
# (about 1.1 million values, with some 110 thousand assigned so far). 
# A code point is written using the notation U+265E to mean the character with value 0x265e (9,822 in decimal).

In [46]:
# Largest Unicode code point, displayed in decimal
# (code points run from 0 to 0x10FFFF: 1,114,112 values, roughly 11 lakh)

0x10FFFF

1114111

In [52]:
# Example: the code point U+265E written in other number bases

code_point = 0x265E
for render in (bin, int, oct):
    print(render(code_point))


# The same integer shown in binary, decimal and octal representation.
# In memory it is a single number; hex, binary and octal are only ways of writing it

0b10011001011110
9822
0o23136


In [53]:
type('0x265E')

# The quotes make this a str: six characters ('0', 'x', '2', '6', '5', 'E'),
# not the integer 0x265E

str

In [50]:
'0x265E'.encode('UTF-8')

# Text (str) is not stored or transmitted as-is; it is first encoded to bytes.
# Note: this encodes the six ASCII characters of the string '0x265E',
# not the character whose code point is 0x265E

b'0x265E'

In [51]:
'0x265E'.encode('UTF-8').decode('UTF-8')

# The receiving end decodes the bytes with the same encoding
# and gets the original text back,
# once again as a str

'0x265E'

In [55]:
# Which character does the code point 0x265E represent?
# chr() takes the integer (not the quoted string) and returns the character

chr(0x265E)

'♞'

In [56]:
ord('♞')   # ord() needs exactly one character; it returns its code point in decimal (0x265E == 9822)

9822

In [69]:
# How the numbers 32..127 are represented as characters in ASCII
# (127 is the non-printing DEL control character)

for code in range(32, 128):
    print('{}  = {}'.format(code, chr(code)))

32  =  
33  = !
34  = "
35  = #
36  = $
37  = %
38  = &
39  = '
40  = (
41  = )
42  = *
43  = +
44  = ,
45  = -
46  = .
47  = /
48  = 0
49  = 1
50  = 2
51  = 3
52  = 4
53  = 5
54  = 6
55  = 7
56  = 8
57  = 9
58  = :
59  = ;
60  = <
61  = =
62  = >
63  = ?
64  = @
65  = A
66  = B
67  = C
68  = D
69  = E
70  = F
71  = G
72  = H
73  = I
74  = J
75  = K
76  = L
77  = M
78  = N
79  = O
80  = P
81  = Q
82  = R
83  = S
84  = T
85  = U
86  = V
87  = W
88  = X
89  = Y
90  = Z
91  = [
92  = \
93  = ]
94  = ^
95  = _
96  = `
97  = a
98  = b
99  = c
100  = d
101  = e
102  = f
103  = g
104  = h
105  = i
106  = j
107  = k
108  = l
109  = m
110  = n
111  = o
112  = p
113  = q
114  = r
115  = s
116  = t
117  = u
118  = v
119  = w
120  = x
121  = y
122  = z
123  = {
124  = |
125  = }
126  = ~
127  = 


In [None]:
# What is Codepoint
# A code point is the atomic unit of information. 
# Text is a sequence of code points. 
# Each code point is a number which is given meaning by the Unicode standard.

# What is a Glyph
# A character is represented on a screen or on paper by a set of graphical elements that’s called a glyph.
# The glyph for an uppercase A, for example, is two diagonal strokes and a horizontal stroke,
# though the exact details will depend on the font being used.

In [None]:
# What is an Encoding
# A Unicode string is a sequence of code points, which are numbers from 0 through 0x10FFFF (1,114,111 decimal). 
# This sequence of code points needs to be represented in memory as a set of code units, 
# and code units are then mapped to 8-bit bytes. 
# The rules for translating a Unicode string into a sequence of bytes are called a character encoding, or just an encoding.


# What is UTF-8 encoding
# UTF-8 is one of the most commonly used encodings, and Python often defaults to using it. 
# UTF stands for “Unicode Transformation Format”, and the ‘8’ means that 8-bit values are used in the encoding.
# (There are also UTF-16 and UTF-32 encodings, but they are less frequently used than UTF-8.) 


# UTF-8 uses the following rules:
# If the code point is < 128, it’s represented by the corresponding byte value.
# If the code point is >= 128, it’s turned into a sequence of two, three, or four bytes,
# where each byte of the sequence is between 128 and 255
# https://www.youtube.com/watch?v=MijmeoH9LT4

# UTF-8 has several convenient properties (one of the property is that)
# It can handle any Unicode code point.

In [80]:
name = 'Manoj'
# The same text encoded with three different codecs
for codec in ('UTF-8', 'UTF-16', 'latin-1'):
    print(name.encode(codec))
print(len(name))

# len() of the str is 5: it counts characters, one per letter

b'Manoj'
b'\xff\xfeM\x00a\x00n\x00o\x00j\x00'
b'Manoj'
5


In [79]:
# Length in bytes after encoding with each codec

for codec in ('UTF-8', 'UTF-16', 'latin-1'):
    print(len(name.encode(codec)))

# UTF-16 needs 12 bytes: 2 bytes for each of the 5 characters plus a 2-byte byte-order mark

5
12
5


In [83]:
# Let's understand more using the first four Hebrew letters (the Hebrew "abcd")
# Each letter is a single character but takes two bytes in UTF-8

hebrew = 'א ב ג ד'
print(hebrew)
print(hebrew.encode('UTF-8'))

א ב ג ד
b'\xd7\x90 \xd7\x91 \xd7\x92 \xd7\x93'


In [84]:
# Indexing a str yields characters, not bytes:
# position 0 is the whole first Hebrew letter, even though it takes two bytes in UTF-8
first_letter = hebrew[0]
print(first_letter)

א


In [85]:
# What happens if we hand the str straight to bytes()?

try:
    bytes(hebrew)
except TypeError as err:
    # Print the error instead of letting it propagate, so that "Run All" keeps going
    print(f'{type(err).__name__}: {err}')

# We get an error:
# bytes() has to be told how the characters are represented as bytes,
# i.e. how they are converted, so that they can later be read back from bytes to str.
# The encoding is what defines that conversion

TypeError: string argument without an encoding

In [90]:
bytes(hebrew, encoding = 'UTF-8')

# With an explicit encoding the str is converted to bytes
# This is the same as hebrew.encode('UTF-8')
# Both produce an object of type bytes


b'\xd7\x90 \xd7\x91 \xd7\x92 \xd7\x93'

In [94]:
# We can convert the bytes using str + encoding type

b = bytes(hebrew, encoding = 'UTF-8')
print(str(b, encoding = 'UTF-8'))

# This is similar to b.decode('UTF-8')
print(b.decode('UTF-8'))

א ב ג ד
א ב ג ד


In [95]:
# Decoding with the wrong codec does not necessarily fail:
# the bytes are converted, but the resulting characters are not the original ones

b = hebrew.encode('UTF-8')
garbled = b.decode('latin-1')
print(garbled)

√ó¬ê √ó¬ë √ó¬í √ó¬ì


In [None]:
# So basically we are converting from string to bytes
# and back from bytes to string

# This matters when reading binary data, e.g. a .png file or a download from the internet:
# we can't just say open(filename, 'r')
# We need to specify
# open(filename, 'rb')
# This will read the content as bytes

In [96]:
# Byte representation
# https://www.youtube.com/watch?v=Vvmvo04qv-A

# To understand bytes, we need to know the difference between a byte and a character

# "Thank you" in English, Japanese, Chinese and Kannada
eng = 'Thank You'
jap = 'ありがとうございました'
chi = '谢谢'
kan = 'ಧನ್ಯವಾದ'

print(len(eng), len(jap), len(chi), len(kan))

# len() of a str counts characters (code points), not bytes

9 11 2 7


In [97]:
# Encoding to bytes can make the length grow
# (an encoding always has to be chosen for the conversion; UTF-8 here)

b_eng, b_jap, b_chi, b_kan = (text.encode('UTF-8') for text in (eng, jap, chi, kan))

print(len(b_eng), len(b_jap), len(b_chi), len(b_kan))

# The English text keeps its length: one byte per character
# For jap, chi and kan every character takes 3 bytes

9 33 6 21


In [99]:
# So each character can take a different number of bytes once encoded

# Another example:
# a membership test on a str

s = 'abcde'
'a' in s

True

In [100]:
# After converting to bytes, the str 'a' can no longer be searched for directly:
# the value searched for must itself be in bytes form

b = s.encode('UTF-8')
b

b'abcde'

In [101]:
try:
    'a' in b
except TypeError as err:
    # Print the error instead of letting it propagate, so that "Run All" keeps going
    print(f'{type(err).__name__}: {err}')

# Searching bytes for a str raises a TypeError

TypeError: a bytes-like object is required, not 'str'

In [102]:
b'a' in b

# This works, because the value searched for is now a bytes object
# Also remember that bytes are stored as numbers (often displayed as hex digits)

True

In [105]:
# A single byte can also be searched for by its integer value

code_of_a = ord('a')
print(code_of_a)
97 in b

# Here 'a' was found through its code point, 97

97


True

In [112]:
# A str may freely mix scripts (English and Kannada here).
# UTF-8 can encode every code point, so this round trip is lossless.

msg = 'I want to say ಧನ್ಯವಾದ'
print(msg)

b = msg.encode()
b.decode()

# encode()/decode() default to UTF-8, which works fine here;
# errors appear with codecs that cannot represent some of the characters

I want to say ಧನ್ಯವಾದ


'I want to say ಧನ್ಯವಾದ'

In [113]:
# Lets use another encoding

msg = 'I want to say ‡≤ß‡≤®‡≥ç‡≤Ø‡≤µ‡≤æ‡≤¶'
print(msg)

b = msg.encode('latin-1')
b.decode('latin-1')

I want to say ಧನ್ಯವಾದ


UnicodeEncodeError: 'latin-1' codec can't encode characters in position 14-20: ordinal not in range(256)

In [116]:
# errors='ignore' silently drops every character the codec cannot encode

msg = 'I want to say ಧನ್ಯವಾದ'
b = msg.encode('latin-1', errors='ignore')
b.decode('latin-1')

'I want to say '

In [117]:
# errors='replace' substitutes a '?' for every character the codec cannot encode

msg = 'I want to say ಧನ್ಯವಾದ'
b = msg.encode('latin-1', errors='replace')
b.decode('latin-1')

'I want to say ???????'

In [120]:
# Show the signature of str.encode and the values accepted by errors=
help(msg.encode)

Help on built-in function encode:

encode(encoding='utf-8', errors='strict') method of builtins.str instance
    Encode the string using the codec registered for encoding.
    
    encoding
      The encoding in which to encode the string.
    errors
      The error handling scheme to use for encoding errors.
      The default is 'strict' meaning that encoding errors raise a
      UnicodeEncodeError.  Other possible values are 'ignore', 'replace' and
      'xmlcharrefreplace' as well as any other name registered with
      codecs.register_error that can handle UnicodeEncodeErrors.



In [119]:
# errors='xmlcharrefreplace' writes every unencodable character
# as an XML character reference (&#<decimal code point>;)

msg = 'I want to say ಧನ್ಯವಾದ'
b = msg.encode('latin-1', errors='xmlcharrefreplace')
b.decode('latin-1')

'I want to say &#3239;&#3240;&#3277;&#3247;&#3253;&#3262;&#3238;'

In [130]:
# Bytearrays

# bytes is immutable, whereas bytearray is mutable
# That is useful when chunks of data read from the web have to be modified:
# for that we need a mutable array

# Example: start with an ordinary (immutable) bytes object

b = 'abcd'.encode('utf-8')
b

b'abcd'

In [125]:
b[0]  # indexing bytes gives an int: 97 is the value of the byte for 'a'

97

In [127]:
try:
    b[0] = 65
except TypeError as err:
    # Print the error instead of letting it propagate, so that "Run All" keeps going
    print(f'{type(err).__name__}: {err}')

# Item assignment fails because bytes is immutable

TypeError: 'bytes' object does not support item assignment

In [132]:
# Creating a bytearray from text: the characters are encoded as UTF-8 first

ba = bytearray('abcd'.encode('utf-8'))
ba

# The result is of type bytearray

bytearray(b'abcd')

In [133]:
ba[0]  # reading an item returns an int (97 here)

97

In [134]:
for position, code in ((0, 65), (2, 67)):
    ba[position] = code
print(ba)

# Item assignment works on a bytearray: two letters have been changed to uppercase

bytearray(b'AbCd')


In [136]:
# Indexing, negative indexing and slicing, shown here on the bytes object b.
# The same operations that str and bytes support also work on a bytearray

for piece in (b[0], b[-1], b[1:4]):
    print(piece)

97
100
b'bcd'


In [12]:
# Note: Out of all the unicode encodings, UTF-8 has a neat hack while encoding.

# NOTE(review): the name `str` shadows the builtin str type from here on; it is kept
# only because the following cell reads it - rename both together when possible.
str = 'א ב ג ד'
bytes(str, 'UTF-8')  # We get bytes

b'\xd7\x90 \xd7\x91 \xd7\x92 \xd7\x93'

In [13]:
# The byte d7 shows up in front of every letter
# To see it clearly, let's convert the bytes to a hex string

bytes(str, 'UTF-8').hex()

'd79020d79120d79220d793'

In [None]:
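# This is because UTF-8 encodes every non-ASCII code point as a lead byte plus continuation bytes:
#   2 bytes: 110xxxxx 10xxxxxx
#   3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
#   4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
# The Hebrew letters above are neighbouring code points (U+05D0 to U+05D3), so they share the lead byte d7.
# For one, this format preserves the ASCII codes (0xxxxxxx), and it also prevents
# sending 0 bytes, which indicate null values

# eg. a fixed-width encoding could produce 00000101 00000000 00000001
# Here, if a character uses 3 bytes and the 2nd byte
# contains all zeros, then a program reading the stream of bytes treats it
# as the end of the string. To prevent this, every byte of a multi-byte sequence
# starts with a 1 bit: 11... for the lead byte and 10 for each continuation byte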

# For more info
# https://www.youtube.com/watch?v=MijmeoH9LT4