## Python strings and bytes
- `str`: Unicode text strings
- `bytes`: raw byte sequences

In [1]:
# A str literal shown three ways: the text itself, its repr, and its type.
name = "John"
print(name, repr(name), type(name), sep="\n")

John
'John'
<class 'str'>


In [2]:
# A bytes literal shown three ways; unlike str, print() and repr() look the same.
data = b"pic-abc-qwerty"
print(data, repr(data), type(data), sep="\n")

b'pic-abc-qwerty'
b'pic-abc-qwerty'
<class 'bytes'>


In [3]:
# bytes and str never mix implicitly, so this concatenation fails.
# The error is caught and printed so that "Run All" continues past this cell.
try:
    b"foo" + "bar"
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: can't concat str to bytes

In [4]:
# The same restriction applies with the operands swapped: str + bytes fails too.
# The error is caught and printed so that "Run All" continues past this cell.
try:
    "foo" + b"bar"
except TypeError as exc:
    print(f"{type(exc).__name__}: {exc}")

TypeError: can only concatenate str (not "bytes") to str

In [5]:
# Echo the bytes object; the b prefix in its repr marks it as bytes, not str.
data

b'pic-abc-qwerty'

In [6]:
# decode() converts bytes to str; with no argument it uses the UTF-8 codec.
data.decode()

'pic-abc-qwerty'

In [7]:
# Decode first, then concatenate: both operands are str, so + works.
b"John ".decode() + "Smith"

'John Smith'

In [8]:
# \x takes exactly two hex digits: this is byte 0xAB followed by the ASCII letter c.
b"\xabc"

b'\xabc'

In [9]:
# 0xab cannot start a UTF-8 sequence, so decoding with the default codec fails.
# The error is caught and printed so that "Run All" continues past this cell.
try:
    b"\xabc".decode()
except UnicodeDecodeError as exc:
    print(f"{type(exc).__name__}: {exc}")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xab in position 0: invalid start byte

| Unicode code point | character | UTF-8 (hex.) |       name      |
|:------------------:|:---------:|:------------:|:---------------:|
|       U+00B1       |     ±     |     c2 b1    | PLUS-MINUS SIGN |

In [10]:
# The two bytes c2 b1 are the UTF-8 encoding of U+00B1 PLUS-MINUS SIGN.
char = b"\xc2\xb1"

In [11]:
# len() of a bytes object counts bytes, not characters.
len(char)

2

In [12]:
# Both bytes together decode to a single character.
char.decode()

'±'

### Windows-1251 (Python codec name: `cp1251`)

| DEC | HEX | Symbol | Description                |
|:---:|:---:|:------:|----------------------------|
| 222 |  DE |    Ю   | Cyrillic capital letter Yu |

In [13]:
# Byte 0xDE (decimal 222) is the cp1251 code for the Cyrillic capital letter Yu.
yu = bytes([0xDE])
print(yu, len(yu), sep="\n")

b'\xde'
1


In [14]:
# Decoding with the default UTF-8 codec fails: 0xde on its own is an
# incomplete UTF-8 sequence.
# The error is caught and printed so that "Run All" continues past this cell.
try:
    yu.decode()
except UnicodeDecodeError as exc:
    print(f"{type(exc).__name__}: {exc}")

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xde in position 0: unexpected end of data

In [15]:
# Naming the codec the byte was written in makes it decodable.
yu.decode("cp1251")

'Ю'

In [16]:
# encode() defaults to UTF-8, where this letter takes two bytes.
"Ю".encode()

b'\xd0\xae'

In [17]:
# The same letter is a single byte in cp1251.
"Ю".encode("cp1251")

b'\xde'

| character |     UTF-16 (hex.)     |       name      |
|:---------:|:---------------------:|:---------------:|
|     Ю     |     0x042E (042e)     | CYRILLIC CAPITAL LETTER YU |
|     .     |     0x002E (002e)     | FULL STOP |

In [18]:
# The output begins with the byte-order mark ff fe; the code unit 0x042E then
# appears low byte first as 2e 04, and byte 0x2e is displayed as '.'.
"Ю".encode("utf-16")

b'\xff\xfe.\x04'

In [19]:
# Decoding the same four bytes (byte-order mark included) restores the letter.
b'\xff\xfe.\x04'.decode("utf-16")

'Ю'

In [20]:
# The u prefix is accepted in Python 3 but changes nothing: this is an ordinary str.
u"\u042e"

'Ю'

In [21]:
# \uXXXX names a character by its four-hex-digit code point (here U+042E).
"\u042e"

'Ю'

In [22]:
# Text that mixes ASCII with Cyrillic characters.
line = "Hello, Сурен!"
print(line)

Hello, Сурен!


In [23]:
# In UTF-8 the ASCII characters stay one byte each; every Cyrillic letter
# becomes two bytes.
line_bytes = line.encode("utf-8")
print(line_bytes)

b'Hello, \xd0\xa1\xd1\x83\xd1\x80\xd0\xb5\xd0\xbd!'


In [24]:
# Decoding with the codec used for encoding restores the original text.
line_bytes.decode("utf-8")

'Hello, Сурен!'

In [25]:
# ASCII only covers byte values 0-127, so the first Cyrillic byte (0xd0)
# cannot be decoded.
# The error is caught and printed so that "Run All" continues past this cell.
try:
    line_bytes.decode("ascii")
except UnicodeDecodeError as exc:
    print(f"{type(exc).__name__}: {exc}")

UnicodeDecodeError: 'ascii' codec can't decode byte 0xd0 in position 7: ordinal not in range(128)

In [26]:
# errors="replace": each undecodable byte becomes U+FFFD, the replacement character.
line_bytes.decode("ascii", "replace")

'Hello, ����������!'

In [27]:
# errors="backslashreplace": each undecodable byte becomes a \xNN escape sequence.
line_bytes.decode("ascii", "backslashreplace")

'Hello, \\xd0\\xa1\\xd1\\x83\\xd1\\x80\\xd0\\xb5\\xd0\\xbd!'

In [28]:
# errors="ignore": undecodable bytes are dropped silently, so data is lost.
line_bytes.decode("ascii", "ignore")

'Hello, !'