# Iterations

In [2]:
import math
import numpy as np
from itertools import zip_longest
import locale
import csv # in python standard library
import pandas as pd # scientific python

In [3]:
# Sample data reused throughout the notebook: a list, a tuple and a dict
cities = ["Nice", "Lyon", "Caen", "Pau"]
city_t = ("Nice", 342_522, "06000")
city_d = dict(name="Nice", population=342_522, zipcode="06000")

In [4]:
cities # uses repr

['Nice', 'Lyon', 'Caen', 'Pau']

In [5]:
print(cities) # uses str

['Nice', 'Lyon', 'Caen', 'Pau']


In [6]:
city = cities[0]
print(city)
city

Nice


'Nice'

In [7]:
print(str(city), repr(city))

Nice 'Nice'


In [8]:
for city in cities:
    print(city)

Nice
Lyon
Caen
Pau


In [9]:
# An int is not iterable: looping over it raises TypeError.
# The error is caught and displayed so the notebook still runs top to bottom.
try:
    for x in 12345:
        print(x)
except TypeError as error:
    print(f"TypeError: {error}")

In [10]:
# a str is iterable too: the loop yields its characters one at a time
city = "Saint-Remy-en-Bouzemont-Saint-Genest-et-Isson"
for letter in city:
    print(letter, end='  ')  # end='  ' keeps all letters on one output line

S  a  i  n  t  -  R  e  m  y  -  e  n  -  B  o  u  z  e  m  o  n  t  -  S  a  i  n  t  -  G  e  n  e  s  t  -  e  t  -  I  s  s  o  n  

In [11]:
for info in city_t:
    print(info)

Nice
342522
06000


In [12]:
# default iteration on dict: keys
for info_name in city_d:
    print(info_name)

name
population
zipcode


In [13]:
for info_name in city_d.keys():
    print(info_name)

name
population
zipcode


In [14]:
for info_value in city_d.values():
    print(info_value)

Nice
342522
06000


In [15]:
for info_name, info_value in city_d.items():
    print(info_name, info_value, sep=": ")

name: Nice
population: 342522
zipcode: 06000


In [16]:
enumerate

enumerate

In [17]:
type(city_d)

dict

In [18]:
# Python built-in help
help(city_t.count)

Help on built-in function count:

count(value, /) method of builtins.tuple instance
    Return number of occurrences of value.



In [19]:
# IPython / notebook help (the trailing ? is IPython syntax, not plain Python)
city_t.count?

[1;31mSignature:[0m [0mcity_t[0m[1;33m.[0m[0mcount[0m[1;33m([0m[0mvalue[0m[1;33m,[0m [1;33m/[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m Return number of occurrences of value.
[1;31mType:[0m      builtin_function_or_method

In [20]:
# every iterable object exposes the special method __iter__
cities.__iter__

<method-wrapper '__iter__' of list object at 0x000001B7113C3B00>

In [21]:
list.__iter__

<slot wrapper '__iter__' of 'list' objects>

In [22]:
x = 123 

In [23]:
# An int has no __iter__ method: looking it up raises AttributeError.
# The error is caught and displayed so the notebook still runs top to bottom.
try:
    x.__iter__
except AttributeError as error:
    print(f"AttributeError: {error}")

In [24]:
it = cities.__iter__()
it

<list_iterator at 0x1b7139bbee0>

In [25]:
it = iter(cities) # call cities.__iter__()
it

<list_iterator at 0x1b7139f2200>

In [26]:
# iter() on an int raises TypeError because an int is not iterable.
# The error is caught and displayed so the notebook still runs top to bottom.
try:
    iter(x)
except TypeError as error:
    print(f"TypeError: {error}")

In [27]:
len(cities) # calls cities.__len__()

4

In [28]:
# len(obj) and obj.__len__() are printed side by side for a list, a tuple,
# a dict and a str: the two values match on every row
for obj in cities, city_t, city_d, city:
    print(obj, type(obj), len(obj), obj.__len__())

['Nice', 'Lyon', 'Caen', 'Pau'] <class 'list'> 4 4
('Nice', 342522, '06000') <class 'tuple'> 3 3
{'name': 'Nice', 'population': 342522, 'zipcode': '06000'} <class 'dict'> 3 3
Saint-Remy-en-Bouzemont-Saint-Genest-et-Isson <class 'str'> 45 45


In [29]:
"Pau" in cities

True

In [30]:
it = iter(cities)
it

<list_iterator at 0x1b7139f3760>

In [31]:
# execute this cell until exception StopIteration
next(it)

'Nice'

In [32]:
# Hand-written equivalent of `for city in cities: print(city)`.
# Only next() is guarded by the try: a StopIteration raised by the loop
# body itself must not be mistaken for the end of the iteration.
it = iter(cities)
while True:
    try:
        city = next(it)
    except StopIteration:
        # iterator exhausted: leave the loop, as a for statement would
        break
    # do something with current value
    print(city)

Nice
Lyon
Caen
Pau


In [33]:
sum([12, 33, 45])

90

In [34]:
12 + 45

57

In [35]:
for i, city in enumerate(cities):
    print(i, city, sep=" - ")

0 - Nice
1 - Lyon
2 - Caen
3 - Pau


In [36]:
for i, city in enumerate(cities, start=1):
    print(i, city, sep=" - ")

1 - Nice
2 - Lyon
3 - Caen
4 - Pau


In [37]:
# Number the letters of a long city name, counting from 1
long_city = "Saint-Remy-en-Bouzemont-Saint-Genest-et-Isson"
for i, letter in enumerate(long_city, 1):
    print(f"{i} - {letter}")

1 - S
2 - a
3 - i
4 - n
5 - t
6 - -
7 - R
8 - e
9 - m
10 - y
11 - -
12 - e
13 - n
14 - -
15 - B
16 - o
17 - u
18 - z
19 - e
20 - m
21 - o
22 - n
23 - t
24 - -
25 - S
26 - a
27 - i
28 - n
29 - t
30 - -
31 - G
32 - e
33 - n
34 - e
35 - s
36 - t
37 - -
38 - e
39 - t
40 - -
41 - I
42 - s
43 - s
44 - o
45 - n


In [38]:
enumerate(long_city)

<enumerate at 0x1b711354a90>

In [39]:
r = range(1_000_000)
r

range(0, 1000000)

In [40]:
sum(r)

499999500000

In [41]:
print(range(1_000_000))

range(0, 1000000)


In [42]:
values = list(range(10))
values

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [43]:
list(enumerate(cities, start=1))

[(1, 'Nice'), (2, 'Lyon'), (3, 'Caen'), (4, 'Pau')]

In [44]:
cities.extend(["Toulouse", "Marseille"]) # extend accepts any iterable, here a list
cities

['Nice', 'Lyon', 'Caen', 'Pau', 'Toulouse', 'Marseille']

In [45]:
cities.extend(("Paris", "Bordeaux"))
cities

['Nice', 'Lyon', 'Caen', 'Pau', 'Toulouse', 'Marseille', 'Paris', 'Bordeaux']

In [46]:
numbers = [12, 34, 56]
numbers.extend(range(10)) # a range is iterable as well: its values are appended one by one
numbers

[12, 34, 56, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [47]:
z = zip('abcdefg', range(3), range(4))
z

<zip at 0x1b713a61ec0>

In [48]:
list(z)

[('a', 0, 0), ('b', 1, 1), ('c', 2, 2)]

In [49]:
list(zip_longest('abcdefg', range(3), range(4)))

[('a', 0, 0),
 ('b', 1, 1),
 ('c', 2, 2),
 ('d', None, 3),
 ('e', None, None),
 ('f', None, None),
 ('g', None, None)]

In [50]:
zip_longest

itertools.zip_longest

In [51]:
math.sqrt

<function math.sqrt(x, /)>

In [52]:
np.sum

<function sum at 0x000001B7114BFF70>

In [53]:
# an iterator is iterable
it = iter(cities)
it

<list_iterator at 0x1b713a10f40>

In [54]:
it2 = iter(it)
it2

<list_iterator at 0x1b713a10f40>

In [55]:
it is it2

True

In [56]:
12 == 24 * 2 // 4

True

In [57]:
city.upper().lower() == city.lower()

True

In [58]:
city.upper().lower() is city.lower()

False

In [59]:
list(it)

['Nice', 'Lyon', 'Caen', 'Pau', 'Toulouse', 'Marseille', 'Paris', 'Bordeaux']

## Expression for

In [61]:
cities

['Nice', 'Lyon', 'Caen', 'Pau', 'Toulouse', 'Marseille', 'Paris', 'Bordeaux']

In [62]:
# list comprehension: expression for with [ ]
[ city.upper() for city in cities ]

['NICE', 'LYON', 'CAEN', 'PAU', 'TOULOUSE', 'MARSEILLE', 'PARIS', 'BORDEAUX']

In [63]:
# generator expression: expression for with ( ), evaluated lazily
g = (city.upper() for city in cities)
g

<generator object <genexpr> at 0x000001B710F9A400>

In [64]:
list(g)

['NICE', 'LYON', 'CAEN', 'PAU', 'TOULOUSE', 'MARSEILLE', 'PARIS', 'BORDEAUX']

In [65]:
list(g)

[]

In [66]:
# compute total number of letters to write all cities
sum(len(city) for city in cities)

45

In [67]:
total = 0
for city in cities:
    total += len(city)
total

45

In [68]:
city_lengths = [ len(city) for city in cities ]
city_lengths

[4, 4, 4, 3, 8, 9, 5, 8]

In [69]:
# Same total, accumulated from the precomputed lengths with an explicit loop
total = 0
for length in city_lengths:
    total = total + length
total

45

In [70]:
# dict comprehension
{ city: len(city) for city in cities }

{'Nice': 4,
 'Lyon': 4,
 'Caen': 4,
 'Pau': 3,
 'Toulouse': 8,
 'Marseille': 9,
 'Paris': 5,
 'Bordeaux': 8}

In [71]:
length_threshold = 3
[len(city) >= length_threshold for city in cities]

[True, True, True, True, True, True, True, True]

In [72]:
all(len(city) >= length_threshold for city in cities)

True

In [73]:
any(len(city) >= length_threshold for city in cities)

True

In [74]:
sum(len(city) for city in cities)

45

In [75]:
sum((len(city) for city in cities), start=1000)

1045

In [76]:
# non optimal: list comprehension vs generator expression
# (the [ ] build a full intermediate list just to feed sum())
print(sum([len(city) for city in cities]))
print(sum([len(city) for city in cities], start=1000))

45
1045


## Different type of containers

In [78]:
numbers = [12, 33, 56, 12, 2, 89]

In [79]:
tuple(numbers)

(12, 33, 56, 12, 2, 89)

In [80]:
# set: each value is unique, duplicates are dropped
set(numbers)

{2, 12, 33, 56, 89}

In [81]:
list(set(numbers))

[33, 2, 12, 56, 89]

In [82]:
# list comprehension instead of list() around a generator expression
[x**2 + 1 for x in numbers]

[145, 1090, 3137, 145, 5, 7922]

## Sorting

In [84]:
numbers

[12, 33, 56, 12, 2, 89]

In [85]:
# in place sort (type list)
numbers.sort()
numbers

[2, 12, 12, 33, 56, 89]

In [86]:
# builtin sorted: => new list
results = sorted(x**2+1 for x in set(numbers))
results

[5, 145, 1090, 3137, 7922]

In [87]:
12 < 24

True

In [88]:
12.4 < 25.3

True

In [89]:
False < True

True

In [90]:
"Toulouse" < "Paris"

False

In [91]:
"Toulouse" < "Toulon"

False

In [92]:
print(cities)
cities.sort()
print(cities)

['Nice', 'Lyon', 'Caen', 'Pau', 'Toulouse', 'Marseille', 'Paris', 'Bordeaux']
['Bordeaux', 'Caen', 'Lyon', 'Marseille', 'Nice', 'Paris', 'Pau', 'Toulouse']


In [93]:
cities.append("arcachon")
print(cities)
cities.sort()
print(cities)

['Bordeaux', 'Caen', 'Lyon', 'Marseille', 'Nice', 'Paris', 'Pau', 'Toulouse', 'arcachon']
['Bordeaux', 'Caen', 'Lyon', 'Marseille', 'Nice', 'Paris', 'Pau', 'Toulouse', 'arcachon']


In [94]:
"Z" < 'a'

True

In [95]:
sorted(cities, key=str.upper)

['arcachon',
 'Bordeaux',
 'Caen',
 'Lyon',
 'Marseille',
 'Nice',
 'Paris',
 'Pau',
 'Toulouse']

In [96]:
cities.sort(key=str.lower)
cities

['arcachon',
 'Bordeaux',
 'Caen',
 'Lyon',
 'Marseille',
 'Nice',
 'Paris',
 'Pau',
 'Toulouse']

In [97]:
cities.sort(key=str.casefold)
print(cities)

['arcachon', 'Bordeaux', 'Caen', 'Lyon', 'Marseille', 'Nice', 'Paris', 'Pau', 'Toulouse']


In [98]:
cities.extend(('Charbonnières-les-Bains', 'Nîmes', 'Niort', "L'Haÿ-les-Roses"))
cities.sort(key=str.casefold) # Nîmes is not at the 'right place'
cities

['arcachon',
 'Bordeaux',
 'Caen',
 'Charbonnières-les-Bains',
 "L'Haÿ-les-Roses",
 'Lyon',
 'Marseille',
 'Nice',
 'Niort',
 'Nîmes',
 'Paris',
 'Pau',
 'Toulouse']

In [99]:
'mañana'

'mañana'

## locale parameters
- sorting (LC_COLLATE)
- date format (LC_TIME)
- currency (LC_MONETARY)
- decimal separator (, .)  (LC_NUMERIC)

In [101]:
locale.getlocale(locale.LC_COLLATE)

(None, None)

In [102]:
# NOTE(review): locale names are platform-specific; 'fr_fr.UTF8' is the
# spelling accepted by the machine that produced the outputs here — confirm it
# (or an equivalent such as 'fr_FR.UTF-8') on other systems
locale.setlocale(locale.LC_ALL, 'fr_fr.UTF8')

'fr_fr.UTF8'

In [103]:
locale.getlocale(locale.LC_COLLATE)

('fr_FR', 'UTF-8')

In [104]:
cities.sort(key=locale.strxfrm)
cities

['arcachon',
 'Bordeaux',
 'Caen',
 'Charbonnières-les-Bains',
 "L'Haÿ-les-Roses",
 'Lyon',
 'Marseille',
 'Nice',
 'Nîmes',
 'Niort',
 'Paris',
 'Pau',
 'Toulouse']

In [105]:
sorted(('cœur', 'cobra', 'corde', 'garçon', 'gars', 'garce'), key=locale.strxfrm)

['cobra', 'cœur', 'corde', 'garce', 'garçon', 'gars']

In [106]:
sorted(('mañana', 'mano', 'matador'), key=locale.strxfrm)

['mañana', 'mano', 'matador']

In [107]:
locale.setlocale(locale.LC_ALL, 'es_ES.UTF8')
locale.getlocale(locale.LC_COLLATE)

('es_ES', 'UTF-8')

In [108]:
sorted(('mañana', 'mano', 'matador'), key=locale.strxfrm)

['mano', 'mañana', 'matador']

## Encoding
Europe Occidentale: 1 caractère = 1 octet
- ASCII (pas d'accent): 128 caractères (1963)
- latin-1 / ISO-8859-1: 256 caractères (lettres latines avec accents, manque œ, Œ, Ÿ)
- ISO-8859-15: adaptation du latin-1 avec €, œ, Œ, Ÿ notamment
- Microsoft CP1252 ou ANSI: idem que ISO-8859-15 mais avec des codes différents pour les nouvelles lettres

Unicode: international, toutes langues écrites
- classification
- algorithmes (tri, comparaison)
- encoding informatique: UTF-8, UTF-16, UTF-32

In [110]:
# Show the bytes of the euro sign in several encodings.
# NB: 'ISO-8859-1' is left out: UnicodeEncodeError: 'latin-1' codec can't encode character '\u20ac'
for encoding in ('UTF-8', 'UTF-16', 'ISO-8859-15', 'CP1252'):
    print(f"{encoding}: {'€'.encode(encoding)!r}")

UTF-8: b'\xe2\x82\xac'
UTF-16: b'\xff\xfe\xac '
ISO-8859-15: b'\xa4'
CP1252: b'\x80'


In [111]:
words = ['東京', '北京', '🍹🦜']
words

['東京', '北京', '🍹🦜']

### Lecture fichier texte brut
Fichier data/cities.csv téléchargé depuis https://www.data.gouv.fr/fr/datasets/villes-de-france/

In [113]:
# no encoding argument: open() falls back to the platform default
# (cp1252 in the recorded output), which may differ from the file's encoding
f = open('data/cities.csv')
f

<_io.TextIOWrapper name='data/cities.csv' mode='r' encoding='cp1252'>

In [114]:
f.close()

In [115]:
f = open('data/cities.csv', encoding='UTF-8')
f

<_io.TextIOWrapper name='data/cities.csv' mode='r' encoding='UTF-8'>

In [116]:
data = list(f)
data[:5]

['insee_code,city_code,zip_code,label,latitude,longitude,department_name,department_number,region_name,region_geojson_name\n',
 '25620,ville du pont,25650,ville du pont,46.999873398,6.498147193,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté\n',
 '25624,villers grelot,25640,villers grelot,47.361512085,6.235167025,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté\n',
 '25615,villars les blamont,25310,villars les blamont,47.368383721,6.871414913,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté\n',
 '25619,les villedieu,25240,les villedieu,46.713906258,6.26583065,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté\n']

In [117]:
f.close()

In [118]:
# with-statement version: the file is closed when the block is left
with open('data/cities.csv', encoding='UTF-8') as f:
    # iterating over the file yields its lines: list(f) reads the whole file
    data = list(f)
# no explicit f.close() needed here
data[:5]

['insee_code,city_code,zip_code,label,latitude,longitude,department_name,department_number,region_name,region_geojson_name\n',
 '25620,ville du pont,25650,ville du pont,46.999873398,6.498147193,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté\n',
 '25624,villers grelot,25640,villers grelot,47.361512085,6.235167025,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté\n',
 '25615,villars les blamont,25310,villars les blamont,47.368383721,6.871414913,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté\n',
 '25619,les villedieu,25240,les villedieu,46.713906258,6.26583065,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté\n']

In [119]:
len(data)

39146

### Lecture fichier csv avec module csv

In [244]:
# newline='' leaves newline handling to the csv module, as its documentation
# requires (otherwise newlines embedded in quoted fields may be misread).
with open('data/cities.csv', encoding='UTF-8', newline='') as f:
    reader = csv.reader(f)
    headers = next(reader)  # first row holds the column names
    data = list(reader)     # remaining rows, each a list of strings
print(headers)
data[:5]

['insee_code', 'city_code', 'zip_code', 'label', 'latitude', 'longitude', 'department_name', 'department_number', 'region_name', 'region_geojson_name']


[['25620',
  'ville du pont',
  '25650',
  'ville du pont',
  '46.999873398',
  '6.498147193',
  'doubs',
  '25',
  'bourgogne-franche-comté',
  'Bourgogne-Franche-Comté'],
 ['25624',
  'villers grelot',
  '25640',
  'villers grelot',
  '47.361512085',
  '6.235167025',
  'doubs',
  '25',
  'bourgogne-franche-comté',
  'Bourgogne-Franche-Comté'],
 ['25615',
  'villars les blamont',
  '25310',
  'villars les blamont',
  '47.368383721',
  '6.871414913',
  'doubs',
  '25',
  'bourgogne-franche-comté',
  'Bourgogne-Franche-Comté'],
 ['25619',
  'les villedieu',
  '25240',
  'les villedieu',
  '46.713906258',
  '6.26583065',
  'doubs',
  '25',
  'bourgogne-franche-comté',
  'Bourgogne-Franche-Comté'],
 ['25622',
  'villers buzon',
  '25170',
  'villers buzon',
  '47.228558434',
  '5.852186748',
  'doubs',
  '25',
  'bourgogne-franche-comté',
  'Bourgogne-Franche-Comté']]

### Lecture fichier csv avec librairie pandas

In [122]:
# NOTE(review): zip_code comes back as an integer column (dtype int64 in the
# recorded output of df_cities.zip_code), so codes with a leading zero such as
# "06000" would lose it — consider dtype={'zip_code': str}; confirm that no
# later cell relies on the integer type
df_cities = pd.read_csv('data/cities.csv', encoding='UTF-8')
df_cities # type: DataFrame

Unnamed: 0,insee_code,city_code,zip_code,label,latitude,longitude,department_name,department_number,region_name,region_geojson_name
0,25620,ville du pont,25650,ville du pont,46.999873,6.498147,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
1,25624,villers grelot,25640,villers grelot,47.361512,6.235167,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
2,25615,villars les blamont,25310,villars les blamont,47.368384,6.871415,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
3,25619,les villedieu,25240,les villedieu,46.713906,6.265831,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
4,25622,villers buzon,25170,villers buzon,47.228558,5.852187,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
...,...,...,...,...,...,...,...,...,...,...
39140,98829,thio,98829,thio,,,nouvelle-calédonie,988,nouvelle-calédonie,Nouvelle Calédonie
39141,98831,voh,98833,voh,,,nouvelle-calédonie,988,nouvelle-calédonie,Nouvelle Calédonie
39142,98832,yate,98834,yate,,,nouvelle-calédonie,988,nouvelle-calédonie,Nouvelle Calédonie
39143,98612,sigave,98620,sigave,-14.270411,-178.155263,wallis-et-futuna,986,wallis-et-futuna,Wallis-et-Futuna


In [253]:
# dynamic attribute
df_cities.zip_code

0        25650
1        25640
2        25310
3        25240
4        25170
         ...  
39140    98829
39141    98833
39142    98834
39143    98620
39144    98600
Name: zip_code, Length: 39145, dtype: int64