# File and File System
builtin function: open
modules:
- FS: pathlib, os.path
- file management (copy, remove, move, ...): shutil
- file content according to its format: csv, xml.etree, json, ..., pandas

In [84]:
from pathlib import Path
from datetime import datetime
import shutil
import re

## FileSystem and File management

In [2]:
currentDir = Path(".")
currentDir

WindowsPath('.')

In [3]:
dataDir = Path("data/cities/")
dataDir

WindowsPath('data/cities')

In [4]:
dataDir = Path("data\\cities")
dataDir

WindowsPath('data/cities')

In [5]:
# concatenate paths: method joinpath  or operator /
dataDir = currentDir / "data" / "cities"
dataDir

WindowsPath('data/cities')

In [6]:
dataDir.parent

WindowsPath('data')

In [7]:
# relative path . or root directory are their own parent
currentDir.parent

WindowsPath('.')

In [8]:
absoluteCurrentDir = currentDir.absolute()
absoluteCurrentDir

WindowsPath('C:/Users/Matthias/Documents/Formations/Python/Stage202402/Files')

In [9]:
for p in absoluteCurrentDir.parents:
    print(repr(p))
    print(p)
    print()

WindowsPath('C:/Users/Matthias/Documents/Formations/Python/Stage202402')
C:\Users\Matthias\Documents\Formations\Python\Stage202402

WindowsPath('C:/Users/Matthias/Documents/Formations/Python')
C:\Users\Matthias\Documents\Formations\Python

WindowsPath('C:/Users/Matthias/Documents/Formations')
C:\Users\Matthias\Documents\Formations

WindowsPath('C:/Users/Matthias/Documents')
C:\Users\Matthias\Documents

WindowsPath('C:/Users/Matthias')
C:\Users\Matthias

WindowsPath('C:/Users')
C:\Users

WindowsPath('C:/')
C:\



In [10]:
currentDir.exists(), currentDir.is_dir()

(True, True)

In [11]:
for f in currentDir.iterdir():
    print(f)
    print("\t-name:", f.name)
    print("\t-is directory:", f.is_dir())
    print("\t-is directory:", f.is_file())

.ipynb_checkpoints
	-name: .ipynb_checkpoints
	-is directory: True
	-is directory: False
data
	-name: data
	-is directory: True
	-is directory: False
files.ipynb
	-name: files.ipynb
	-is directory: False
	-is directory: True


In [12]:
st = f.stat()
st

os.stat_result(st_mode=33206, st_ino=4785074607527690, st_dev=8842262613728869279, st_nlink=1, st_uid=0, st_gid=0, st_size=43915, st_atime=1707465927, st_mtime=1707408565, st_ctime=1707397177)

In [13]:
# size in bytes
st.st_size

43915

In [14]:
dt_created = datetime.fromtimestamp(st.st_ctime)
dt_modified = datetime.fromtimestamp(st.st_mtime)
print("created:", dt_created)
print("modified:", dt_modified)

created: 2024-02-08 13:59:37.301141
modified: 2024-02-08 17:09:25.453221


In [15]:
trainingDir = absoluteCurrentDir.parent
trainingDir

WindowsPath('C:/Users/Matthias/Documents/Formations/Python/Stage202402')

In [16]:
for f in trainingDir.glob("**/*.py"):
    if ".ipynb_checkpoints" in (p.name for p in f.parents):
        continue
    print(f)

C:\Users\Matthias\Documents\Formations\Python\Stage202402\Basics\app.py
C:\Users\Matthias\Documents\Formations\Python\Stage202402\Basics\basics.py
C:\Users\Matthias\Documents\Formations\Python\Stage202402\Basics\euclide.py
C:\Users\Matthias\Documents\Formations\Python\Stage202402\Basics\palindrome.py
C:\Users\Matthias\Documents\Formations\Python\Stage202402\Basics\test_euclide.py
C:\Users\Matthias\Documents\Formations\Python\Stage202402\Basics\test_palindrome.py
C:\Users\Matthias\Documents\Formations\Python\Stage202402\MagicSquare\app.py
C:\Users\Matthias\Documents\Formations\Python\Stage202402\MagicSquare\magicsquare.py
C:\Users\Matthias\Documents\Formations\Python\Stage202402\MagicSquare\squares.py
C:\Users\Matthias\Documents\Formations\Python\Stage202402\MagicSquare\test_magicsquare.py
C:\Users\Matthias\Documents\Formations\Python\Stage202402\MagicSquare\test_magicsquare_integration.py
C:\Users\Matthias\Documents\Formations\Python\Stage202402\POO\app.py
C:\Users\Matthias\Documents\F

In [17]:
for f in trainingDir.glob("**/*.ipynb"):
    if ".ipynb_checkpoints" in (p.name for p in f.parents):
        continue
    print(f)

C:\Users\Matthias\Documents\Formations\Python\Stage202402\Basics\basics.ipynb
C:\Users\Matthias\Documents\Formations\Python\Stage202402\Files\files.ipynb
C:\Users\Matthias\Documents\Formations\Python\Stage202402\MagicSquare\magicsquare.ipynb
C:\Users\Matthias\Documents\Formations\Python\Stage202402\POO\comparison_sort.ipynb
C:\Users\Matthias\Documents\Formations\Python\Stage202402\POO\functions.ipynb
C:\Users\Matthias\Documents\Formations\Python\Stage202402\POO\oop.ipynb
C:\Users\Matthias\Documents\Formations\Python\Stage202402\POO\temporal_data.ipynb


In [18]:
# copy file data/cities/cities.csv => data/backup/cities.csv
# Hint: method of class Path or module shutil

In [19]:
# NB: joinpath or operator / give the same results:
old_path = absoluteCurrentDir.joinpath("data", "cities", "cities.csv")  
print('old_path', old_path) 
new_path = absoluteCurrentDir.joinpath("data", "backup")
# if not new_path.exists():
#     new_path.mkdir()
new_path.mkdir(parents=True, exist_ok=True) # eq shell: mkdir -p
print('new_path', new_path) 
# copy file from old_path -> new_path directory
# NB: cf also copy, copyfile, copystat, copytree
shutil.copy2(old_path, new_path)

old_path C:\Users\Matthias\Documents\Formations\Python\Stage202402\Files\data\cities\cities.csv
new_path C:\Users\Matthias\Documents\Formations\Python\Stage202402\Files\data\backup


'C:\\Users\\Matthias\\Documents\\Formations\\Python\\Stage202402\\Files\\data\\backup\\cities.csv'

In [20]:
# rename data/backup/cities.csv in data/backup/cities_fr.csv
cityPath = new_path / "cities.csv"
newCityPath = new_path / "cities_fr.csv"
assert cityPath.exists() and not newCityPath.exists()
print(cityPath, newCityPath, sep=" \n -> ")

# then Path.rename or shutil.move (NB:see also movetree)
# cityPath.rename(newCityPath)
shutil.move(cityPath, newCityPath)
assert not cityPath.exists() and newCityPath.exists()

C:\Users\Matthias\Documents\Formations\Python\Stage202402\Files\data\backup\cities.csv 
 -> C:\Users\Matthias\Documents\Formations\Python\Stage202402\Files\data\backup\cities_fr.csv


In [21]:
# delete data/backup/cities_fr.csv
newCityPath.unlink()

## Read/Write files

In [22]:
# open a file given by a Path object
f = open(old_path)
f

<_io.TextIOWrapper name='C:\\Users\\Matthias\\Documents\\Formations\\Python\\Stage202402\\Files\\data\\cities\\cities.csv' mode='r' encoding='cp1252'>

In [23]:
# f (text mode) is iterable on its line
for _, line in zip(range(10), f):
    print(line)

insee_code,city_code,zip_code,label,latitude,longitude,department_name,department_number,region_name,region_geojson_name

25620,ville du pont,25650,ville du pont,46.999873398,6.498147193,doubs,25,bourgogne-franche-comtÃ©,Bourgogne-Franche-ComtÃ©

25624,villers grelot,25640,villers grelot,47.361512085,6.235167025,doubs,25,bourgogne-franche-comtÃ©,Bourgogne-Franche-ComtÃ©

25615,villars les blamont,25310,villars les blamont,47.368383721,6.871414913,doubs,25,bourgogne-franche-comtÃ©,Bourgogne-Franche-ComtÃ©

25619,les villedieu,25240,les villedieu,46.713906258,6.26583065,doubs,25,bourgogne-franche-comtÃ©,Bourgogne-Franche-ComtÃ©

25622,villers buzon,25170,villers buzon,47.228558434,5.852186748,doubs,25,bourgogne-franche-comtÃ©,Bourgogne-Franche-ComtÃ©

25625,villers la combe,25510,villers la combe,47.240809828,6.473842387,doubs,25,bourgogne-franche-comtÃ©,Bourgogne-Franche-ComtÃ©

25627,villers sous chalamont,25270,villers sous chalamont,46.901588322,6.045328224,doubs,25,bourgogne-franche

In [24]:
f.close()

In [25]:
f = open(old_path, encoding='UTF-8')
f

<_io.TextIOWrapper name='C:\\Users\\Matthias\\Documents\\Formations\\Python\\Stage202402\\Files\\data\\cities\\cities.csv' mode='r' encoding='UTF-8'>

In [26]:
for _, line in zip(range(10), f):
    print(line)

insee_code,city_code,zip_code,label,latitude,longitude,department_name,department_number,region_name,region_geojson_name

25620,ville du pont,25650,ville du pont,46.999873398,6.498147193,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté

25624,villers grelot,25640,villers grelot,47.361512085,6.235167025,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté

25615,villars les blamont,25310,villars les blamont,47.368383721,6.871414913,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté

25619,les villedieu,25240,les villedieu,46.713906258,6.26583065,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté

25622,villers buzon,25170,villers buzon,47.228558434,5.852186748,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté

25625,villers la combe,25510,villers la combe,47.240809828,6.473842387,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté

25627,villers sous chalamont,25270,villers sous chalamont,46.901588322,6.045328224,doubs,25,bourgogne-franche-comté,Bourg

In [27]:
f.close()

In [28]:
'é'.encode('UTF-8')

b'\xc3\xa9'

In [29]:
'é'.encode('CP1252')

b'\xe9'

In [30]:
for letter in 'é', '€', 'Ÿ':
    print(letter, end=': ')
    for encoding in 'ISO-8859-1', 'ISO-8859-15', 'CP1252', 'UTF-8':
        try:
            code = letter.encode(encoding)
        except UnicodeEncodeError:
            code = '<unknown>'
        print(encoding,'=',code, end=' / ')
    print()
    

é: ISO-8859-1 = b'\xe9' / ISO-8859-15 = b'\xe9' / CP1252 = b'\xe9' / UTF-8 = b'\xc3\xa9' / 
€: ISO-8859-1 = <unknown> / ISO-8859-15 = b'\xa4' / CP1252 = b'\x80' / UTF-8 = b'\xe2\x82\xac' / 
Ÿ: ISO-8859-1 = <unknown> / ISO-8859-15 = b'\xbe' / CP1252 = b'\x9f' / UTF-8 = b'\xc5\xb8' / 


In [31]:
with open(old_path, encoding='UTF-8') as f:
    # f.__enter__()
    print("Closed ?", f.closed)
    # f.__exit__() => f.close() (with or without exception)
print("Closed ?", f.closed)

Closed ? False
Closed ? True


### read/write with specialized module by format

In [63]:
import pandas as pd
import csv # better pandas
import json # simple jsonify/parser

# import xml.etree as et 
# or external: lxml.etree as et 
import lxml.etree as et 
# or beautifulSoup
# https://realpython.com/python-xml-parser/
from bs4 import BeautifulSoup

In [33]:
dfCities = pd.read_csv(old_path, encoding='UTF-8', dtype={'zip_code':str})
dfCities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39145 entries, 0 to 39144
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   insee_code           39145 non-null  object 
 1   city_code            39145 non-null  object 
 2   zip_code             39145 non-null  object 
 3   label                39145 non-null  object 
 4   latitude             38934 non-null  float64
 5   longitude            38934 non-null  float64
 6   department_name      39145 non-null  object 
 7   department_number    39145 non-null  object 
 8   region_name          39145 non-null  object 
 9   region_geojson_name  39145 non-null  object 
dtypes: float64(2), object(8)
memory usage: 3.0+ MB


In [34]:
dfCities[dfCities.department_number=='01']

Unnamed: 0,insee_code,city_code,zip_code,label,latitude,longitude,department_name,department_number,region_name,region_geojson_name
4605,01008,ambutrix,01500,ambutrix,45.936683,5.332447,ain,01,auvergne-rhône-alpes,Auvergne-Rhône-Alpes
4611,01019,armix,01510,armix,45.854492,5.583031,ain,01,auvergne-rhône-alpes,Auvergne-Rhône-Alpes
4613,01041,bettant,01500,bettant,45.937721,5.365980,ain,01,auvergne-rhône-alpes,Auvergne-Rhône-Alpes
4614,01066,la burbanche,01510,la burbanche,45.859943,5.545818,ain,01,auvergne-rhône-alpes,Auvergne-Rhône-Alpes
4615,01109,collonges,01550,collonges,46.142544,5.899702,ain,01,auvergne-rhône-alpes,Auvergne-Rhône-Alpes
...,...,...,...,...,...,...,...,...,...,...
28105,01053,bourg en bresse,01000,bourg en bresse,46.205014,5.245594,ain,01,auvergne-rhône-alpes,Auvergne-Rhône-Alpes
28152,01165,francheleins,01090,francheleins,46.073642,4.812560,ain,01,auvergne-rhône-alpes,Auvergne-Rhône-Alpes
28165,01183,guereins,01090,guereins,46.106683,4.782509,ain,01,auvergne-rhône-alpes,Auvergne-Rhône-Alpes
28200,01258,montceaux,01090,montceaux,46.096201,4.803716,ain,01,auvergne-rhône-alpes,Auvergne-Rhône-Alpes


In [35]:
dfCities[dfCities.department_number.isin(['2A','2B'])]

Unnamed: 0,insee_code,city_code,zip_code,label,latitude,longitude,department_name,department_number,region_name,region_geojson_name
26273,2A017,appietto,20167,appietto,42.003459,8.733865,corse-du-sud,2A,corse,Corse
26274,2A018,arbellara,20110,arbellara,41.671484,8.989411,corse-du-sud,2A,corse,Corse
26275,2A026,azilone ampaza,20190,azilone ampaza,41.870639,9.016697,corse-du-sud,2A,corse,Corse
26276,2A031,bastelica,20119,bastelica,41.988203,9.048771,corse-du-sud,2A,corse,Corse
26277,2A062,carbuccia,20133,carbuccia,42.030469,8.936785,corse-du-sud,2A,corse,Corse
...,...,...,...,...,...,...,...,...,...,...
26687,2B333,vallecalle,20232,vallecalle,42.599687,9.337179,haute-corse,2B,corse,Corse
26688,2B342,ventiseri,20240,ventiseri,41.941136,9.362670,haute-corse,2B,corse,Corse
26689,2B342,ventiseri,20240,ventiseri,41.941136,9.362670,haute-corse,2B,corse,Corse
26690,2B343,venzolasca,20215,venzolasca,42.501558,9.493174,haute-corse,2B,corse,Corse


In [36]:
with open(old_path, encoding='UTF-8') as f:
    data = list(csv.reader(f))
data[:5]

[['insee_code',
  'city_code',
  'zip_code',
  'label',
  'latitude',
  'longitude',
  'department_name',
  'department_number',
  'region_name',
  'region_geojson_name'],
 ['25620',
  'ville du pont',
  '25650',
  'ville du pont',
  '46.999873398',
  '6.498147193',
  'doubs',
  '25',
  'bourgogne-franche-comté',
  'Bourgogne-Franche-Comté'],
 ['25624',
  'villers grelot',
  '25640',
  'villers grelot',
  '47.361512085',
  '6.235167025',
  'doubs',
  '25',
  'bourgogne-franche-comté',
  'Bourgogne-Franche-Comté'],
 ['25615',
  'villars les blamont',
  '25310',
  'villars les blamont',
  '47.368383721',
  '6.871414913',
  'doubs',
  '25',
  'bourgogne-franche-comté',
  'Bourgogne-Franche-Comté'],
 ['25619',
  'les villedieu',
  '25240',
  'les villedieu',
  '46.713906258',
  '6.26583065',
  'doubs',
  '25',
  'bourgogne-franche-comté',
  'Bourgogne-Franche-Comté']]

#### import/export json

In [40]:
dfCities.to_json('data/cities/cities.json', orient='records')

In [41]:
dfCities.iloc[:10].to_json('data/cities/cities_extract.json', orient='records')

In [47]:
with open('data/cities/cities_extract.json', encoding='UTF-8') as f:
    data = json.load(f)
data

[{'insee_code': '25620',
  'city_code': 'ville du pont',
  'zip_code': '25650',
  'label': 'ville du pont',
  'latitude': 46.999873398,
  'longitude': 6.498147193,
  'department_name': 'doubs',
  'department_number': '25',
  'region_name': 'bourgogne-franche-comté',
  'region_geojson_name': 'Bourgogne-Franche-Comté'},
 {'insee_code': '25624',
  'city_code': 'villers grelot',
  'zip_code': '25640',
  'label': 'villers grelot',
  'latitude': 47.361512085,
  'longitude': 6.235167025,
  'department_name': 'doubs',
  'department_number': '25',
  'region_name': 'bourgogne-franche-comté',
  'region_geojson_name': 'Bourgogne-Franche-Comté'},
 {'insee_code': '25615',
  'city_code': 'villars les blamont',
  'zip_code': '25310',
  'label': 'villars les blamont',
  'latitude': 47.368383721,
  'longitude': 6.871414913,
  'department_name': 'doubs',
  'department_number': '25',
  'region_name': 'bourgogne-franche-comté',
  'region_geojson_name': 'Bourgogne-Franche-Comté'},
 {'insee_code': '25619',
 

In [50]:
# JSON stringify
json.dumps(data)

'[{"insee_code": "25620", "city_code": "ville du pont", "zip_code": "25650", "label": "ville du pont", "latitude": 46.999873398, "longitude": 6.498147193, "department_name": "doubs", "department_number": "25", "region_name": "bourgogne-franche-comt\\u00e9", "region_geojson_name": "Bourgogne-Franche-Comt\\u00e9"}, {"insee_code": "25624", "city_code": "villers grelot", "zip_code": "25640", "label": "villers grelot", "latitude": 47.361512085, "longitude": 6.235167025, "department_name": "doubs", "department_number": "25", "region_name": "bourgogne-franche-comt\\u00e9", "region_geojson_name": "Bourgogne-Franche-Comt\\u00e9"}, {"insee_code": "25615", "city_code": "villars les blamont", "zip_code": "25310", "label": "villars les blamont", "latitude": 47.368383721, "longitude": 6.871414913, "department_name": "doubs", "department_number": "25", "region_name": "bourgogne-franche-comt\\u00e9", "region_geojson_name": "Bourgogne-Franche-Comt\\u00e9"}, {"insee_code": "25619", "city_code": "les vil

### XML

In [57]:
dfCities.iloc[:10].to_xml("data/cities/cities_extract.xml", root_name="cities", row_name="city", index=False)

In [61]:
# NB: list columns obtained with dfCities.columns
dfCities.iloc[:10].to_xml(
    "data/cities/cities_extract2.xml", 
    root_name="cities", 
    row_name="city", 
    attr_cols=['insee_code', 'city_code', 'zip_code', 'label', 'latitude', 'longitude',
       'department_name', 'department_number', 'region_name',
       'region_geojson_name'],
    index=False
)

In [62]:
dfCities.to_xml(
    "data/cities/cities.xml", 
    root_name="cities", 
    row_name="city", 
    attr_cols=['insee_code', 'city_code', 'zip_code', 'label', 'latitude', 'longitude',
       'department_name', 'department_number', 'region_name',
       'region_geojson_name'],
    index=False
)

In [65]:
data2 = et.parse('data/cities/cities.xml')
data2

<lxml.etree._ElementTree at 0x14f4c16dd40>

In [67]:
extract = data2.xpath("//city[@label='valence']")
extract

[<Element city at 0x14f4723d240>, <Element city at 0x14f47efd1c0>]

In [70]:
for city in extract:
    print(et.dump(city))

<city insee_code="26362" city_code="valence" zip_code="26000" label="valence" latitude="44.922561335" longitude="4.913645994" department_name="dr&#xF4;me" department_number="26" region_name="auvergne-rh&#xF4;ne-alpes" region_geojson_name="Auvergne-Rh&#xF4;ne-Alpes"/>
  
None
<city insee_code="16392" city_code="valence" zip_code="16460" label="valence" latitude="45.882271274" longitude="0.317865949" department_name="charente" department_number="16" region_name="nouvelle-aquitaine" region_geojson_name="Nouvelle-Aquitaine"/>
  
None


In [76]:
# With BeautifulSoup
with open('data/cities/cities.xml', encoding='UTF-8') as f:
    soup = BeautifulSoup(f, "lxml-xml")

In [81]:
soup.select("city[label='valence']")

[<city city_code="valence" department_name="drôme" department_number="26" insee_code="26362" label="valence" latitude="44.922561335" longitude="4.913645994" region_geojson_name="Auvergne-Rhône-Alpes" region_name="auvergne-rhône-alpes" zip_code="26000"/>,
 <city city_code="valence" department_name="charente" department_number="16" insee_code="16392" label="valence" latitude="45.882271274" longitude="0.317865949" region_geojson_name="Nouvelle-Aquitaine" region_name="nouvelle-aquitaine" zip_code="16460"/>]

## Re, Regexp, Regular Expression
- pandas: mycolumn.str.match|fullmatch|contains(pattern)
- re: search, match, fullmatch, sub|subn

NB:
- match: pattern at the beginning of the word|line
- fullmatch: pattern describing the whole word|line

In [89]:
dfCities[dfCities.label.str.fullmatch(r"\w+ \w+")]

Unnamed: 0,insee_code,city_code,zip_code,label,latitude,longitude,department_name,department_number,region_name,region_geojson_name
1,25624,villers grelot,25640,villers grelot,47.361512,6.235167,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
3,25619,les villedieu,25240,les villedieu,46.713906,6.265831,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
4,25622,villers buzon,25170,villers buzon,47.228558,5.852187,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
8,02102,bouconville vauclair,02860,bouconville vauclair,49.460193,3.756685,aisne,02,hauts-de-france,Hauts-de-France
10,02124,brissy hamegicourt,02240,brissy hamegicourt,49.742858,3.399924,aisne,02,hauts-de-france,Hauts-de-France
...,...,...,...,...,...,...,...,...,...,...
39116,98805,dumbea,98836,dumbea ga,,,nouvelle-calédonie,988,nouvelle-calédonie,Nouvelle Calédonie
39119,98817,le mont dore,98810,mont dore,,,nouvelle-calédonie,988,nouvelle-calédonie,Nouvelle Calédonie
39134,98810,kaala gomen,98817,kaala gomen,,,nouvelle-calédonie,988,nouvelle-calédonie,Nouvelle Calédonie
39135,98813,la foa,98880,la foa,,,nouvelle-calédonie,988,nouvelle-calédonie,Nouvelle Calédonie


In [91]:
dfCities[dfCities.label.str.contains(r"toulo.?[ns]")]

Unnamed: 0,insee_code,city_code,zip_code,label,latitude,longitude,department_name,department_number,region_name,region_geojson_name
3610,71542,toulon sur arroux,71320,toulon sur arroux,46.682758,4.147734,saône-et-loire,71,bourgogne-franche-comté,Bourgogne-Franche-Comté
3958,39533,toulouse le chateau,39230,toulouse le chateau,46.821902,5.5832,jura,39,bourgogne-franche-comté,Bourgogne-Franche-Comté
20897,83137,toulon,83200,toulon,43.136537,5.932599,var,83,provence-alpes-côte d'azur,Provence-Alpes-Côte d'Azur
21105,83137,toulon,83100,toulon,43.136537,5.932599,var,83,provence-alpes-côte d'azur,Provence-Alpes-Côte d'Azur
21165,83137,toulon,83000,toulon,43.136537,5.932599,var,83,provence-alpes-côte d'azur,Provence-Alpes-Côte d'Azur
21447,31555,toulouse,31100,toulouse,43.596038,1.432095,haute-garonne,31,occitanie,Occitanie
21449,31555,toulouse,31300,toulouse,43.596038,1.432095,haute-garonne,31,occitanie,Occitanie
21907,31555,toulouse,31400,toulouse,43.596038,1.432095,haute-garonne,31,occitanie,Occitanie
21912,31575,vieille toulouse,31320,vieille toulouse,43.527981,1.438022,haute-garonne,31,occitanie,Occitanie
22025,3286,toulon sur allier,3400,toulon sur allier,46.51096,3.377284,allier,3,auvergne-rhône-alpes,Auvergne-Rhône-Alpes


In [92]:
dfCities[dfCities.label.str.contains(r"on$")]

Unnamed: 0,insee_code,city_code,zip_code,label,latitude,longitude,department_name,department_number,region_name,region_geojson_name
4,25622,villers buzon,25170,villers buzon,47.228558,5.852187,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
39,02583,pancy courtecon,02860,pancy courtecon,49.460356,3.649818,aisne,02,hauts-de-france,Hauts-de-France
89,07274,st maurice en chalencon,07190,st maurice en chalencon,44.854713,4.582379,ardèche,07,auvergne-rhône-alpes,Auvergne-Rhône-Alpes
92,07306,sampzon,07120,sampzon,44.412648,4.331866,ardèche,07,auvergne-rhône-alpes,Auvergne-Rhône-Alpes
110,02559,nouvion et catillon,02270,nouvion et catillon,49.707298,3.493589,aisne,02,hauts-de-france,Hauts-de-France
...,...,...,...,...,...,...,...,...,...,...
38904,97422,le tampon,97430,le tampon,-21.223167,55.558553,la réunion,974,la réunion,La Réunion
38905,97501,miquelon langlade,97500,st pierre et miquelon,46.951612,-56.324420,saint-pierre-et-miquelon,975,saint-pierre-et-miquelon,Saint-Pierre-et-Miquelon
38906,97502,st pierre,97500,st pierre et miquelon,46.779881,-56.189609,saint-pierre-et-miquelon,975,saint-pierre-et-miquelon,Saint-Pierre-et-Miquelon
38907,97501,miquelon langlade,97500,st pierre et miquelon,46.951612,-56.324420,saint-pierre-et-miquelon,975,saint-pierre-et-miquelon,Saint-Pierre-et-Miquelon


In [93]:
cities = list(dfCities.label)
cities[:10]

['ville du pont',
 'villers grelot',
 'villars les blamont',
 'les villedieu',
 'villers buzon',
 'villers la combe',
 'villers sous chalamont',
 'voujeaucourt',
 'bouconville vauclair',
 'bouresches']

In [None]:
# filter les villes qui:
# - avec un accent 

In [96]:
[ city for city in cities if re.search(r"[àéèêëîôùÿ]", city) ]

[]

In [97]:
[ city for city in cities if re.match(r"Saint", city, flags=re.I) ]

['saints en puisaye',
 'saintigny',
 'saintigny',
 'saintes',
 'saintes maries de la mer',
 'saints geosmes',
 'saints geosmes',
 'saintines',
 'saintry sur seine']

In [99]:
[ city for city in cities if re.match(r"St .*bar", city, flags=re.I) ]

['st barthelemy le meil',
 'st barthelemy',
 'st baraing',
 'st barthelemy de vals',
 'st barthelemy le plain',
 'st denis combarnazat',
 'st bardoux',
 'st barthelemy de beaurepaire',
 'st barthelemy de sechilienne',
 'st barthelemy lestra',
 'st georges de baroille',
 'st arcons de barges',
 'st barthelemy',
 'st barnabe',
 'st barthelemy',
 'st christophe des bardes',
 'st cibard',
 'st cybardeaux',
 'st ybard',
 'st barthelemy d agenais',
 'st barthelemy de bussiere',
 'st martial d albarede',
 'st dizier masbaraud',
 'st medard de barbezieux',
 'st just ibarre',
 'st bard',
 'st dizier masbaraud',
 'st barthelemy',
 'st barthelemy de bellegarde',
 'st cernin de labarde',
 'st quentin de baron',
 'st vincent de barbeyrargues',
 'st jean de barrou',
 'st etienne sous barbuise',
 'st andre en barrois',
 'st remy sous barbuise',
 'st barthelemy grozon',
 'st vincent de barres',
 'st barthelemy d anjou',
 'st ybars',
 'st barthelemy',
 'st barthelemy']