# Filesystem

In [30]:
import csv
import pandas as pd

## Read lines with open

In [4]:
# Open the CSV file as UTF-8 text. The handle stays open for the following
# cells, which read from it and then close it explicitly.
f = open('data/cities.csv', encoding='UTF-8')
# A bare expression at the end of a cell displays the file object's repr.
f

<_io.TextIOWrapper name='data/cities.csv' mode='r' encoding='UTF-8'>

In [5]:
# Read the whole remaining content as one string; this leaves the file
# position at end-of-file.
f.read()

'name,population,cp\nPau,77000,64000\nToulouse,477000,31000\nNîmes,150000,30000\n'

In [6]:
# A second read() returns an empty string: the file position is already at
# end-of-file, so nothing is left to read.
f.read()

''

In [7]:
# Release the file handle opened earlier.
f.close()

In [8]:
# `closed` is True once close() has been called on the file object.
f.closed

True

In [20]:
# Iterate over the file line by line. The `with` block closes the file when
# the block exits, even if an exception is raised while reading or printing.
with open('data/cities.csv', encoding='UTF-8') as f:
    for line in f:
        # Each line keeps its trailing newline and print() adds another one,
        # hence the blank line after every row in the output.
        print(line)

name,population,cp

Pau,77000,64000

Toulouse,477000,31000

Nîmes,150000,30000



In [12]:
# Same iteration on the semicolon-separated file, decoded as CP1252.
# The `with` block closes the file when the block exits, even if an
# exception is raised while reading or printing.
with open('data/cities_fr.csv', encoding='CP1252') as f:
    for line in f:
        # Each line keeps its trailing newline and print() adds another one,
        # hence the blank line after every row in the output.
        print(line)

name;population;cp

Pau;77000;64000

Toulouse;477000;31000

Nîmes;150000;30000



In [14]:
# Encode the same city name with several codecs to compare the resulting
# bytes: the three single-byte codecs agree on 'î', UTF-8 needs two bytes.
city = "Nîmes"
for encoding in ("CP1252", "ISO-8859-1", "ISO-8859-15", "UTF-8"):
    encoded = city.encode(encoding)
    print(encoding, '->', encoded)

CP1252 -> b'N\xeemes'
ISO-8859-1 -> b'N\xeemes'
ISO-8859-15 -> b'N\xeemes'
UTF-8 -> b'N\xc3\xaemes'


In [16]:
# Try to encode the euro sign with each codec; a codec that has no mapping
# for the character raises UnicodeEncodeError, which is reported instead.
word = "€"
for encoding in ("CP1252", "ISO-8859-1", "ISO-8859-15", "UTF-8"):
    try:
        encoded = word.encode(encoding)
    except UnicodeEncodeError:
        print(f"Can't encode '{word}' with encoding '{encoding}'")
    else:
        print(encoding, '->', encoded)

CP1252 -> b'\x80'
Can't encode '€' with encoding 'ISO-8859-1'
ISO-8859-15 -> b'\xa4'
UTF-8 -> b'\xe2\x82\xac'


In [17]:
# Encode a non-Latin string to UTF-8 bytes (three bytes per character here).
city = "東京"
code = city.encode(encoding='UTF-8')
# Display the resulting bytes object.
code

b'\xe6\x9d\xb1\xe4\xba\xac'

In [18]:
# Decoding the bytes with the same codec restores the original string.
code.decode('UTF-8')

'東京'

## Split lines

In [28]:
# Parse the semicolon-separated file by hand into a list of rows.
# Each row is [name, population, cp] with population converted to int;
# cp is kept as text.
sep = ";"
cities = []
# The `with` block closes the file even if a row fails to parse.
with open('data/cities_fr.csv', encoding='CP1252') as f:
    # Skip the header line; the default avoids StopIteration on an empty file.
    next(f, None)
    for line in f:
        # Drop the trailing newline before splitting into fields.
        data = line.rstrip().split(sep)
        data[1] = int(data[1])
        cities.append(data)
cities

[['Pau', 77000, '64000'],
 ['Toulouse', 477000, '31000'],
 ['Nîmes', 150000, '30000']]

## Read csv with pandas
In pandas, a 2D array (a table with labelled rows and columns) is called a `DataFrame`.

In [36]:
# IPython help syntax: a trailing `?` displays the signature and
# documentation of the object.
pd.read_csv?

Signature:
pd.read_csv(
    filepath_or_buffer: 'FilePath | ReadCsvBuffer[bytes] | ReadCsvBuffer[str]',
    *,
    sep: 'str | None | lib.NoDefault' = <no_default>,
    delimiter: 'str | None | lib.NoDefault' = None,
    header: "int | Sequence[int] | None | Literal['infer']" = 'infer',
    names: 'Sequence[Hashable] | None | lib.NoDefault' = <no_default>,
    index_col: 'IndexLabel | Literal[False] | None' = None,
    usecols:

In [37]:
# Load the CSV into a DataFrame. `cp` is forced to text so that values with
# a leading zero (e.g. '09000') are not turned into integers.
# NOTE(review): `cp` presumably holds postal codes - confirm.
df_cities = pd.read_csv('data/cities.csv', dtype={'cp': 'str'})
df_cities

Unnamed: 0,name,population,cp
0,Pau,77000,64000
1,Toulouse,477000,31000
2,Nîmes,150000,30000
3,Foix,9706,09000


In [38]:
# Print a summary of the frame: index range, column names, non-null counts,
# dtypes and memory usage.
df_cities.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   name        4 non-null      object
 1   population  4 non-null      int64 
 2   cp          4 non-null      object
dtypes: int64(1), object(2)
memory usage: 228.0+ bytes


In [39]:
# The object returned by pd.read_csv is a pandas DataFrame.
type(df_cities)

pandas.core.frame.DataFrame

In [40]:
# Attribute access selects a single column by name.
df_cities.population

0     77000
1    477000
2    150000
3      9706
Name: population, dtype: int64

In [41]:
# Bracket access selects the same column; unlike attribute access it also
# works for column names that are not valid Python identifiers.
df_cities['population']

0     77000
1    477000
2    150000
3      9706
Name: population, dtype: int64

In [44]:
# A single column of a DataFrame is a pandas Series.
type(df_cities.population)

pandas.core.series.Series

In [42]:
# Python's built-in sum() works because a Series is iterable; the next cell
# uses the Series' own aggregation methods instead.
sum(df_cities.population)

713706

In [45]:
# Aggregate the population column with the Series' own methods and report
# the four statistics on a single line.
population = df_cities['population']
total_population = population.sum()
min_population = population.min()
max_population = population.max()
avg_population = population.mean()
print(f"population: total={total_population}, min={min_population}, max={max_population}, average={avg_population}")

population: total=713706, min=9706, max=477000, average=178426.5


In [50]:
# Export the frame to a JSON file; orient='records' writes a list with one
# object per row, keyed by column name.
df_cities.to_json('data/cities.json', orient='records')

## Read csv with module csv

In [59]:
# Read the CSV with the standard csv module, accessing fields by index.
# newline='' lets the csv module handle line endings itself, and the `with`
# block closes the file even if a row fails to convert.
with open('data/cities.csv', newline='', encoding='UTF-8') as f:
    csv_reader = csv.reader(f, delimiter=",")
    # The first row holds the column names.
    headers = next(csv_reader)
    print("Headers:", headers)
    cities = []
    for data in csv_reader:
        # Only the population is converted to int; name and cp stay text.
        data_converted = (data[0], int(data[1]), data[2])
        cities.append(data_converted)
        print("Row:", data_converted)
cities

Headers: ['name', 'population', 'cp']
Row: ('Pau', 77000, '64000')
Row: ('Toulouse', 477000, '31000')
Row: ('Nîmes', 150000, '30000')
Row: ('Foix', 9706, '09000')


[('Pau', 77000, '64000'),
 ('Toulouse', 477000, '31000'),
 ('Nîmes', 150000, '30000'),
 ('Foix', 9706, '09000')]

In [60]:
# Same parsing as with index access, but each row is unpacked into named
# fields. The `with` block closes the file even if a row fails to convert
# or does not have exactly three fields.
with open('data/cities.csv', newline='', encoding='UTF-8') as f:
    csv_reader = csv.reader(f, delimiter=",")
    # The first row holds the column names.
    headers = next(csv_reader)
    print("Headers:", headers)
    cities = []
    for name, population, cp in csv_reader:
        # Only the population is converted to int; name and cp stay text.
        data_converted = (name, int(population), cp)
        cities.append(data_converted)
        print("Row:", data_converted)
cities

Headers: ['name', 'population', 'cp']
Row: ('Pau', 77000, '64000')
Row: ('Toulouse', 477000, '31000')
Row: ('Nîmes', 150000, '30000')
Row: ('Foix', 9706, '09000')


[('Pau', 77000, '64000'),
 ('Toulouse', 477000, '31000'),
 ('Nîmes', 150000, '30000'),
 ('Foix', 9706, '09000')]

In [63]:
# Build a DataFrame from the list of row tuples, using the CSV header row
# as column names.
# NOTE(review): this rebinds `df_cities`, replacing the frame loaded with
# pd.read_csv earlier in the notebook.
df_cities = pd.DataFrame(cities, columns=headers)
df_cities

Unnamed: 0,name,population,cp
0,Pau,77000,64000
1,Toulouse,477000,31000
2,Nîmes,150000,30000
3,Foix,9706,09000


## All cities in France
source: https://www.data.gouv.fr/fr/datasets/villes-de-france/

In [65]:
# Load the national dataset with pandas' default type inference.
# NOTE(review): zip_code is presumably inferred as an integer column here,
# which would drop leading zeros - consider dtype={'zip_code': 'str'} as
# done for `cp` when loading data/cities.csv; confirm against the file.
df_cities_fr = pd.read_csv('data/cities_gouv.csv')
# Displaying the full frame shows a truncated view (first and last rows).
df_cities_fr

Unnamed: 0,insee_code,city_code,zip_code,label,latitude,longitude,department_name,department_number,region_name,region_geojson_name
0,25620,ville du pont,25650,ville du pont,46.999873,6.498147,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
1,25624,villers grelot,25640,villers grelot,47.361512,6.235167,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
2,25615,villars les blamont,25310,villars les blamont,47.368384,6.871415,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
3,25619,les villedieu,25240,les villedieu,46.713906,6.265831,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
4,25622,villers buzon,25170,villers buzon,47.228558,5.852187,doubs,25,bourgogne-franche-comté,Bourgogne-Franche-Comté
...,...,...,...,...,...,...,...,...,...,...
39140,98829,thio,98829,thio,,,nouvelle-calédonie,988,nouvelle-calédonie,Nouvelle Calédonie
39141,98831,voh,98833,voh,,,nouvelle-calédonie,988,nouvelle-calédonie,Nouvelle Calédonie
39142,98832,yate,98834,yate,,,nouvelle-calédonie,988,nouvelle-calédonie,Nouvelle Calédonie
39143,98612,sigave,98620,sigave,-14.270411,-178.155263,wallis-et-futuna,986,wallis-et-futuna,Wallis-et-Futuna


In [66]:
# Boolean mask of rows whose label matches the pattern. str.match anchors
# the regular expression at the start of the string only, so any label that
# begins with 'nimes' is selected.
is_nimes = df_cities_fr['label'].str.match('nimes')
df_cities_fr.loc[is_nimes]

Unnamed: 0,insee_code,city_code,zip_code,label,latitude,longitude,department_name,department_number,region_name,region_geojson_name
21274,30189,nimes,30000,nimes,43.844658,4.347591,gard,30,occitanie,Occitanie
21668,30189,nimes,30900,nimes,43.844658,4.347591,gard,30,occitanie,Occitanie
24177,30189,nimes,30900,nimes,43.844658,4.347591,gard,30,occitanie,Occitanie
