# Converting PDFs to text files and saving them in a new directory

## Lister les fichiers dans `data/pdf`

In [20]:
from os import listdir
from os.path import isfile, join

pdf_path = '../data/pdf'

# Keep only regular files whose name ends in ".pdf" (case-insensitive).
# The directory may also hold non-PDF entries (e.g. .DS_Store on macOS);
# this list is meant to contain PDFs only.
pdfs = [
    f for f in listdir(pdf_path)
    if isfile(join(pdf_path, f)) and f.lower().endswith('.pdf')
]

In [21]:
# Preview the first three entries of the listing.
pdfs[:3]

['Bxl_1851_Tome_I1_Part_2.pdf',
 'Bxl_1851_Tome_I1_Part_3.pdf',
 'Bxl_1851_Tome_I1_Part_1.pdf']

In [22]:
# Number of entries in `pdfs`.
len(pdfs)

100

## Installer PdfToText (MacOS)

Dans le terminal, lancer :

```
ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)" < /dev/null 2> /dev/null ; brew install caskroom/cask/brew-cask 2> /dev/null
```

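Et puis (avec les versions récentes de Homebrew, la commande `brew cask install` a été supprimée : utiliser `brew install --cask pdftotext` à la place) :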
```
brew cask install pdftotext
```

## Convertir les PDFs en TXT

In [23]:
import subprocess

In [24]:
from pathlib import Path

# Directory that receives the converted text files.
txt_path = '../data/txt'

# Create the directory together with any missing parents; an already
# existing directory is left untouched and raises no error.
Path(txt_path).mkdir(exist_ok=True, parents=True)

In [25]:
# Names of the regular files currently present in the text output directory.
txts = list(filter(lambda name: isfile(join(txt_path, name)), listdir(txt_path)))

In [26]:
# PDFs for which no text file of the expected name is listed in `txts`.
# The expected name is the PDF name with its last three characters
# replaced by "txt".
not_converted_pdfs = [
    pdf_name
    for pdf_name in pdfs
    if pdf_name[:-3] + 'txt' not in txts
]

In [27]:
# Total number of entries in `pdfs`, for comparison with the counts below.
len(pdfs)

100

In [28]:
# Number of entries in `txts`.
len(txts)

0

In [30]:
# Number of PDFs with no matching text file name in `txts`.
len(not_converted_pdfs)

100

In [31]:
# Convert only the PDFs that have no text file yet, so that re-running the
# notebook does not redo conversions that were already done.
for pdf in not_converted_pdfs:
    command = [
        'pdftotext',
        f'{pdf_path}/{pdf}',
        f'{txt_path}/{pdf[:-3]}txt'
    ]
    # capture_output=True hides whatever pdftotext prints, and run() does not
    # raise on a non-zero exit status by default, so check the result
    # explicitly instead of reporting success unconditionally.
    result = subprocess.run(command, capture_output=True)
    if result.returncode == 0:
        print(f"{pdf} converted!")
    else:
        error_message = result.stderr.decode(errors='replace').strip()
        print(f"{pdf} NOT converted (exit code {result.returncode}): {error_message}")
    

Bxl_1851_Tome_I1_Part_2.pdf converted!
Bxl_1851_Tome_I1_Part_3.pdf converted!
Bxl_1851_Tome_I1_Part_1.pdf converted!
Bxl_1852_Tome_I1_Part_5.pdf converted!
Bxl_1852_Tome_I1_Part_4.pdf converted!
Bxl_1851_Tome_I1_Part_4.pdf converted!
Bxl_1852_Tome_I1_Part_1.pdf converted!
Bxl_1851_Tome_I1_Part_5.pdf converted!
Bxl_1852_Tome_I1_Part_3.pdf converted!
Bxl_1852_Tome_I1_Part_2.pdf converted!
Bxl_1857_Tome_II1_Part_1.pdf converted!
Bxl_1850_Tome_II1_Part_4.pdf converted!
Bxl_1854_Tome_II1_Part_3.pdf converted!
Bxl_1854_Tome_I1_Part_3.pdf converted!
Bxl_1851_Tome_II1_Part_5.pdf converted!
Bxl_1849_Tome_I1_Part_2.pdf converted!
Bxl_1855_Tome_II1_Part_2.pdf converted!
Bxl_1849_Tome_II1_Part_4.pdf converted!
Bxl_1849_Tome_II1_Part_5.pdf converted!
Bxl_1855_Tome_II1_Part_3.pdf converted!
Bxl_1849_Tome_I1_Part_3.pdf converted!
Bxl_1856_Tome_II1_Part_1.pdf converted!
Bxl_1851_Tome_II1_Part_4.pdf converted!
Bxl_1854_Tome_I1_Part_2.pdf converted!
Bxl_1854_Tome_II1_Part_2.pdf converted!
Bxl_1850_Tome_

## Consolider tous les fichiers en un seul (Bash)

In [32]:
# Concatenate every .txt file of ../data/txt into ../data/all.txt; the ">" redirection overwrites any previous all.txt.
!cat ../data/txt/*.txt > ../data/all.txt

## Compter le nombre de mots

In [33]:
# Without options, wc prints the line, word and byte counts in that order; the word count is the second number.
!wc ../data/all.txt

  147140 4875580 21138004 ../data/all.txt
