In [4]:
## Extractor de Buletine de Avizare Restrictie (BAR)
## author: Mihai Croicu
## organization: Asociatia Proinfrastructura

#Cauta ultimul BAR de pe site-ul CFR si il descarca automat.
#Apoi converteste BAR-ul intr-un format sane, pe care-l salveaza ca JSON si CSV.

#Iti trebuie curl instalat si disponibil in PATH.
#Chestia asta e facuta sa mearga pe un sistem Linux sau OSX. 
#Ai nevoie de curl si soffice in path. Poti testa deschizand un terminal si tastand soffice && curl
#Pe windows, cat timp merge in powershell curl si soffice, ar *trebui* sa mearga. 
#Daca nu, Ubuntu subsystem for Windows ar trebui sa mearga.

In [9]:
if (!require("pacman")) install.packages("pacman")
pacman::p_load(tidyverse, docxtractr, textreadr, jsonlite)
pacman::p_load(rvest)
#Incarca RVEST la sfarsit. Namespace-ul rvest e busit (R!) asa ca rvest::read_html nu merge
#Si celelalte pachete expun si ele un read_html.

In [2]:
# Download the latest BAR document for one regional branch.
# `oras` is the 1-based position of the branch among the download buttons:
#1:Bucureşti 2:Braşov 3:Cluj 4:Constanţa 5:Craiova 6:Galaţi 7:Iaşi 8:Timişoara
oras <- 1

cfr_web <- read_html('http://cfr.ro/index.php/ct-menu-item-105/ct-menu-item-116')

# Every '.art-button' element on the page carries the href of one bulletin.
link <- html_nodes(cfr_web,'.art-button') %>% html_attr('href')
if (oras < 1 || oras > length(link) || is.na(link[oras])) {
    stop("Nu am gasit link-ul de descarcare pentru orasul ", oras, call. = FALSE)
}
web_bar_bucuresti <- paste0('http://cfr.ro',link[oras])

print(paste0("Fetching ", web_bar_bucuresti))
# The URL is scraped from a web page, so quote it before it reaches the shell.
fetch_command <- paste0('curl ',shQuote(web_bar_bucuresti),' -o bar.doc')
fetch_status <- system(fetch_command)
if (fetch_status != 0) {
    stop("curl a esuat (cod ", fetch_status, ") pentru ", web_bar_bucuresti, call. = FALSE)
}

[1] "Fetching http://cfr.ro/files/buletine-avizare/decada-1-10-03-2021/bucuresti.doc"


In [3]:
print ("Converting MSDOC to DOCX...:")
# Requires LibreOffice's `soffice` binary on PATH (see the notes at the top of
# the file). Remove any bar.docx left over from a previous run first, so that
# the existence check below cannot be satisfied by stale output.
unlink('bar.docx')
convert_status <- system('soffice --headless --convert-to docx bar.doc')
if (convert_status != 0 || !file.exists('bar.docx')) {
    stop("Conversia bar.doc -> bar.docx a esuat; verifica daca soffice este in PATH",
         call. = FALSE)
}

[1] "Converting MSDOC to DOCX...:"


In [5]:
## Obtine lista de linii

In [7]:
# Load the text content of the converted document with textreadr; the result is
# searched for line headers further down.
print("Citim restrictiile...")
print("Procesul este relativ lent. Asteptati...")
bucuresti_b0 <- "bar.docx" %>% textreadr::read_docx()

[1] "Citim restrictiile..."
[1] "Procesul este relativ lent. Asteptati..."


In [11]:
print("Extragem lista de linii...")

# A line header is a text entry containing "LINIA", one blank character
# (space or tab) and then a digit, e.g. "LINIA 100" or "LINIA 301 Ba".
este_cap_de_linie <- str_detect(bucuresti_b0, "LINIA[:blank:][:digit:]")
lista_linii <- bucuresti_b0[este_cap_de_linie]

[1] "Extragem lista de linii..."


In [12]:
print("Extragem tabelele de restrictii...")
# read_docx is namespaced explicitly because textreadr exports a function with
# the same name.
real_world <- docxtractr::read_docx("bar.docx")
docx_tbl_count(real_world)
# Pull every table out of the document in one call.
tbls <- real_world %>%
    docxtractr::docx_extract_all_tbls(preserve = TRUE, trim = TRUE)

[1] "Extragem tabelele de restrictii..."


In [14]:
# Drop the "LINIA" prefix so only the line identifier remains, then put an empty
# label in position 1 (the extraction loop starts from index 2; presumably the
# first table in the document has no line header - TODO confirm).
lista_linii <- lista_linii %>%
    str_replace('LINIA[:blank:]', '') %>%
    append(x = '', after = 1)

In [15]:
lista_linii

In [18]:
# Each entry of lista_linii labels exactly one table, so the two counts must
# agree before tables are paired with line identifiers by position.
nr_tabele <- docx_tbl_count(real_world)
if (length(lista_linii) != nr_tabele) {
    stop(
        "value_error : Numarul de tabele de restrictii (", nr_tabele,
        ") difera de numarul de capete de tabel (", length(lista_linii), ")",
        call. = FALSE
    )
}

In [None]:
## Obtine restrictiile

In [19]:
scoate_magistrala <- function(index, tabele = tbls) {
    # Extract the speed restrictions of one line from its table.
    #
    # :index  - position of the table inside `tabele`
    # :tabele - list of extracted tables; defaults to the script-level `tbls`,
    #           so existing calls scoate_magistrala(i) keep working
    # :return - data frame with columns borne, viteza, nume, fir
    #
    # Columns V2-V4 are tagged fir = 1 and columns V7-V9 are tagged fir = 2.
    # Rows whose speed cell (V3 / V8) is empty are dropped.
    tabel_restrictii <- tabele[[index]]

    # mutate() is used for the `fir` tag because it also works when the filter
    # leaves zero rows.
    fir1 <- tabel_restrictii %>%
        filter(V3 != '') %>%
        select(borne = V2, viteza = V3, nume = V4) %>%
        mutate(fir = 1)
    fir2 <- tabel_restrictii %>%
        filter(V8 != '') %>%
        select(borne = V7, viteza = V8, nume = V9) %>%
        mutate(fir = 2)

    # Speeds arrive as text; cells that are not plain integers become NA (with
    # a coercion warning).
    rbind(fir1, fir2) %>% mutate(viteza = as.integer(viteza))
}

In [20]:
identifica_restrictii <- function(fir){
    # Convert the kilometre-post cells of a `fir` table to numeric positions.
    #
    # :fir    - data frame with a character column `borne`; each cell holds two
    #           posts separated by a newline, e.g. "66+291\n39+020"
    # :return - data.frame(r_start, r_end) in kilometres, one row per row of
    #           `fir` (zero rows when `fir` is empty)
    #
    # Only the first two lines of a cell are used. A cell with fewer than two
    # lines, or with a post that is not of the form km+m, yields NA.
    if (nrow(fir) == 0) {
        return(data.frame(r_start = double(), r_end = double()))
    }

    # "160+0" and "160+000" both mean km 160: go through metres, back to km.
    borna_in_km <- function(borna) {
        parti <- str_split(borna, '\\+', simplify = TRUE)
        if (ncol(parti) < 2) {
            return(rep(NA_real_, length(borna)))
        }
        (as.integer(parti[, 1]) * 1000 + as.integer(parti[, 2])) / 1000
    }

    borne <- fir[["borne"]]
    # Minimal clean-up: drop short parenthesised annotations.
    # NOTE(review): the character classes use 1-9, so an annotation containing
    # the digit 0 is not removed - confirm whether 0-9 was intended.
    borne <- str_replace_all(borne, '[[:blank:]|\n]\\([a-zA-Z1-9]*[:blank:]*[a-z1-9]*\\)', '')
    # A restriction covering the whole line is encoded as the interval 0 - 0.
    borne <- str_replace_all(borne, 'toată linia', '0+000\n0+000')

    # One matrix row per restriction: column 1 is the start post, column 2 the
    # end post. Columns are addressed explicitly so that the output stays
    # aligned with `fir` even when a cell has an unexpected number of lines.
    capete <- str_split(borne, '\\n', simplify = TRUE)
    capat_start <- capete[, 1]
    capat_stop <- if (ncol(capete) >= 2) capete[, 2] else rep('', nrow(capete))

    data.frame(r_start = borna_in_km(capat_start), r_end = borna_in_km(capat_stop))
}

In [21]:
# Assemble the final table: one chunk per line, bound together once at the end.
# Index 1 of lista_linii is the empty placeholder, so it is skipped;
# seq_along()[-1] is empty (instead of counting 2, 1) when nothing else exists.
indici_linii <- seq_along(lista_linii)[-1]
bucati <- vector("list", length(indici_linii))
for (k in seq_along(indici_linii)) {
    i <- indici_linii[k]
    print(paste(i," Extrag : ",lista_linii[i]))
    fir <- scoate_magistrala(i)
    restrictii <- identifica_restrictii(fir)
    fir['r_start'] <- restrictii['r_start']
    fir['r_end'] <- restrictii['r_end']
    fir['linie'] <- lista_linii[i]
    bucati[[k]] <- fir
}
bar_final <- if (length(bucati) > 0) do.call(rbind, bucati) else data.frame()


[1] "2  Extrag :  100"
[1] "3  Extrag :  101"
[1] "4  Extrag :  102"
[1] "5  Extrag :  103"
[1] "6  Extrag :  105"
[1] "7  Extrag :  106 A"
[1] "8  Extrag :  107"
[1] "9  Extrag :  107 A"
[1] "10  Extrag :  108"
[1] "11  Extrag :  109"
[1] "12  Extrag :  112"
[1] "13  Extrag :  116"
[1] "14  Extrag :  136"
[1] "15  Extrag :  143"
[1] "16  Extrag :  144"
[1] "17  Extrag :  200"
[1] "18  Extrag :  201"
[1] "19  Extrag :  203"
[1] "20  Extrag :  205"
[1] "21  Extrag :  218"
[1] "22  Extrag :  300"
[1] "23  Extrag :  301 Ba"
[1] "24  Extrag :  301 Bb"
[1] "25  Extrag :  301 D"
[1] "26  Extrag :  301 De"
[1] "27  Extrag :  301 Eb"
[1] "28  Extrag :  301 F"
[1] "29  Extrag :  301 F1"
[1] "30  Extrag :  301 G"
[1] "31  Extrag :  301 J"
[1] "32  Extrag :  301 K"
[1] "33  Extrag :  301 M"
[1] "34  Extrag :  301 N"
[1] "35  Extrag :  301 O"
[1] "36  Extrag :  301 P"
[1] "37  Extrag :  301 X"
[1] "38  Extrag :  301 Z2"
[1] "39  Extrag :  304"
[1] "40  Extrag :  305"
[1] "41  Extrag :  306"
[1] "4

“NAs introduced by coercion”
“NAs introduced by coercion”


[1] "61  Extrag :  801 B"
[1] "62  Extrag :  802"
[1] "63  Extrag :  804"
[1] "64  Extrag :  807"
[1] "65  Extrag :  812"
[1] "66  Extrag :  813"


“NAs introduced by coercion”
“NAs introduced by coercion”


[1] "67  Extrag :  813 A"
[1] "68  Extrag :  814"
[1] "69  Extrag :  818"


In [22]:
bar_final

borne,viteza,nume,fir,r_start,r_end,linie
<chr>,<int>,<chr>,<dbl>,<dbl>,<dbl>,<chr>
2+800 3+200,30,Bucureşti Nord - Bucureştii Noi,1,2.800,3.20,100
26+000 26+050,30,St. Grădinari peste sch. 6 din bretea 2 - 4 - 6 - 8 Cap Y,1,26.000,26.05,100
38+150 38+700,30,Vadu Lat - Zăvestreni,1,38.150,38.70,100
45+000 46+600,70,St. Zăvestreni linia 2 directă și zonă aparate de cale Cap X + Cap Y,1,45.000,46.60,100
47+300 47+400,50,Zăvestreni - Videle,1,47.300,47.40,100
79+700 81+000,50,Olteni - Rădoieşti,1,79.700,81.00,100
84+280 85+540,30,St. Rădoieşti linia 3 directă,1,84.280,85.54,100
90+600 90+650,30,Rădoieşti - Atârnaţi,1,90.600,90.65,100
94+300 94+350,30,Atârnaţi - Roşiori Nord,1,94.300,94.35,100
99+300 99+600,30,St. Roşiori Nord linia 4 directă Cap X,1,99.300,99.60,100


In [26]:
# Save the extracted restrictions as JSON in the working directory.
jsonlite::write_json(bar_final, "bar_extras.json")

In [25]:
# Save the extracted restrictions as CSV in the working directory.
bar_final %>% write_csv("bar_extras.csv")