# <center><a href='https://mybinder.org/v2/gh/fortierq/binder-mp2i/main?urlpath=git-pull%3Frepo%3Dhttps%253A%252F%252Fgithub.com%252Fmp2i-fsm%252Fmp2i-2021%26urlpath%3Dlab%252Ftree%252Fmp2i-2021%252F8_string%252F2_compression%252Fcompression_code.ipynb%26branch%3Dmain'>Code pour le cours Compression <img src=https://mybinder.org/badge.svg></a></center>

## Run-Length Encoding

In [1]:
let rle_code s =
    let ind = ref s.[0] in
    let nb = ref 1 in
    let l = ref [] in
    for i = 1 to String.length s - 1 do
        if !ind <> s.[i] then (
            l := (!ind, !nb)::!l;
            ind := s.[i];
            nb := 1
        )
        else incr nb
    done;
    List.rev ((!ind, !nb)::!l)

val rle_code : string -> (char * int) list = <fun>


In [2]:
let rec rle_decode = function
    | [] -> ""
    | (c, n)::q -> (String.make n c)^rle_decode q

val rle_decode : (char * int) list -> string = <fun>


## Codage de Huffman

### Construction de l'arbre de Huffman

In [3]:
let get_frequences text =
    let freq = Array.make 256 0 in
    for i = 0 to String.length text - 1 do
        freq.(Char.code text.[i]) <- freq.(Char.code text.[i]) + 1
    done;
    freq

val get_frequences : string -> int array = <fun>


In [4]:
type 'a tree = F of 'a | N of 'a tree * 'a tree

type 'a tree = F of 'a | N of 'a tree * 'a tree


On va utiliser la file de priorité suivante :

In [5]:
module Q = struct
  type 'a t = E | N of 'a * 'a t * 'a t
  let empty () = ref E
  let is_empty t = !t = E
  let add x t =
    let rec aux = function
      | E -> N(x, E, E)
      | N(r, g, d) -> if x < r then N(r, aux g, d) else N(r, g, aux d) in
    t := aux !t
  let take_min t = 
    let rec aux = function
      | E -> failwith "take_min"
      | N(r, g, d) -> if g = E then r, d 
                      else 
                        let m, g = aux g in
                        m, N(r, g, d) in
    let m, t_ = aux !t in 
    t := t_;
    m
end

module Q :
  sig
    type 'a t = E | N of 'a * 'a t * 'a t
    val empty : unit -> 'a t ref
    val is_empty : 'a t ref -> bool
    val add : 'a -> 'a t ref -> unit
    val take_min : 'a t ref -> 'a
  end


In [6]:
let make_huffman_tree freq =
    let q = Q.empty () in
    let n = ref 0 in
    for i = 0 to 255 do 
        if freq.(i) > 0 then (
            incr n;
            Q.add (freq.(i), F(Char.chr i)) q
        )
    done;
    for _ = 0 to !n - 2 do
        let f1, t1 = Q.take_min q in
        let f2, t2 = Q.take_min q in
        Q.add (f1 + f2, N(t1, t2)) q
    done;
    snd (Q.take_min q)

val make_huffman_tree : int array -> char tree = <fun>


In [7]:
let make_table t =
    let codes = Array.make 256 [] in
    let rec aux path = function
        | F(c) -> codes.(Char.code c) <- List.rev path
        | N(g, d) -> aux (0::path) g; aux (1::path) d in
    aux [] t;
    codes

val make_table : char tree -> int list array = <fun>


### Compression

In [8]:
let compress_huffman text =
    let freq = get_frequences text in
    let t = make_huffman_tree freq in
    let table = make_table t in
    let rec aux i = 
        if i = String.length text then []
        else table.(Char.code text.[i]) @ aux (i + 1) in
    aux 0

val compress_huffman : string -> int list = <fun>


In [9]:
let coded_text = compress_huffman "Un algorithme est une suite finie et non ambiguë d'instructions et d'opérations permettant de résoudre une classe de problèmes. Le mot algorithme vient d'Al-Khwârizmî (en arabe : الخوارزمي)2, nom d'un mathématicien persan du IXe siècle. Le domaine qui étudie les algorithmes est appelé l'algorithmique. On retrouve aujourd'hui des algorithmes dans de nombreuses applications telles que le fonctionnement des ordinateurs3, la cryptographie, le routage d'informations, la planification et l'utilisation optimale des ressources, le traitement d'images, le traitement de textes, la bio-informatique, etc. "

val coded_text : int list =
  [0; 0; 1; 0; 1; 1; 0; 0; 1; 0; 0; 1; 1; 1; 0; 0; 0; 1; 1; 0; 1; 1; 1; 0; 1;
   0; 1; 1; 1; 1; 0; 0; 0; 0; 0; 1; 1; 1; 1; 1; 1; 0; 1; 1; 1; 1; 0; 1; 0; 1;
   1; 1; 1; 1; 1; 1; 0; 0; 1; 0; 1; 0; 1; 0; 0; 0; 1; 0; 0; 0; 0; 1; 1; 1; 0;
   1; 1; 0; 0; 1; 1; 1; 0; 0; 0; 0; 1; 1; 0; 1; 0; 1; 0; 0; 0; 0; 0; 1; 1; 1;
   1; 0; 0; 1; 0; 1; 1; 1; 1; 0; 1; 0; 1; 0; 1; 0; 0; 1; 1; 0; 0; 0; 0; 1; 1;
   0; 1; 1; 0; 0; 1; 1; 1; 0; 1; 1; 0; 1; 0; 1; 0; 0; 0; 1; 0; 1; 1; 0; 1; 1;
   0; 0; 0; 0; 1; 1; 0; 0; 0; 0; 0; 0; 1; 1; 1; 0; 0; 0; 1; 1; 0; 1; 1; 0; 0;
   1; 1; 1; 0; 0; 0; 0; 0; 1; 0; 1; 1; 0; 1; 1; 1; 1; 0; 1; 1; 1; 0; 0; 1; 0;
   1; 0; 1; 1; 0; 1; 1; 1; 0; 0; 1; 0; 1; 1; 0; 0; 1; 0; 1; 0; 0; 0; 1; 1; 1;
   0; 1; 1; 0; 1; 1; 0; 0; 1; 1; 0; 0; 0; 1; 1; 1; 0; 1; 1; 1; 1; 1; 1; 1; 1;
   1; 0; 0; 1; 0; 1; 0; 1; 0; 1; 1; 0; 1; 1; 0; 1; 1; 0; 0; 0; 0; 0; 0; 1; 1;
   0; 0; 0; 1; 1; 0; 0; 0; 1; 0; 1; 1; 0; 1; 1; 0; 0; 1; 0; 1; 0; 0; 0; 1;
   ...]


In [10]:
(List.length coded_text) / 8 (* nombre d'octets *)

- : int = 358


### Décompression

In [11]:
let rec decode_huffman t code =
    let rec read_char t l = match t, l with
        | F(c), _ -> c, l
        | N(g, d), 0::q -> read_char g q
        | N(g, d), 1::q -> read_char d q
        | _ -> failwith "codage incorrect" in
    if code = [] then ""
    else let c, l = read_char t code in
    (String.make 1 c)^decode_huffman t l

val decode_huffman : char tree -> int list -> string = <fun>


In [12]:
decode_huffman t coded_text

error: compile_error

On est passé à une taille de 620 octets à 358, soit un gain de $42$% :

In [13]:
1. -. 358./.620.

- : float = 0.422580645161290347


Pour pouvoir décompresser, il faut aussi stocker l'arbre de Huffman dans un fichier, en le sérialisant (transformation en chaîne de caractères) :

In [14]:
let rec serialize_tree = function
| F c -> ['*'; c]
| N (g, d) -> '#'::(serialize_tree g)@serialize_tree d

val serialize_tree : char tree -> char list = <fun>


In [15]:
let t_serial = serialize_tree t

error: compile_error

In [16]:
let deserialize l = 
    let rec read_tree = function
        | '*'::c::q -> F(c), q
        | '#'::q -> let g, q1 = read_tree q in
                    let d, q2 = read_tree q1 in
                    N(g, d), q2 
        | _ -> failwith "deserialize" in
    fst (read_tree l) in
deserialize t_serial

error: compile_error

## LZW

In [17]:
let code s =
  let n = String.length s in
  let d = Hashtbl.create 13 in
  let dinv = Hashtbl.create 13 in
  let max_code = ref 0 in
  let add_code k v =
      Hashtbl.add d k v;
      Hashtbl.add dinv v k in
  for i = 0 to n - 1 do (* initialize d *)
    if not (Hashtbl.mem d (String.sub s i 1)) then (
      add_code (String.sub s i 1) !max_code;
      incr max_code
    )
  done;
  let rec aux i = (* returns compression of s[i:] *)
    if i >= String.length s then []
    else
      let rec max_suffix j =
        if j = n || not (Hashtbl.mem d (String.sub s i (j - i + 1))) then j
        else max_suffix (j + 1) in
      let j = max_suffix i in
      let w = String.sub s i (j - i) in
      if j < n then (
        let w' = String.sub s i (j - i + 1) in
        add_code w' (!max_code + 1);
        incr max_code;
      );
      (Hashtbl.find d w)::aux j
  in
  aux 0, dinv

val code : string -> int list * (int, string) Hashtbl.t = <fun>


In [18]:
let c, dinv = code "barbapapaba"

val c : int list = [0; 1; 2; 5; 3; 1; 9; 5]
val dinv : (int, string) Hashtbl.t = <abstr>


In [19]:
let rec lzw_decode = function
    | [] -> ""
    | e::q -> (Hashtbl.find dinv e)^lzw_decode q

val lzw_decode : int list -> string = <fun>


In [20]:
lzw_decode c

- : string = "barbapapaba"
