# Data analysis

In [133]:
import collections
import os
import re
import subprocess

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import tqdm

## Functions to read and write text/tokens

In [2]:
def get_stream_size(stream):
    """Count the lines of a seekable stream and rewind it to the start.

    The stream is consumed from its current position to the end, then
    repositioned at offset 0 so the caller can iterate over it again.

    Args:
        stream: An iterable, seekable stream (e.g. an open text file).

    Returns:
        The number of items yielded while iterating over the stream.
    """
    line_count = 0
    for _line in stream:
        line_count += 1
    # Rewind so that a subsequent loop over the stream starts from the top.
    stream.seek(0)
    return line_count

def read_token_file(file_name: str) -> list:
    """Read a whitespace-tokenized text file into a list of token lists.

    Every line is stripped and split on whitespace, so the result holds one
    entry per line of the file; a blank line yields an empty list.

    Args:
        file_name: Path of the file to read. The file is decoded as UTF-8.

    Returns:
        A list containing one list of string tokens per line.
    """
    # One pass is enough: the line count that used to be computed up front
    # was never used and forced the whole file to be read twice.
    # The encoding is explicit so accented characters decode identically
    # regardless of the platform's locale default.
    with open(file_name, 'r', encoding='utf-8') as stream:
        return [line.strip().split() for line in stream]

def read_text_file(file_name: str) -> list:
    """Read a text file into a list of stripped lines.

    Leading and trailing whitespace (including the newline) is removed from
    every line; a blank line yields an empty string, so the result holds one
    entry per line of the file.

    Args:
        file_name: Path of the file to read. The file is decoded as UTF-8.

    Returns:
        A list containing one stripped string per line.
    """
    # One pass is enough: the line count that used to be computed up front
    # was never used and forced the whole file to be read twice.
    # The encoding is explicit so accented characters decode identically
    # regardless of the platform's locale default.
    with open(file_name, 'r', encoding='utf-8') as stream:
        return [line.strip() for line in stream]

def write_text_from_tokens(tokens, output_file):
    """Write token sequences to a text file, one space-joined line each.

    Args:
        tokens: Iterable of token sequences; each sequence is an iterable of
            strings that is joined with single spaces to form one line.
        output_file: Path of the file to create or overwrite.
    """
    # Plain 'w' is sufficient because nothing is read back from the file.
    # UTF-8 is explicit so accented tokens never fail to encode under a
    # restrictive locale default.
    with open(output_file, 'w', encoding='utf-8') as out_stream:
        out_stream.writelines(' '.join(sequence) + '\n' for sequence in tokens)
    

### Corpus Information Functions

In [165]:
# Token pattern: either a run of word characters, or a single punctuation
# character from the bracketed set.
reg = r"[\w]+|[.,!?;:'()\[\]{}\"]"

In [166]:
# Total number of regex tokens over all English aligned sentences.
sum(len(re.findall(reg, sentence)) for sentence in en_aligned_text_file)

205374

In [168]:
# Number of regex tokens found in each English aligned sentence.
list(map(len, (re.findall(reg, sentence) for sentence in en_aligned_text_file)))

[11,
 35,
 7,
 14,
 24,
 6,
 35,
 12,
 24,
 21,
 9,
 12,
 9,
 31,
 26,
 31,
 5,
 10,
 20,
 21,
 14,
 16,
 21,
 24,
 8,
 14,
 29,
 15,
 30,
 20,
 27,
 32,
 15,
 34,
 23,
 27,
 29,
 10,
 20,
 19,
 4,
 6,
 13,
 4,
 23,
 5,
 4,
 10,
 38,
 17,
 9,
 8,
 23,
 4,
 23,
 29,
 5,
 34,
 14,
 12,
 23,
 30,
 36,
 7,
 29,
 6,
 8,
 34,
 11,
 15,
 24,
 10,
 13,
 16,
 29,
 13,
 13,
 3,
 6,
 10,
 38,
 10,
 10,
 6,
 41,
 9,
 21,
 15,
 27,
 61,
 32,
 28,
 13,
 23,
 18,
 19,
 45,
 33,
 4,
 11,
 27,
 18,
 20,
 31,
 14,
 5,
 33,
 23,
 9,
 8,
 19,
 17,
 4,
 8,
 36,
 8,
 7,
 12,
 16,
 25,
 9,
 20,
 34,
 9,
 25,
 34,
 22,
 18,
 29,
 8,
 12,
 6,
 14,
 10,
 11,
 16,
 21,
 14,
 17,
 22,
 30,
 32,
 35,
 32,
 15,
 16,
 28,
 6,
 5,
 29,
 28,
 12,
 25,
 29,
 11,
 6,
 11,
 23,
 9,
 18,
 14,
 15,
 18,
 22,
 15,
 8,
 26,
 15,
 39,
 14,
 8,
 28,
 20,
 15,
 7,
 6,
 9,
 23,
 8,
 3,
 21,
 30,
 17,
 17,
 22,
 30,
 22,
 35,
 17,
 8,
 35,
 30,
 16,
 10,
 21,
 14,
 16,
 17,
 20,
 23,
 29,
 25,
 15,
 5,
 32,
 12,
 27,
 33,
 12,
 7

In [175]:
class corpus_information():
    """Descriptive statistics for a corpus given as a sequence of sentences.

    Every statistic is computed eagerly in ``__init__`` and exposed as a
    plain attribute:

    * ``counter``: ``collections.Counter`` of all tokens in the corpus.
    * ``word_in_sequence``: number of tokens in each sentence.
    * ``count_words`` / ``max_words_in_sequence`` / ``mean_words_in_sequence``:
      total, maximum and mean of ``word_in_sequence``.
    * ``count_unique_words``: number of distinct tokens.
    * ``most_common_words``: tuple of the ``n_most_common`` most frequent
      tokens.
    * ``count_sequences``: number of sentences.
    * ``sequences_length``: ``len()`` of each sentence, i.e. its length in
      characters when sentences are strings.
    * ``max_sequences_length`` / ``mean_sequences_length``: maximum and mean
      of ``sequences_length``.

    An empty corpus is supported: counts, maxima and means are then 0 and
    ``most_common_words`` is an empty tuple.

    Args:
        corpus: Sized sequence of sentence strings.
        language: Language name, used only in the printed report.
        dataset_name: Dataset name, used in the report and the plot title.
        n_most_common: How many of the most frequent tokens to keep.
        remove_punctuation: When true, only runs of word characters are
            counted as tokens; otherwise the listed punctuation characters
            count as tokens too.
    """

    # A run of word characters, or one punctuation character from the set.
    _WORDS_AND_PUNCTUATION_PATTERN = r"\w+|[.,!?;:'()\[\]{}\"]"
    # Runs of word characters only.
    _WORDS_ONLY_PATTERN = r"\w+"

    def __init__(self, corpus, language, dataset_name, n_most_common=20, remove_punctuation=False):
        self.corpus = corpus
        self.language = language
        self.dataset_name = dataset_name
        self.n_most_common = n_most_common
        self.remove_punctuation = remove_punctuation
        # Order matters below: later statistics read earlier attributes.
        self.counter = self._counter_corpus()
        self.word_in_sequence = self._word_in_sequence()
        self.count_words = self._count_words()
        self.max_words_in_sequence = self._max_words_in_sequence()
        self.mean_words_in_sequence = self._mean_words_in_sequence()
        self.count_unique_words = self._count_unique_words()
        self.most_common_words = self._most_common_words()
        self.count_sequences = self._count_sequences()
        self.sequences_length = self._sequences_length()
        self.max_sequences_length = self._max_sequences_length()
        self.mean_sequences_length = self._mean_sequences_length()

    def _token_pattern(self):
        """Return the tokenization regex selected by ``remove_punctuation``."""
        if self.remove_punctuation:
            return self._WORDS_ONLY_PATTERN
        return self._WORDS_AND_PUNCTUATION_PATTERN

    def _counter_corpus(self):
        """Count every token of the corpus, one sentence at a time."""
        pattern = self._token_pattern()
        counter = collections.Counter()
        # Updating per sentence avoids building one list of all tokens first.
        for sentence in self.corpus:
            counter.update(re.findall(pattern, sentence))
        return counter

    def _word_in_sequence(self):
        """Return the number of tokens found in each sentence."""
        pattern = self._token_pattern()
        return [len(re.findall(pattern, sentence)) for sentence in self.corpus]

    def _count_words(self):
        """Return the total number of tokens in the corpus."""
        return sum(self.word_in_sequence)

    def _max_words_in_sequence(self):
        """Return the largest per-sentence token count (0 if no sentences)."""
        # np.max raises ValueError on an empty sequence.
        return np.max(self.word_in_sequence) if self.word_in_sequence else 0

    def _mean_words_in_sequence(self):
        """Return the mean per-sentence token count (0.0 if no sentences)."""
        # np.mean of an empty sequence would be nan with a RuntimeWarning.
        return np.mean(self.word_in_sequence) if self.word_in_sequence else 0.0

    def _count_unique_words(self):
        """Return the number of distinct tokens."""
        return len(self.counter)

    def _most_common_words(self):
        """Return the ``n_most_common`` most frequent tokens as a tuple."""
        # Unpacking pairs directly also works when the counter is empty.
        return tuple(word for word, _count in self.counter.most_common(self.n_most_common))

    def _count_sequences(self):
        """Return the number of sentences in the corpus."""
        return len(self.corpus)

    def _sequences_length(self):
        """Return ``len()`` of each sentence."""
        return [len(sentence) for sentence in self.corpus]

    def _max_sequences_length(self):
        """Return the largest sentence length (0 if no sentences)."""
        return np.max(self.sequences_length) if self.sequences_length else 0

    def _mean_sequences_length(self):
        """Return the mean sentence length (0.0 if no sentences)."""
        return np.mean(self.sequences_length) if self.sequences_length else 0.0

    def show_informations(self):
        """Print a human-readable report of all computed statistics."""
        print(self.dataset_name+': ')
        print(f'{self.count_words} {self.language} words.')
        print(f'{self.count_unique_words} unique {self.language} words.')
        print(f'{self.n_most_common} Most common words in the {self.dataset_name} :')
        print('"' + '" "'.join(self.most_common_words) + '"')
        print(f'{self.count_sequences} sequences in {self.dataset_name}')
        print(f'The longest sequence as a length of {self.max_sequences_length}.')
        print('The mean sequence length is {:.2f}.'.format(self.mean_sequences_length))
        print(f'The sequence with the most words as {self.max_words_in_sequence} words.')
        print('The mean word per sequence is {:.2f}.'.format(self.mean_words_in_sequence))
        print('\n')

    def histogram_sequences_length(self, output_path, bin_number=100, max_length=400):
        """Save a histogram of ``sequences_length`` to ``output_path``.

        Args:
            output_path: File path handed to ``savefig``.
            bin_number: Number of histogram bins.
            max_length: Upper bound of the histogram range; lengths above it
                are left out of the plot. Pass ``None`` to let matplotlib
                derive the range from the data.
        """
        value_range = None if max_length is None else (0, max_length)
        # Use a dedicated figure so any figure the caller has open is neither
        # drawn on nor closed.
        figure, axes = plt.subplots()
        axes.hist(self.sequences_length, bins=bin_number, range=value_range)
        axes.set_title(f"Sequences length of {self.dataset_name}")
        axes.set_xlabel("Sequence length")
        axes.set_ylabel("Frequency")
        figure.savefig(output_path)
        plt.close(figure)

### Tokenized aligned texts

In [46]:
# Load both sides of the tokenized aligned corpus, then rebuild one
# space-joined string per sentence from its token list.
en_token_file = read_token_file(file_name='../data/train.lang1')
fr_token_file = read_token_file(file_name='../data/train.lang2')
en_aligned_text_file = list(map(' '.join, en_token_file))
fr_aligned_text_file = list(map(' '.join, fr_token_file))

### Aligned texts information

In [176]:
# Compute and print the statistics of both aligned corpora.
en_aligned_info = corpus_information(
    corpus=en_aligned_text_file,
    language='English',
    dataset_name='English Aligned DataSet',
)
fr_aligned_info = corpus_information(
    corpus=fr_aligned_text_file,
    language='French',
    dataset_name='French Aligned DataSet',
)
en_aligned_info.show_informations()
fr_aligned_info.show_informations()

English Aligned DataSet: 
205374 English words.
13524 unique English words.
20 Most common words in the English Aligned DataSet :
"the" "of" "to" "and" "in" "a" "is" "that" "i" "it" "we" "this" "for" "on" "be" "'" "are" "not" "have" "s"
11000 sequences in English Aligned DataSet
The longest sequence as a length of 512.
The mean sequence length is 105.17.
The sequence with the most words as 96 words.
The mean word per sequence is 18.67.


French Aligned DataSet: 
260371 French words.
17960 unique French words.
20 Most common words in the French Aligned DataSet :
"." "'" "de" "," "la" "l" "et" "le" "à" "les" "des" "que" "d" "est" "en" "un" "une" "du" "pour" "a"
11000 sequences in French Aligned DataSet
The longest sequence as a length of 562.
The mean sequence length is 123.90.
The sequence with the most words as 120 words.
The mean word per sequence is 23.67.




### Save sequences length histograms

In [156]:
# Save one sequence-length histogram image per aligned corpus.
en_aligned_info.histogram_sequences_length(
    output_path='data_set_analysis/en_aligned_sequence_len_histogram.png',
)
fr_aligned_info.histogram_sequences_length(
    output_path='data_set_analysis/fr_aligned_sequence_len_histogram.png',
)

### Not tokenized and not aligned texts

In [5]:
# Load the raw unaligned corpora as lists of stripped lines.
en_text_file = read_text_file(file_name='../data/unaligned.en')
fr_text_file = read_text_file(file_name='../data/unaligned.fr')

### Unaligned text information

In [177]:
# Compute and print the statistics of both unaligned corpora.
en_unaligned_info = corpus_information(
    corpus=en_text_file,
    language='English',
    dataset_name='English Unaligned DataSet',
)
fr_unaligned_info = corpus_information(
    corpus=fr_text_file,
    language='French',
    dataset_name='French Unaligned DataSet',
)
en_unaligned_info.show_informations()
fr_unaligned_info.show_informations()

English Unaligned DataSet: 
9795581 English words.
67571 unique English words.
20 Most common words in the English Unaligned DataSet :
"the" "." "," "to" "of" "and" "in" "a" "is" "that" "I" "for" "'" "this" "be" "we" "on" "it" "are" "have"
474000 sequences in English Unaligned DataSet
The longest sequence as a length of 788.
The mean sequence length is 106.46.
The sequence with the most words as 156 words.
The mean word per sequence is 20.67.


French Unaligned DataSet: 
11158476 French words.
86040 unique French words.
20 Most common words in the French Unaligned DataSet :
"." "'" "de" "," "la" "l" "et" "le" "à" "les" "des" "que" "d" "est" "en" "un" "une" "du" "pour" "nous"
474000 sequences in French Unaligned DataSet
The longest sequence as a length of 969.
The mean sequence length is 119.91.
The sequence with the most words as 184 words.
The mean word per sequence is 23.54.




### Save sequences length histograms

In [158]:
# Save one sequence-length histogram image per unaligned corpus.
en_unaligned_info.histogram_sequences_length(
    output_path='data_set_analysis/en_unaligned_sequence_len_histogram.png',
)
fr_unaligned_info.histogram_sequences_length(
    output_path='data_set_analysis/fr_unaligned_sequence_len_histogram.png',
)