# Data analysis

In [56]:
import collections
import re
import subprocess

import numpy as np
import pandas as pd
import torch
import tqdm

## Functions to read and write text/tokens

In [2]:
def get_stream_size(stream):
    """Count the lines of a seekable stream and rewind it.

    Args:
        stream: An iterable, seekable stream (e.g. an open text file).

    Returns:
        The number of items produced by iterating over ``stream``.
        The stream is repositioned at offset 0 before returning.
    """
    line_count = 0
    for _ in stream:
        line_count += 1
    # Rewind so the caller can iterate over the stream again from the start.
    stream.seek(0)
    return line_count

def read_token_file(file_name: str, encoding: str = 'utf-8') -> list:
    """Read a whitespace-tokenized text file into a list of token lists.

    Args:
        file_name: Path of the file to read.
        encoding: Text encoding used to decode the file. Given explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        One list of tokens per line, in file order. A blank line yields an
        empty list.
    """
    with open(file_name, 'r', encoding=encoding) as stream:
        # str.split() without arguments already ignores leading and trailing
        # whitespace, so no separate strip() is needed.
        return [line.split() for line in stream]

def read_text_file(file_name: str, encoding: str = 'utf-8') -> list:
    """Read a text file into a list of stripped lines.

    Args:
        file_name: Path of the file to read.
        encoding: Text encoding used to decode the file. Given explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        One string per line, in file order, with leading and trailing
        whitespace (including the newline) removed.
    """
    with open(file_name, 'r', encoding=encoding) as stream:
        return [line.strip() for line in stream]

def write_text_from_tokens(tokens, output_file, encoding='utf-8'):
    """Write tokenized sentences to a text file, one sentence per line.

    Args:
        tokens: Iterable of sentences, each an iterable of string tokens.
        output_file: Path of the file to create (an existing file is
            overwritten).
        encoding: Text encoding used to write the file.
    """
    # Plain write mode is enough: the file is never read back here.
    with open(output_file, 'w', encoding=encoding) as out_stream:
        out_stream.writelines(' '.join(sentence) + '\n' for sentence in tokens)
    

### Corpus Information Functions

In [100]:
class corpus_information():
    """Compute and display summary statistics for a text corpus.

    All statistics are computed once, at construction time, and stored as
    attributes:

    * ``counter``: ``collections.Counter`` of the tokens found in the corpus.
    * ``count_words``: total number of tokens.
    * ``count_unique_words``: number of distinct tokens.
    * ``most_common_words``: tuple of the ``n_most_common`` most frequent tokens.
    * ``count_sequences``: number of sentences in the corpus.
    * ``max_sequences_length`` / ``mean_sequences_length``: longest and mean
      sentence length. Sentences are strings, so these lengths are measured
      in characters, not in tokens.

    Args:
        corpus: Sized iterable of sentences, each a string.
        language: Language name, used only in the printed report.
        dataset_name: Dataset name, used only in the printed report.
        n_most_common: How many of the most frequent tokens to keep.
        remove_punctuation: If true, only runs of word characters are counted;
            otherwise apostrophes are kept inside words and punctuation marks
            are counted as tokens of their own.
    """

    # Words (apostrophes kept) or a single punctuation mark.
    _WORDS_AND_PUNCTUATION = r"[\w']+|[.,!?;:'()\[\]{}\"]"
    # Runs of word characters only.
    _WORDS_ONLY = r'\w+'

    def __init__(self, corpus, language, dataset_name, n_most_common=20, remove_punctuation=False):
        self.corpus = corpus
        self.language = language
        self.dataset_name = dataset_name
        self.n_most_common = n_most_common
        self.remove_punctuation = remove_punctuation
        # The counter must be built first: the word statistics derive from it.
        self.counter = self._counter_corpus()
        self.count_words = self._count_words()
        self.count_unique_words = self._count_unique_words()
        self.most_common_words = self._most_common_words()
        self.count_sequences = self._count_sequences()
        self.max_sequences_length = self._max_sequences_length()
        self.mean_sequences_length = self._mean_sequences_length()

    def _token_pattern(self):
        """Return the compiled regex matching one token for the current settings."""
        if self.remove_punctuation:
            return re.compile(self._WORDS_ONLY)
        return re.compile(self._WORDS_AND_PUNCTUATION)

    def _sequence_lengths(self):
        """Return the length (in characters) of every sentence of the corpus."""
        return [len(sentence) for sentence in self.corpus]

    def _counter_corpus(self):
        """Count every token of the corpus."""
        find_tokens = self._token_pattern().findall
        counter = collections.Counter()
        for sentence in self.corpus:
            counter.update(find_tokens(sentence))
        return counter

    def _count_words(self):
        """Return the total number of tokens in the corpus."""
        # The counter already holds every token occurrence, so the corpus does
        # not need to be tokenized a second time.
        return sum(self.counter.values())

    def _count_unique_words(self):
        """Return the number of distinct tokens in the corpus."""
        return len(self.counter)

    def _most_common_words(self):
        """Return the most frequent tokens as a tuple (empty if there are none)."""
        return tuple(word for word, _ in self.counter.most_common(self.n_most_common))

    def _count_sequences(self):
        """Return the number of sentences in the corpus."""
        return len(self.corpus)

    def _max_sequences_length(self):
        """Return the length of the longest sentence, or 0 for an empty corpus."""
        lengths = self._sequence_lengths()
        if not lengths:
            return 0
        return np.max(lengths)

    def _mean_sequences_length(self):
        """Return the mean sentence length, or 0.0 for an empty corpus."""
        lengths = self._sequence_lengths()
        if not lengths:
            return 0.0
        return np.mean(lengths)

    def show_informations(self):
        """Print a human-readable report of the corpus statistics."""
        print(self.dataset_name+': ')
        print(f'{self.count_words} {self.language} words.')
        print(f'{self.count_unique_words} unique {self.language} words.')
        print(f'{self.n_most_common} Most common words in the {self.dataset_name} :')
        print('"' + '" "'.join(self.most_common_words) + '"')
        print(f'{self.count_sequences} sequences in {self.dataset_name}')
        print(f'The longest sequence as a length of {self.max_sequences_length}.')
        print('The mean sequence length is {:.2f}.'.format(self.mean_sequences_length))
        print('\n')
        
        
    
    

### Tokenized aligned texts

In [46]:
# Load both sides of the tokenized training data (one token list per line).
en_token_file, fr_token_file = (
    read_token_file(path)
    for path in ('../data/train.lang1', '../data/train.lang2')
)
# Re-join each token list into a single space-separated sentence string.
en_aligned_text_file = list(map(' '.join, en_token_file))
fr_aligned_text_file = list(map(' '.join, fr_token_file))

### Aligned texts information

In [101]:
# Compute the statistics of each side of the aligned data, then print both reports.
en_aligned_info = corpus_information(
    corpus=en_aligned_text_file,
    language='English',
    dataset_name='Aligned English DataSet',
)
fr_aligned_info = corpus_information(
    corpus=fr_aligned_text_file,
    language='French',
    dataset_name='Aligned French DataSet',
)
en_aligned_info.show_informations()
fr_aligned_info.show_informations()

Aligned English DataSet: 
203498 English words.
13530 unique English words.
20 Most common words in the Aligned English DataSet :
"the" "of" "to" "and" "in" "a" "is" "that" "i" "it" "we" "this" "for" "on" "be" "are" "not" "have" "you" "with"
11000 sequences in Aligned English DataSet
The longest sequence as a length of 512.
The mean sequence length is 105.17.


Aligned French DataSet: 
249567 French words.
17988 unique French words.
20 Most common words in the Aligned French DataSet :
"." "de" "," "la" "et" "le" "à" "les" "des" "l'" "que" "est" "en" "d'" "un" "une" "du" "pour" "a" "qui"
11000 sequences in Aligned French DataSet
The longest sequence as a length of 562.
The mean sequence length is 123.90.




### Not tokenized and not aligned texts

In [5]:
# Load the raw (untokenized) monolingual text, one stripped line per entry.
en_text_file, fr_text_file = (
    read_text_file(path)
    for path in ('../data/unaligned.en', '../data/unaligned.fr')
)

In [95]:
# Compute the statistics of each unaligned dataset, then print both reports.
en_unaligned_info = corpus_information(
    corpus=en_text_file,
    language='English',
    dataset_name='Unaligned English DataSet',
)
fr_unaligned_info = corpus_information(
    corpus=fr_text_file,
    language='French',
    dataset_name='Unaligned French DataSet',
)
en_unaligned_info.show_informations()
fr_unaligned_info.show_informations()

Unaligned English DataSet: 
9648962 English words.
74560 unique English words.
20 Most common words in the Unaligned English DataSet :
"the" "." "," "to" "of" "and" "in" "a" "is" "that" "I" "for" "this" "be" "on" "we" "it" "are" "have" "not"
474000 sequences in Unaligned English DataSet
The longest sequence as a length of 788.
The mean sequence length is 106.46.


Unaligned French DataSet: 
10244614 French words.
97985 unique French words.
20 Most common words in the Unaligned French DataSet :
"." "de" "," "la" "et" "le" "à" "les" "des" "que" "en" "est" "du" "un" "une" "pour" "nous" "dans" "pas" "qui"
474000 sequences in Unaligned French DataSet
The longest sequence as a length of 969.
The mean sequence length is 119.91.


