In [1]:
from utils import load_data

In [2]:
# Load the data
# NOTE(review): load_data comes from the project-local utils module; judging by
# the Series reprs further down it presumably returns a pandas DataFrame — confirm.
df = load_data("./data/train.txt")
# X is the "text" column; the cells below iterate over it one document at a time.
X = df["text"]

# Caractères
On cherche à analyser les caractères utilisés pour voir si on ne peut pas nettoyer le texte en le débarrassant de caractères inutiles.

In [3]:
from collections import Counter

In [4]:
# Character frequencies over the whole corpus. A generator expression feeds
# Counter directly, so no temporary list with one entry per character is built.
counter = Counter(c for doc in X for c in doc)
counter

Counter({' ': 3452797,
         'I': 62484,
         'T': 36018,
         'h': 695791,
         'e': 1814180,
         'i': 937505,
         'm': 343676,
         'p': 327313,
         'o': 1128626,
         'r': 771923,
         't': 1334123,
         'a': 1041637,
         'n': 984804,
         'c': 431518,
         'd': 472773,
         'u': 416545,
         'l': 570986,
         'y': 324570,
         'f': 283569,
         'v': 166801,
         'g': 268564,
         's': 866695,
         ',': 127352,
         'E': 6669,
         'b': 196444,
         'k': 106117,
         'w': 258693,
         '.': 158991,
         'A': 18244,
         'W': 8631,
         'S': 14003,
         'F': 13330,
         'x': 28415,
         'z': 9470,
         'j': 33452,
         '-': 4735,
         'U': 2180,
         'M': 6256,
         'B': 9426,
         'O': 7328,
         'q': 8209,
         'N': 5123,
         "'": 25603,
         'D': 2466,
         'L': 3176,
         'Y': 3990,
         '/': 112

On n'a pas de caractères bizarres : presque tout est ASCII (à de rares exceptions près, comme `´` rencontré plus bas). Par contre, l'utilisation de certains caractères peut être abusive, notamment en ce qui concerne les caractères spéciaux.

In [5]:
# Search for documents containing special characters
def find_example(char: str, window: int = 16, index: bool = False, docs=None):
    """Collect a context snippet around every occurrence of ``char``.

    Args:
        char: Character to look for. The comparison is done one character at
            a time, so only a single-character string can ever match.
        window: Number of characters kept on each side of the match.
        index: When True, each result is an ``(idx, snippet)`` tuple.
            ``idx`` is the position of the document in the iteration order
            (from ``enumerate``), not necessarily its label in a pandas index.
        docs: Iterable of document strings to scan. When omitted, the
            notebook-level ``X`` is used, so existing calls behave as before.

    Returns:
        A list with one entry per occurrence, in document order and then in
        order of appearance inside each document.
    """
    if docs is None:
        docs = X  # fall back to the corpus loaded earlier in the notebook
    res = []
    for idx, doc in enumerate(docs):
        for i, c in enumerate(doc):
            if c != char:
                continue
            # Clamp the left bound: a negative start would be interpreted as
            # an offset from the end of the string.
            ext = doc[max(0, i - window):i + window + 1]
            res.append((idx, ext) if index else ext)
    return res

In [6]:
find_example("'", index=True)[3]

(6, " the statement '' Young people no")

In [7]:
find_example('+')[0]

"ave learned ` 1 + 1 = 2 ' . we kn"

In [8]:
find_example('*')[3]

'. Conclusion : ************** I d'

In [9]:
find_example('=')[0]

"learned ` 1 + 1 = 2 ' . we know n"

In [10]:
find_example('#')[0]

' are as below : # Ideas and conce'

In [11]:
find_example('~')[0:3]

['pproximately 20 ~ 30 % of their b',
 ', If you are 50 ~ 60 years old yo',
 '< < < < < __ -- ~ ~ -- __The End ']

In [12]:
find_example('|')[0]

' later life you | ll not be able '

In [13]:
find_example('[')[0]

'be the best age [ use of young pe'

In [14]:
find_example('{')[0]

' do not pollute { like sun , wind'

In [15]:
find_example('_', index=True)[0:3]

[(1762, 'icles and roller_skates to go to '),
 (2397, ' not during week_end and also the'),
 (3065, '< < < < < < < < __ -- ~ ~ -- __Th')]

In [16]:
X[3065][-75:]

' < < < < < < < < < __ -- ~ ~ -- __The End __ -- ~ ~ -- __ > > > > > > > > >'

In [17]:
X[4770][:100]

" `` TRY AND TRY UNTILL U SUCCED '' ____________________________________________________ - i agree wi"

In [18]:
find_example('\\')[0]

"have adapt to t \\ people 's expec"

In [19]:
find_example('`')[3]

' a saying that `` teach him ` how'

In [20]:
find_example('´')

['g something won ´ t be useless bu']

In [21]:
find_example('^')[0]

'dfather sayed : ^ Give a smile an'

In [22]:
find_example('%')[0]

'ical records.70 % of the people r'

In [23]:
import re

def correct_special_chars(doc: str) -> str:
    """Normalise brackets and strip special characters from a document.

    Opening square/curly brackets are turned into "(", closing ones into ")",
    and the characters plus, asterisk, equals, hash, tilde, caret, pipe,
    underscore, backslash, apostrophe, backtick and acute accent are removed.
    """
    # Ordered (pattern, replacement) pairs, applied one after the other.
    substitutions = (
        (r"[\[{]", "("),              # punctuation harmonisation: openers
        (r"[\]}]", ")"),              # punctuation harmonisation: closers
        (r"[+*=#~^|_\\'`´]", ""),     # special characters are dropped
    )
    cleaned = doc
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

In [24]:
df["text"].apply(correct_special_chars)

0        IThe importance and popularity of travelling ...
1        It is an important decision , how to plan you...
2        Some people believe that young people can enj...
3        Travelling is usually considered as good recr...
4        i agree that . Life is a person live period o...
                              ...                        
9895     Nowadays , more and more people go abroad , n...
9896     In accomplishing something that is risky come...
9897     At the beginning of the 21st century , the in...
9898     The number of cars in use across the world ha...
9899     Many people think it is betters to have borad...
Name: text, Length: 9900, dtype: object

# Tokenization des nombres

In [25]:
# Number of documents containing at least one digit.
# re.search with a raw pattern is used so that a digit located after a line
# break is found too: an anchored ".*" prefix cannot cross "\n".
count = sum(1 for doc in X if re.search(r"\d", doc) is not None)
count

1873

Il y a `1873/9900 ≈ 19 %` de documents contenant des nombres. Ne pourrait-on pas tokeniser les nombres, c'est-à-dire remplacer chaque nombre par un même token ?

In [26]:
def tokenize_numbers(doc: str) -> str:
    """Replace every run of digits in ``doc`` with the placeholder ``#NUMBER``.

    Only the digits themselves are replaced; adjacent characters are kept, so
    "21st" becomes "#NUMBERst".

    NOTE(review): the placeholder starts with "#", which correct_special_chars
    in this notebook deletes — if both are used, apply the cleaning first.
    """
    # Raw string: "\d" inside a plain literal is an invalid escape sequence.
    return re.sub(r"\d+", "#NUMBER", doc)

In [27]:
df["text"].apply(tokenize_numbers)

0        IThe importance and popularity of travelling ...
1        It is an important decision , how to plan you...
2        Some people believe that young people can enj...
3        Travelling is usually considered as good recr...
4        i agree that . Life is a person live period o...
                              ...                        
9895     Nowadays , more and more people go abroad , n...
9896     In accomplishing something that is risky come...
9897     At the beginning of the #NUMBERst century , t...
9898     The number of cars in use across the world ha...
9899     Many people think it is betters to have borad...
Name: text, Length: 9900, dtype: object