In [15]:
from collections import Counter
import nltk.tokenize
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
import pandas as pd
import re
import string

In [45]:
def countInFile(filename, output):
    """Count word occurrences in a text file and save the most frequent ones.

    Each line of `filename` is tokenized with `word_tokenize` after dropping its
    first character and its last five characters (`line[1:-5]`). Only tokens
    containing at least one ASCII letter are counted. The 1000 most common
    tokens are then written to `output`, one per line, formatted as
    ``word<TAB>rank<TAB>frequency`` with ranks starting at 1.

    Args:
        filename: path of the text file to read.
        output: path of the frequency file to write (opened in "w" mode).

    Returns:
        The `Counter` mapping every counted token to its number of occurrences.
    """
    counter = Counter()
    # `with` guarantees the file is closed even if tokenization raises.
    with open(filename) as source:
        for line_number, line in enumerate(source):
            # NOTE(review): the [1:-5] slice presumably strips a leading quote and a
            # fixed-width trailing field of each row -- confirm against the data.
            for token in word_tokenize(line[1:-5]):
                if re.search("[a-zA-Z]", token):
                    counter[token] += 1
            if line_number % 100 == 0:
                print("Read file. Line: {:,d}".format(line_number), end="\r", flush=True)
    with open(output, "w") as target:
        for rank, (word, freq) in enumerate(counter.most_common(1000), start=1):
            target.write("{}\t{}\t{}\n".format(word, rank, freq))
            if rank % 50 == 0:
                print("Write file. Line: {}/1000".format(rank), end="\r", flush=True)
    # The original docstring promised a Counter but nothing was returned.
    return counter

In [31]:
# Count the words in train_posts.csv and write the 1000 most frequent ones to freq.txt.
c = countInFile("train_posts.csv", "freq.txt")

Write file. Line: 1000/1000

----------------

In [2]:
def test_repetition(token):
    """ this function tests if a token contain a character that is repeted 3 times or more.
        if no it returns False
        if yes it return a list of tuples of substring of the original token and if it was repeated
        For example, for the token "tomorrrroooooow" it will returns : 
            [('tomo', False), ('r', True), ('o', True), ('w', False)]
    """
    
    parsed = []
    i = 0
    tmp = ''
    contains_repetiton = False
    while i < len(token):
        if i+1 < len(token) and token[i] == token[i+1] and i+2 < len(token) and token[i] == token[i+2]:
            contains_repetiton = True
            if tmp :
                parsed.append((tmp,False))
            c = token[i]
            i = i+3
            while i < len(token) and token[i] == c:
                i += 1
            parsed.append((c,True))
            tmp = ''
        else:
            tmp += token[i]
            i += 1
    if tmp :
        parsed.append((tmp, False))
    if contains_repetiton :
        return parsed
    else:
        return False

In [3]:
# Sanity check: the runs "rrrr" and "ooooo" (3+ identical letters) must be flagged True.
test_repetition("tomorrrroooooow")

[('tomo', False), ('r', True), ('o', True), ('w', False)]

In [4]:
def create_word(parsed, string = ''):
    """Build every candidate spelling of a token parsed by `test_repetition`.

    Each piece flagged as repeated is written either once or twice; the other
    pieces are copied unchanged. For example, "tomorrrroooooow" is parsed into
        [('tomo', False), ('r', True), ('o', True), ('w', False)]
    and this function returns
        ['tomorow', 'tomoroow', 'tomorrow', 'tomorroow']

    Args:
        parsed: list of (substring, is_repeated) tuples, or a falsy value
            (e.g. the False returned by `test_repetition`). It is not modified.
        string: prefix prepended to every generated word.

    Returns:
        The list of candidate words. Earlier repeated pieces vary slowest, and
        the single-letter variant comes before the doubled one. A falsy
        `parsed` yields ``[string]``.
    """
    if not parsed:
        return [string]
    words = [string]
    # Iterate instead of popping: the previous version removed the first
    # element of the caller's list and copied the list at every recursion level.
    for piece, is_repeated in parsed:
        variants = (piece, 2 * piece) if is_repeated else (piece,)
        words = [prefix + variant for prefix in words for variant in variants]
    return words

In [5]:
# Expand the parsed token into every spelling where each flagged run is written once or twice.
create_word(test_repetition("tomorrrroooooow"))

['tomorow', 'tomoroow', 'tomorrow', 'tomorroow']

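Shell command used to build the word list `../list_words_en.txt` from the CSV files (it removes carriage returns, keeps only the lines that contain no space, then sorts and de-duplicates them):

`cat *.csv | sed 's/ \r//' | tr -d '\r' | grep "^[^ ]*$" | sort | uniq > ../list_words_en.txt`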

In [8]:
# Load the list of English words containing double letters into a set (one word per line).
# `with` closes the file even if reading fails.
with open("ressources/list_words_en_2.txt") as fp:
    # rstrip("\n") rather than line[:-1]: if the last line has no trailing newline,
    # slicing would cut the final letter of the last word.
    dict_en = {line.rstrip("\n") for line in fp}

In [13]:
# Load the sample sentences; header=None means the first row is data, and the
# text is read from column 0 by the normalization cell below.
test = pd.read_csv("sent.1000", header=None)

In [97]:
# Patterns used by the normalization pass, compiled once. Raw strings are used so
# that "\." reaches the regex engine as written instead of being an invalid
# string escape sequence.
_PUNCTUATION = frozenset(string.punctuation)
_LAUGH_RE = re.compile(r"(a+h+){2,}|(h+a+){2,}|^aha$")
_DOUBLE_DOT_RE = re.compile(r"([a-z]+\.\.[a-z]|\.\.[a-z]|[a-z]+\.\.)")
_URL_RE = re.compile(r"[a-zA-Z0-9]\.(com|net|org)(/.*)?")
_DOTTED_WORDS_RE = re.compile(r"([a-z]{2,}\.[a-z]{2,})+")
_LOWERCASE_RE = re.compile(r"[a-z]")
_TRAILING_SLASH_RE = re.compile(r"/$")


def _merge_repeated_punctuation(tokens):
    """Return a new list where runs of one identical punctuation token are merged.

    nltk.word_tokenize splits a string like "!!!" into ['!', '!', '!']; this
    turns it back into the single token '!!!'.
    """
    merged = []
    start = 0
    while start < len(tokens):
        token = tokens[start]
        stop = start + 1
        # Set membership: only single punctuation characters qualify (a test
        # against the string.punctuation str would also accept substrings).
        if token in _PUNCTUATION:
            while stop < len(tokens) and tokens[stop] == token:
                stop += 1
        merged.append(token * (stop - start))
        start = stop
    return merged


def _replace_with_pieces(tokens, index, pieces):
    """Replace tokens[index] in place by the non-empty `pieces`, in order."""
    pieces = [piece for piece in pieces if piece]
    if pieces:  # never leave the slot empty
        tokens[index:index + 1] = pieces


def _normalize_tokens(tokens):
    """Apply the normalization rules to every token of `tokens`, in place.

    Rules, applied in this order to each token: laughs -> "AHAH", split on
    "..", URLs -> "urllink", split dotted words on ".", and reduce letters
    repeated 3+ times using `dict_en` to choose the spelling.
    """
    i = 0
    # Index-based loop: splitting a token inserts new tokens right after it, and
    # those must be visited (and normalized) in turn.
    while i < len(tokens):
        if _LAUGH_RE.search(tokens[i]):  # normalizing the laughs into "AHAH"
            tokens[i] = "AHAH"
        if _DOUBLE_DOT_RE.search(tokens[i]):
            # The ".." separators are dropped; empty pieces are discarded so no
            # empty token (double space in the output) is produced.
            _replace_with_pieces(tokens, i, tokens[i].split(".."))
        if _URL_RE.search(tokens[i]):
            tokens[i] = 'urllink'
        if _DOTTED_WORDS_RE.search(tokens[i]):
            _replace_with_pieces(tokens, i, tokens[i].split('.'))
        # Test the current value of tokens[i] (it may have been rewritten above),
        # and parse it only once.
        if _LOWERCASE_RE.search(tokens[i]):
            parsed = test_repetition(tokens[i])
            if parsed:
                # Clean words containing a letter repeated 3 times or more: keep
                # the first candidate found in the dictionary, otherwise fall
                # back to the spelling with every run reduced to one letter.
                words = create_word(parsed)
                tokens[i] = next((word for word in words[1:] if word in dict_en), words[0])
        if _TRAILING_SLASH_RE.search(tokens[i]):
            # Diagnostic output: tokens that still end with "/" after normalization.
            print(tokens[i])
        i += 1
    return tokens


# Normalize every sentence and write one space-joined line per sentence.
# `with` closes the output file even if a line fails to process.
with open("test1000.txt", "w") as out:
    for line in test[0].values:
        # Tokenize with nltk.word_tokenize, then merge the punctuation it split apart.
        tokens = _merge_repeated_punctuation(word_tokenize(line))
        _normalize_tokens(tokens)
        out.write(' '.join(tokens) + '\n')

/
/
/
w/
w/
night/
cages/
masks/
goggles/
morning/
play/
passing/
sprinting/
throwing/
pillow/
beating/
him/
seat/
story/
father/
somebody/
nobody/
him/
destiny/
\m/


In [84]:
# Check the dotted-word pattern on a token with several dots. Only the first two
# words match: each repetition of the group needs its own leading word, so
# ".feinin" alone cannot start a new repetition.
# A raw string is used so "\." is not an invalid string escape sequence.
re.search(r"([a-z]+\.[a-z]+)+", "empty.hungry.feinin")

<re.Match object; span=(0, 12), match='empty.hungry'>

In [93]:
# Check the URL pattern: it matches from the character just before ".com"
# through the end of the path, which is enough to flag the token as a URL.
# A raw string is used so "\." is not an invalid string escape sequence.
re.search(r"[a-zA-Z0-9]\.(com|net|org)(/.*)?", "www.livejournal.com/cielelric")

<re.Match object; span=(14, 29), match='l.com/cielelric'>