In [1]:
import numpy as np 
import pandas as pd
import gc

# feature libraries
from scipy.sparse import hstack, csr_matrix
from sklearn import preprocessing
import os, re, regex, string, codecs
import multiprocessing as mp
from keras.preprocessing.text import Tokenizer


Using TensorFlow backend.


In [2]:
data_dir = '../input/kaggle_data'
text_dir = '../input/text'

In [4]:
# input/output ---------------
def load_text(f):
    text_cols = ['category_name', 'param_1', 'param_2', 'param_3', 'title', 'description']
    df = pd.read_csv(f, usecols=text_cols, encoding='utf-8-sig').fillna('nan')
    return df[text_cols].apply(lambda x: ''.join(x), axis=1).tolist()

def save_text(file_name, text_list):
    save_name = f'{text_dir}/{file_name}.txt'
    file = codecs.open(save_name, 'w', 'utf-8-sig')
    for i in text_list: file.write(f'{i}\n')
    file.close()
    return 

# text processing ---------------
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('russian'))
def remove_stopwords(text):
    return ' '.join([w for w in text.split() if not w in STOPWORDS])

    
SPLIT_PATTERN = re.compile(u' |\n')        
def clean_text(text):
    text = bytes(text, encoding='utf-8')
    text = text.replace(b'\n', b' ')
    text = text.replace(b'\t', b' ')
    text = text.replace(b'\b', b' ')
    text = text.replace(b'\r', b' ')
    text = regex.sub(b'\s+', b' ', text)
    text = str(text, 'utf-8')
    return ' '.join(re.split(SPLIT_PATTERN, text.strip()))

# keep only alphanumeric and punctuation
REGEX_0 = regex.compile(r'\W+\P+')
def clean_level_0(text):
    text = text.lower()
    text = REGEX_0.sub(' ', text)
    return clean_text(text)

# remove punctuation and stopwords
REGEX_1 = regex.compile(r'[\W_]+')
def clean_level_1(text):
    text = text.lower()
    text = REGEX_1.sub(' ', text)
    text = remove_stopwords(text)
    return clean_text(text)    

# keep only letters
REGEX_2 = regex.compile(r'\d+')
def clean_level_2(text):
    text = text.lower()
    text = REGEX_1.sub(' ', text)
    text = REGEX_2.sub(' ', text)
    text = remove_stopwords(text)
    return clean_text(text)    

# tokenize words
def tokenize(text_list):
    pattern = re.compile(u' |\n')
    text = {w for text in text_list for w in re.split(pattern, text.strip())}
    
    word_tokenizer = Tokenizer(filters='', lower=False)
    word_tokenizer.fit_on_texts(text)
    return list(word_tokenizer.word_index.keys())

print(clean_level_0('\n678[]{}-\r=_+() \nнийотстегивающийся «зубр более больше   a а'))
print(clean_level_1('\n678[]{}-\r=_+() \nнийотстегивающийся «зубр более больше   a а'))
print(clean_level_2('\n678[]{}-\r=_+() \nнийотстегивающийся «зубр более больше   a а'))

678[]{}- =_+() нийотстегивающийся «зубр более больше a а
678 нийотстегивающийся зубр a
нийотстегивающийся зубр a


In [None]:
# load data
test = load_text(f'{data_dir}/test.csv')
train = load_text(f'{data_dir}/train.csv')
test_active = load_text(f'{data_dir}/test_active.csv')
train_active = load_text(f'{data_dir}/train_active.csv')

# setup multiprocessing
run_parallel = False
if run_parallel: 
    p = mp.Pool(12)

cleaners = [clean_level_0, clean_level_1, clean_level_2]
for i in range(3):
        
    print(i)
    
    # clean text
    if run_parallel:      
        test_cleaned = p.map(cleaners[i], test)
        train_cleaned = p.map(cleaners[i], train)
        test_active_cleaned = p.map(cleaners[i], test_active)
        train_active_cleaned = p.map(cleaners[i], train_active)
    else:
        cleaner = cleaners[i]
        test_cleaned = [cleaner(x) for x in test]
        train_cleaned = [cleaner(x) for x in train]
        test_active_cleaned = [cleaner(x) for x in test_active]
        train_active_cleaned = [cleaner(x) for x in train_active]

    # combine
    full_cleaned = train_cleaned+test_cleaned+train_active_cleaned+test_active_cleaned
    save_text(f'text_{i}', full_cleaned)
    save_text(f'test_text_{i}', test_cleaned)
    save_text(f'train_text_{i}', train_cleaned)
    del full_cleaned, test_active_cleaned, train_active_cleaned
    
    # tokenize
    tokens = ['token']+tokenize(test_cleaned+train_cleaned)
    save_text(f'tokens_{i}', tokens)
    del test_cleaned, train_cleaned

0
