In [1]:
import pandas as pd
from collections import Counter
import nltk

In [178]:
_DATA_FILEPATH = '../data/dataninja2019_ads_train.csv'
_POLISH_STOPWORDS_FILEPATH = "../src/polish_stopwords.txt"
_TOKENIZER_FILEPATH = '../src/dataNinjaTokenizer.txt'

In [3]:
# Load the ads CSV into a DataFrame using pandas' default parsing options.
# NOTE(review): the preview rendered for this frame shows an "Unnamed: 0" column, which
# suggests the CSV carries a saved index; `index_col=0` would absorb it — confirm before changing.
df = pd.read_csv(_DATA_FILEPATH)

In [4]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0,id,title,description,category_id,region_id,subregion_id,city_id,district_id,params,created_at,labels
0,1432081133,GLIKOL Płyn Chłodniczy Koncentrat Centra. Ogrz...,Nawiąże współpracę z odbiorcami płynów eksploa...,168,13,88,1104129800,,,2012-05-20 23:26:34,glikol_koncentrat wspolpraca koncentrat_do_chl...
1,671872279,Instalacje Gazowe/Przyłącza Gazowe/Sieci gazow...,BARTES świadczy usługi w obejmujące budowę pro...,306,7,136,284937915,1285174000.0,,2012-06-28 09:00:20,kuchnia_gazowa gazowe przylacza_gazowe przylac...
2,778099925,Elegancki kuferek drewniany + 5 palet na 2 zł ...,Kuferek drewniany z paletami na 200 monet 2 zł...,431,11,260,405646303,67644020.0,price<=>price<br>price<=>120,2012-07-09 07:40:37,palety_drewniane
3,1610095002,"Remont łazienki, remont mieszkania, domów, lok...","DROBNE I KOMPLEKSOWE REMONTY MIESZKAŃ , BIUR ,...",306,6,48,689879597,898740100.0,,2012-06-27 03:55:33,remont_mieszkania remont_lazienki budowa_domow
4,1884757810,"Karcher Profesjonalne Pranie i czyszczenie, oz...","SAMOCHODY (osobowe, dostawcze, ciężarowe, przy...",354,1,259,1797088492,,,2012-07-04 14:43:56,karcher ozonowanie czyszczenie


In [228]:
class DataNinjaTokenizer:
    """Word-level vocabulary built from pandas text columns.

    Text is tokenized with ``nltk.word_tokenize`` (``language="polish"``); only
    purely alphabetic tokens are kept, lower-cased, and filtered against the
    class-level ``stopwords`` list.

    Exactly one input source must be given:

    * ``series`` -- a single pandas Series of strings, or
    * ``data_frame`` together with ``headers`` -- the column names whose text
      is tokenized and concatenated in the order given.

    Exposed read-only properties:

    * ``vocabulary`` -- list of all kept tokens, duplicates included.
    * ``vocabulary_frequency`` -- ``collections.Counter`` of those tokens.
    * ``vocabulary_ints`` -- dict mapping each distinct token to an integer id.

    Raises:
        AssertionError: if neither or both of ``series`` / ``data_frame`` are
            given, or if ``data_frame`` is given without ``headers``.
    """

    # Stop words are loaded once, when the class body executes. The ``with``
    # block closes the file handle instead of leaving it to the garbage collector.
    # NOTE(review): the file is read with the platform default encoding; pass an
    # explicit ``encoding=`` once the file's encoding is confirmed.
    with open(_POLISH_STOPWORDS_FILEPATH) as _stopwords_file:
        stopwords = _stopwords_file.read().split("\n")
    del _stopwords_file  # keep the closed handle out of the class namespace

    def __init__(self, series=None, data_frame=None, headers=None):
        """Build the vocabulary, its frequency table and its integer ids."""
        assert not (series is None and data_frame is None), "Either series or data_frame must be provided"
        # Previously, passing both silently discarded the series vocabulary.
        assert series is None or data_frame is None, "Cannot parse both series and data_frame"
        if series is not None:
            self.__vocabulary = self.parse_series_into_words(series)
        else:
            assert headers is not None, "Headers are required to tokenize data_frame"
            self.__vocabulary = []
            for header in headers:
                self.__vocabulary += self.parse_series_into_words(data_frame[header])
        self.prepare_vocabulary_frequency()
        self.prepare_vocabulary_ints()

    def parse_series_into_words(self, series):
        """Return the lower-cased alphabetic, non-stop-word tokens of ``series``.

        All entries of ``series`` are joined with single spaces and tokenized
        in one pass; token order follows the order of the joined text.
        """
        # Build the lookup set once per call: constant-time membership tests
        # while still honouring whatever ``self.stopwords`` currently holds.
        stopword_set = set(self.stopwords)
        text = series.str.cat(sep=' ')
        tokens = nltk.word_tokenize(text, language="polish")
        lowered = (token.lower() for token in tokens if token.isalpha())
        return [word for word in lowered if word not in stopword_set]

    def prepare_vocabulary_frequency(self):
        """Count how often each token occurs in the vocabulary."""
        assert self.__vocabulary is not None, "Vocabulary cannot be none"
        self.__counts = Counter(self.__vocabulary)

    def prepare_vocabulary_ints(self):
        """Assign every distinct token an integer id in ``0 .. n_distinct - 1``.

        Ids follow the sorted order of the distinct tokens. Iterating a bare
        ``set`` of strings (the previous approach) depends on string hashing,
        which can differ between interpreter runs, so the same data could
        produce a different word -> id mapping each time.
        """
        distinct_words = sorted(set(self.__vocabulary))
        self.__vocabulary_ints = {word: index for index, word in enumerate(distinct_words)}

    @property
    def vocabulary_frequency(self):
        """``Counter`` mapping each token to its number of occurrences."""
        return self.__counts

    @property
    def vocabulary(self):
        """List of all kept tokens, duplicates included."""
        return self.__vocabulary

    @property
    def vocabulary_ints(self):
        """Dict mapping each distinct token to its integer id."""
        return self.__vocabulary_ints

In [231]:
# Build a tokenizer from the first 100 entries of the `title` column only.
temp = DataNinjaTokenizer(series=df['title'][:100])

In [234]:
# Build a tokenizer over the `title` and `description` columns of the whole frame.
# This rebinds `temp`, replacing any tokenizer previously stored under that name.
temp = DataNinjaTokenizer(data_frame=df, headers=['title', 'description'])

In [233]:
# Show the 15 most frequent words with their counts.
# NOTE(review): the saved output tops out at a count of 8, which looks like a tokenizer
# built from 100 titles rather than from the full title+description columns; the execution
# counters (In [233] here vs In [234] for the full build) point the same way — re-run to confirm.
temp.vocabulary_frequency.most_common(15)

[('city', 8),
 ('nowy', 5),
 ('usługi', 3),
 ('drewno', 3),
 ('spodnie', 3),
 ('wykończenia', 3),
 ('wnętrz', 3),
 ('nowe', 3),
 ('nowa', 3),
 ('pokoje', 3),
 ('chłodnica', 2),
 ('instalacje', 2),
 ('remont', 2),
 ('łazienki', 2),
 ('profesjonalne', 2)]

In [226]:
# Number of distinct words that received an integer id.
# NOTE(review): the saved value here (423432) disagrees with the 146835 recorded for the
# same expression further down the notebook; one of them is presumably stale — re-run to confirm.
len(temp.vocabulary_ints)

423432

In [222]:
# Total number of kept tokens (after the alphabetic and stop-word filters), duplicates included.
len(temp.vocabulary)

2246506

In [190]:
import pickle

In [None]:
# Persist the tokenizer with pickle. The `with` block closes the handle as soon as the
# dump finishes, so buffered bytes are flushed to disk before the file is read back.
with open(_TOKENIZER_FILEPATH, 'wb') as filehandler:
    pickle.dump(temp, filehandler)

In [218]:
# Restore the pickled tokenizer into `temp1`; the handle is closed when the block exits.
# The DataNinjaTokenizer class must already be defined in the kernel, because pickle
# stores a reference to the class rather than its code.
# NOTE: pickle.load can execute arbitrary code — only load files produced by this notebook.
with open(_TOKENIZER_FILEPATH, 'rb') as filehandler:
    temp1 = pickle.load(filehandler)

In [220]:
# Number of distinct words with an integer id.
# NOTE(review): this inspects `temp`, not the `temp1` object loaded from the pickle —
# confirm which one was meant to be checked after the round trip.
len(temp.vocabulary_ints)

146835

In [223]:
# Total number of kept tokens, duplicates included.
# NOTE(review): this inspects `temp`, not the `temp1` object loaded from the pickle —
# confirm which one was meant to be checked after the round trip.
len(temp.vocabulary)

2246506