In [1]:
import csv
import functools
import gzip
import numpy as np
import pandas as pd
import torch
import tempfile

from gensim.models import KeyedVectors
from gensim.parsing import preprocessing
from gensim.scripts.glove2word2vec import glove2word2vec
from torch.utils.data import Dataset, DataLoader, IterableDataset

In [11]:
# Location of the gzip-compressed Spanish training split of the MeLi challenge.
path = 'data/meli-challenge-2019/spanish.train.csv.gz'

# Load the whole split into memory; the compression scheme is inferred
# from the file extension (this is also pandas' default behaviour).
ds = pd.read_csv(path, compression='infer')

In [12]:
# Preview the first five rows of the freshly loaded frame.
ds.head(n=5)

Unnamed: 0,language,label_quality,title,category
0,spanish,reliable,Bateria Completa 5 Cuerpos Excelente,DRUMS
1,spanish,reliable,Cuaderno Anotador Espiral Ben 10 3d Original ...,NOTEBOOKS_AND_WRITING_PADS
2,spanish,reliable,Fifa18 Ps4 Disco Fisico,VIDEO_GAMES
3,spanish,reliable,Botines Futbol adidas Messi 15.4 Cesped Hombre,FOOTBALL_SHOES
4,spanish,reliable,Chops Sublimados - Nagual,DRINKING_GLASSES


In [22]:
# Import the encoder class directly. The previous form
# (`from sklearn import preprocessing as le`) bound sklearn's module to `le`,
# so `preprocessing` still referred to `gensim.parsing.preprocessing`
# (imported in the first cell), which has no `LabelEncoder`.
from sklearn.preprocessing import LabelEncoder

# Map every category name to an integer id; `le.classes_` keeps the
# id -> name lookup table for decoding predictions later.
le = LabelEncoder()
ds['category_encoded'] = le.fit_transform(ds['category'])

In [23]:
# Preview the first five rows again, now including `category_encoded`.
ds.head(n=5)

Unnamed: 0,language,label_quality,title,category,category_encoded
0,spanish,reliable,Bateria Completa 5 Cuerpos Excelente,DRUMS,196
1,spanish,reliable,Cuaderno Anotador Espiral Ben 10 3d Original ...,NOTEBOOKS_AND_WRITING_PADS,422
2,spanish,reliable,Fifa18 Ps4 Disco Fisico,VIDEO_GAMES,604
3,spanish,reliable,Botines Futbol adidas Messi 15.4 Cesped Hombre,FOOTBALL_SHOES,262
4,spanish,reliable,Chops Sublimados - Nagual,DRINKING_GLASSES,194


In [19]:
# Show the category names learned by the encoder `le` (its `classes_` attribute).
le.classes_

array(['3D_PRINTERS', 'ACCORDIONS', 'ACOUSTIC_GUITARS', 'ACTION_FIGURES',
       'ADHESIVE_TAPES', 'AEROBIC_CRUNCH_MACHINES', 'AIRBRUSHES',
       'AIRSOFT_GUNS', 'AIR_COMPRESSORS', 'AIR_CONDITIONERS',
       'AIR_MATTRESSES', 'ALARMS_AND_SENSORS', 'ALL_IN_ONE',
       'ALTERNATOR_PULLEYS', 'AM_FM_RADIOS', 'ANALOG_CAMERAS',
       'ANIMAL_CLIPPERS', 'ANTIVIRUS_AND_INTERNET_SECURITY',
       'AQUARIUM_FILTERS', 'ARCHERY_BOWS', 'AUDIO_AMPLIFIERS',
       'AUDIO_INTERFACES', 'AUTOMOBILE_FENDER_LINERS',
       'AUTOMOTIVE_AIR_FILTERS', 'AUTOMOTIVE_AMPLIFIERS',
       'AUTOMOTIVE_CLUTCH_KITS', 'AUTOMOTIVE_DOORS', 'AUTOMOTIVE_FENDERS',
       'AUTOMOTIVE_FRONT_BUMPERS', 'AUTOMOTIVE_HEADLIGHTS',
       'AUTOMOTIVE_OIL_FILTERS', 'AUTOMOTIVE_REAR_BODY_TAIL_LIGHT_PANELS',
       'AUTOMOTIVE_SEATS', 'AUTOMOTIVE_SHOCK_ABSORBERS',
       'AUTOMOTIVE_SIDE_VIEW_MIRRORS', 'AUTOMOTIVE_SPRING_SUSPENSIONS',
       'AUTOMOTIVE_STEERING_WHEELS', 'AUTOMOTIVE_TIRES',
       'AUTOMOTIVE_WATER_PUMPS', 'AV_RECE

In [26]:
class MeliDataset(Dataset):
    """Map-style dataset over a MeLi challenge CSV file.

    The CSV must contain a ``title`` column (the input text) and a
    ``category`` column (the label name). Each label is encoded as an
    integer id stored in the ``category_encoded`` column.

    Attributes:
        dataset: the loaded ``DataFrame`` plus the ``category_encoded`` column.
        classes_: sorted unique category names; ``classes_[i]`` is the name
            of the category encoded as ``i``.
        transform: optional callable applied to every sample dict.
    """

    def __init__(self, path, transform=None):
        """Load the CSV at ``path`` and encode its categories.

        Args:
            path: path (or buffer) accepted by ``pandas.read_csv``.
            transform: optional callable that receives the sample dict
                ``{"data": ..., "target": ...}`` and returns the
                transformed sample.
        """
        self.dataset = pd.read_csv(path)
        # Encode the labels of *this* dataset (not of a notebook-level frame)
        # so the class works on a fresh kernel and with any CSV.
        # `sort=True` assigns ids following the sorted order of the category
        # names. Missing categories, if any, are encoded as -1.
        codes, classes = pd.factorize(self.dataset["category"], sort=True)
        self.dataset["category_encoded"] = codes
        self.classes_ = classes
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.dataset)

    def __getitem__(self, item):
        """Return the sample(s) at position ``item``.

        Args:
            item: an integer position, a list of positions, or a tensor
                holding either of them.

        Returns:
            A dict with keys ``"data"`` (title) and ``"target"`` (encoded
            category), or whatever ``transform`` returns for that dict.
        """
        if torch.is_tensor(item):
            item = item.tolist()  # Deal with list of items instead of tensor

        # Index each column directly: this avoids materialising a full
        # mixed-dtype row twice per access.
        sample = {
            "data": self.dataset["title"].iloc[item],
            "target": self.dataset["category_encoded"].iloc[item],
        }

        if self.transform:
            sample = self.transform(sample)

        return sample

# Build the dataset from the training CSV and report its size.
dataset = MeliDataset(path)
n_elements = len(dataset)
print(f"Dataset loaded with {n_elements} elements")

# Show the first sample as a sanity check of the item format.
first_sample = dataset[0]
print(f"Sample element:\n{first_sample}")

Dataset loaded with 6119100 elements
Sample element:
{'data': 'Bateria Completa 5 Cuerpos Excelente ', 'target': 196}


In [None]:
class TextPreprocess:
    """Callable transform that cleans the text of a dataset sample.

    It receives a sample dict ``{"data": ..., "target": ...}`` holding
    either a single element or a batch (sequence) of elements, and returns
    a dict with the same keys where every text went through the configured
    gensim filters.
    """

    def __init__(self, filters=None):
        """Store the filter pipeline.

        Args:
            filters: list of callables ``str -> str`` applied in order.
                When empty or ``None`` a default pipeline is used.
        """
        if filters:
            self.filters = filters
        else:
            # NOTE(review): gensim's `remove_stopwords` presumably relies on
            # its built-in (English) stopword list -- confirm that this is
            # what is wanted for Spanish titles.
            self.filters = [
                lambda s: s.lower(),
                preprocessing.strip_tags,
                preprocessing.strip_punctuation,
                preprocessing.strip_multiple_whitespaces,
                preprocessing.strip_numeric,
                preprocessing.remove_stopwords,
                preprocessing.strip_short,
            ]

    def _preprocess_string(self, string):
        """Run ``string`` through gensim's ``preprocess_string`` with ``self.filters``."""
        return preprocessing.preprocess_string(string, filters=self.filters)

    def _encode_target(self, target):
        """Encode a single target.

        String labels keep the binary encoding (``"positive"`` -> 1, any
        other string -> 0). Non-string targets are returned unchanged:
        they are assumed to be already-encoded labels (e.g. an integer
        category id), so they must not be collapsed to 0.
        """
        if isinstance(target, str):
            return 1 if target == "positive" else 0
        return target

    def __call__(self, item):
        """Preprocess one sample or a batch of samples.

        Args:
            item: dict with keys ``"data"`` (a string or a sequence of
                strings) and ``"target"`` (a single label or a sequence of
                labels).

        Returns:
            A new dict with the preprocessed ``"data"`` and the encoded
            ``"target"``, mirroring the single/batch shape of the input.
        """
        data = item["data"]
        if isinstance(data, str):
            data = self._preprocess_string(data)
        else:
            data = [self._preprocess_string(d) for d in data]

        target = item["target"]
        # A scalar target (string label, Python/numpy integer, 0-d tensor)
        # is not iterable, so it must be handled as one element rather
        # than as a batch.
        if isinstance(target, str) or np.ndim(target) == 0:
            target = self._encode_target(target)
        else:
            target = [self._encode_target(t) for t in target]

        return {
            "data": data,
            "target": target
        }

# Instantiate the transform with its default filter pipeline and try it
# on the first sample of the dataset.
preprocess = TextPreprocess()
raw_sample = dataset[0]
print(preprocess(raw_sample))