In [1]:
import gc

import pandas as pd
import plotly_express as px
from gensim import corpora
from gensim import matutils
from joblib import dump, load
from nltk import FreqDist
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from stop_words import get_stop_words

## Read data files

In [2]:
# Location of the training set, relative to the notebook's working directory.
file = 'data/train.csv'
# Load the whole CSV into a DataFrame.
data_train = pd.read_csv(file)

In [3]:
# Show the first rows to check which columns were loaded.
data_train.head()

Unnamed: 0,title,label_quality,language,category
0,Hidrolavadora Lavor One 120 Bar 1700w Bomba A...,unreliable,spanish,ELECTRIC_PRESSURE_WASHERS
1,Placa De Sonido - Behringer Umc22,unreliable,spanish,SOUND_CARDS
2,Maquina De Lavar Electrolux 12 Kilos,unreliable,portuguese,WASHING_MACHINES
3,Par Disco De Freio Diant Vent Gol 8v 08/ Frema...,unreliable,portuguese,VEHICLE_BRAKE_DISCS
4,Flashes Led Pestañas Luminoso Falso Pestañas P...,unreliable,spanish,FALSE_EYELASHES


In [4]:
# data_train.describe().iloc[0:2]

## Preprocessing

In [5]:
# Keep only the rows whose language is Portuguese, then release the full
# training frame, which is not referenced again below.
is_portuguese = data_train['language'].eq('portuguese')
data_pt = data_train[is_portuguese]
del is_portuguese, data_train

In [6]:
# `DataFrame.drop` returns a new frame; the result has to be assigned back or the
# column is never removed. `errors='ignore'` keeps the cell re-runnable once the
# column is already gone.
# `keep=False` discards every title that occurs more than once (all copies).
data_pt = (
    data_pt
    .drop(columns='label_quality', errors='ignore')
    .drop_duplicates(subset="title", keep=False)
)

In [7]:
# Normalise the titles:
#   1. replace every character that is not an ASCII letter or '#' with a SPACE
#      (replacing with "" would also delete the spaces and fuse each title into a
#      single token, so the later split() could no longer separate words);
#   2. remove each '#' together with the letter that follows it;
#   3. lower-case;
#   4. drop tokens of one or two characters.
# `regex=True` is explicit because recent pandas versions treat the pattern as a
# literal string by default.
# NOTE(review): the ASCII-only class also strips accented letters (e.g. "ç", "ã"),
# which splits Portuguese words at the accent - confirm this is intended.
data_pt['title'] = (
    data_pt['title']
    .str.replace(r"[^a-zA-Z#]", " ", regex=True)
    .str.replace(r"#[a-zA-Z]", "", regex=True)
    .str.lower()
    .apply(lambda x: ' '.join(w for w in x.split() if len(w) > 2))
)

In [8]:
# Separate the labels from the text. Both columns are read from the frame before
# `data_pt` is rebound to the title column alone.
category_pt, data_pt = data_pt['category'], data_pt['title']

In [9]:
# Persist the labels on their own, without index or header row.
category_pt.to_csv('category_pt.csv', header=False, index=False)

In [10]:
def remove_stopwords(rev, stop_words):
    """Return the tokens of `rev` that are not stop words, joined by single spaces.

    Parameters
    ----------
    rev : iterable of str
        Tokens of one document, in order.
    stop_words : container of str
        Words to discard; only membership (`in`) is used.

    Returns
    -------
    str
        The surviving tokens in their original order, or "" if none survive.
    """
    kept = []
    for token in rev:
        if token in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

stopwords = get_stop_words('portuguese')
# The membership test runs once per token over the whole corpus. A frozenset makes
# each lookup O(1) instead of searching whatever container get_stop_words returns,
# and gives the same result.
stopword_set = frozenset(stopwords)
# Each title is split on whitespace, filtered, and re-joined into one string.
data_pt = [remove_stopwords(r.split(), stopword_set) for r in data_pt]

In [11]:
def freq_words(x, terms = 40):
    """Show a bar chart of the most frequent words across the texts in `x`.

    Parameters
    ----------
    x : iterable of str
        Texts to count; they are joined with spaces and split on whitespace.
    terms : int, default 40
        How many of the highest-count words to plot.

    Returns
    -------
    None
        The figure is displayed with `fig.show()`; nothing is returned.
    """
    # Pool every text into one token list.
    tokens = ' '.join(x).split()
    counts = FreqDist(tokens)

    words_df = pd.DataFrame({'word': list(counts.keys()), 'count': list(counts.values())})

    # Keep the `terms` rows with the largest counts and plot them.
    top_words = words_df.nlargest(n=terms, columns='count')
    px.bar(top_words, x="word", y="count").show()

In [12]:
# freq_words(data_pt)

In [13]:
# Turn each cleaned title (a string) into its list of whitespace-separated tokens.
data_pt = pd.Series(data_pt).map(str.split)

`filter_extremes` removes all tokens in the dictionary that appear in:

1. Fewer than `no_below` documents (an absolute number, e.g. 5), or
2. More than `no_above` documents (a fraction of the total corpus size, e.g. 0.3).

After (1) and (2), it keeps only the `keep_n` most frequent tokens (or all of them if `keep_n=None`).

In [14]:
# Build the gensim dictionary from the tokenised titles.
dictionary = corpora.Dictionary(data_pt)
# Drop tokens that appear in fewer than 10 documents or in more than 80% of them.
dictionary.filter_extremes(no_below=10, no_above=0.8)
# Persist the filtered dictionary as text under the name 'dictionary'.
dictionary.save_as_text('dictionary')
#del dictionary

In [None]:
# data_pt.to_csv(r'data_pt.csv', index=False, header=False)
# del data_pt





In [16]:
# Convert every tokenised title to its bag-of-words form using the filtered dictionary.
doc_term_matrix = list(map(dictionary.doc2bow, data_pt))

In [None]:
# `doc_term_matrix` holds gensim bag-of-words documents (variable-length lists of
# (token_id, count) pairs), which scikit-learn cannot consume directly.
# `corpus2csc` builds a sparse terms x documents matrix; transpose it to the
# documents x terms layout that TfidfTransformer expects. `num_terms` pins the
# column count to the dictionary size even if the highest ids never occur.
bow_matrix = matutils.corpus2csc(doc_term_matrix, num_terms=len(dictionary)).T
xTrain = TfidfTransformer().fit_transform(bow_matrix)

In [None]:
# The labels were saved to 'category_pt.csv' without a header row, so read that
# file name and name the single column explicitly; otherwise the first label is
# consumed as the header.
loaded_category = pd.read_csv('category_pt.csv', header=None, names=['category'])

# Train on the category labels themselves, not on the frame's row index.
labels = loaded_category['category'].values
assert len(labels) == xTrain.shape[0], "labels and feature rows are misaligned"

clf = MultinomialNB().fit(xTrain, labels)
# Persist the fitted classifier for later use.
dump(clf, 'model.joblib')