In [1]:
import pandas as pd
import numpy as np
from typing import Tuple, Dict

import re
import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

import pickle
import lightgbm as lgb

In [2]:
# TODO: move these helper functions into a Python module and import them here instead of defining them inline

def train_lgb(x_train, y_train, x_test, y_test, num_class=4, early_stopping_rounds=50):
    """Fit a multiclass LightGBM model with early stopping.

    Both the training split and the held-out split are passed as evaluation
    sets, so the metric is reported for each of them during training.

    Args:
        x_train: Feature matrix used for fitting.
        y_train: Integer class ids for ``x_train``.
        x_test: Feature matrix of the held-out split (used for early stopping).
        y_test: Integer class ids for ``x_test``.
        num_class: Number of target classes (defaults to the original 4).
        early_stopping_rounds: Number of rounds without improvement after
            which training stops (defaults to the original 50).

    Returns:
        The fitted LightGBM estimator.
    """
    eval_set = [(x_train, y_train), (x_test, y_test)]

    # NOTE(review): LGBMRegressor with objective='multiclass' is kept on purpose.
    # Callers in this notebook take np.argmax over each row of predict(), i.e.
    # they rely on predict() returning one score per class. LGBMClassifier would
    # be the conventional estimator, but its predict() returns labels, so
    # switching requires changing the callers as well.
    lgb_estimator = lgb.LGBMRegressor(objective='multiclass',
                                      num_class=num_class,
                                      metric='multi_logloss',
                                      min_data_in_leaf=5,
                                      n_estimators=1000,
                                      max_depth=9,
                                      num_leaves=2**9)

    # The `early_stopping_rounds` keyword of fit() was removed in LightGBM 4.0;
    # the callback form is accepted by both older and newer releases.
    lgb_estimator.fit(x_train, y_train,
                      eval_set=eval_set,
                      callbacks=[lgb.early_stopping(stopping_rounds=early_stopping_rounds)])
    return lgb_estimator


def stemming(text, stop_words=None):
    """Strip non-letter characters and stop words from a lower-cased text.

    Despite its name, this function does not stem words; it only cleans them.

    Args:
        text: Lower-cased input string. Upper-case letters are treated as
            non-letters and replaced by spaces, like digits and punctuation.
        stop_words: Optional collection of words to drop. When omitted, the
            NLTK German stop word list is loaded once and cached.

    Returns:
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        # Load the corpus once and reuse it; the original re-evaluated
        # stopwords.words('german') for every single token.
        cached = getattr(stemming, '_german_stop_words', None)
        if cached is None:
            cached = frozenset(stopwords.words('german'))
            stemming._german_stop_words = cached
        stop_words = cached

    # Keep German umlauts and ß: with a plain [^a-z] class a word such as
    # 'stück' is split into the fragments 'st' and 'ck'.
    # NOTE(review): this changes the tokens for texts containing umlauts, so a
    # previously pickled TF-IDF model should be refitted.
    text_letters_only = re.sub('[^a-zäöüß]', ' ', text)

    kept = [word for word in text_letters_only.split() if word not in stop_words]
    return ' '.join(kept)


def save_model(model, path):
    """Pickle ``model`` to the file at ``path``.

    Args:
        model: Any picklable object.
        path: Destination file path; an existing file is overwritten.
    """
    # Use a context manager so the handle is flushed and closed even if
    # pickling fails (the original left the file object open).
    with open(path, 'wb') as fh:
        pickle.dump(model, fh)


def load_models(path):
    """Load the pickled models and the class-id mapping from a directory.

    Expects ``tfidf_model.csv`` and ``lgb_model.csv`` (both pickle files,
    despite the extension) and ``class_info.csv`` (a CSV with the columns
    ``productgroup_id`` and ``productgroup``) inside ``path``.

    Args:
        path: Directory containing the three files.

    Returns:
        Tuple ``(tfidf_model, lgb_model, clases_dict)`` where ``clases_dict``
        maps each ``productgroup_id`` to its ``productgroup`` name.
    """
    # SECURITY: pickle.load executes arbitrary code from the file; only load
    # artifacts from a trusted location.
    with open(f'{path}/tfidf_model.csv', 'rb') as fh:
        tfidf_model = pickle.load(fh)
    with open(f'{path}/lgb_model.csv', 'rb') as fh:
        lgb_model = pickle.load(fh)

    df = pd.read_csv(f'{path}/class_info.csv')
    # Later rows win on duplicate ids, same as the previous row-by-row build.
    clases_dict = dict(zip(df['productgroup_id'], df['productgroup']))
    return tfidf_model, lgb_model, clases_dict

In [3]:
# fetch the NLTK stop word corpus that stemming() and the TF-IDF vectorizer read
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/olga.sisyuk/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
# load the raw product data (semicolon-separated file)
df = pd.read_csv('MLE_Task/testset_C.csv', sep=';')

In [7]:
# preview the first rows to see which columns are available
df.head()

Unnamed: 0,id,productgroup,main_text,add_text,manufacturer
0,26229701,WASHINGMACHINES,WAQ284E25,WASCHMASCHINEN,BOSCH
1,16576864,USB MEMORY,LEEF IBRIDGE MOBILE SPEICHERERWEITERUNG FUER I...,PC__1100COMPUTINGMEMORY__1110MEMORYCARDS,LEEF
2,26155618,USB MEMORY,SANDISK 32GB ULTRA FIT USB 3.0,W1370,
3,25646138,BICYCLES,HOLLANDRAD DAMEN 28 ZOLL TUSSAUD 3-GAENGE RH 5...,FAHRRAEDER // SPORTFAHRRAEDER,SCHALOW & KROH GMBH
4,19764614,BICYCLES,DAHON SPEED D7 SCHWARZ ? FALTRAD,SPORTS__30000WHEELED__30070BIKES,DAHON


In [8]:
# count missing values per column (the target must not contain any)
df.isna().sum()

id                 0
productgroup       0
main_text          2
add_text           0
manufacturer    1344
dtype: int64

In [9]:
# at this stage it is fine to replace NaN with an empty string:
# the affected columns are free text that gets concatenated later
df=df.fillna('')

In [10]:
# verify that no missing values are left after the fill
df.isna().sum()

id              0
productgroup    0
main_text       0
add_text        0
manufacturer    0
dtype: int64

In [11]:
# check the target distribution (number of rows per product group)
df['productgroup'].value_counts()

CONTACT LENSES     2000
BICYCLES           2000
USB MEMORY         2000
WASHINGMACHINES    2000
Name: productgroup, dtype: int64

In [12]:
# check what the main text looks like and how often values repeat
df['main_text'].value_counts()

WASCHVOLLAUTOMAT OMV510A+ OMV510A+|| EEK:A+, 1000 U/MIN, 5KG KAPAZITAET  5043487    85
PROCLEAR TORIC6 STÜCKUNISEX                                                         75
AIR OPTIX FOR ASTIGM.6 STÜCKUNISEX                                                  74
BIOFINITY TORIC 6ER BOX6 STÜCKUNISEX                                                71
SOFLENS TORIC6 STÜCKUNISEX                                                          68
                                                                                    ..
WAW284DE                                                                             1
CANDY GO W 496 D WASCHTROCKNER 9/6 KG CANDYGOW496D                                   1
AEG WASCHMASCHINE L71360TL                                                           1
BOCAS TRK300 DAMEN SCHWARZ MATT 45 CM                                                1
BEKO WASCHMASCHINE WML 15106 MNE+, A+, 5 KG, 1000 U/MIN 289248                       1
Name: main_text, Length: 6644, dtype: int64

In [13]:
# lower-case the target and every text column
for column_name in ('productgroup', 'main_text', 'add_text', 'manufacturer'):
    df[column_name] = df[column_name].str.lower()

In [None]:
# combine all available text and clean it with stemming()
# (removes non-alphabetic characters and stop words)
# apply(..., axis=1) keeps df's own index, so the result stays aligned even if
# df is not indexed 0..n-1 (wrapping a list in a fresh pd.Series would not).
text_columns = ['main_text', 'add_text', 'manufacturer']
df['combined'] = df[text_columns].apply(' '.join, axis=1).apply(stemming)

In [None]:
# encode the target as integer ids, numbered in order of first appearance
target_codes, _ = pd.factorize(df['productgroup'])
df['productgroup_id'] = target_codes

In [None]:
# fit TF-IDF on uni- and bigrams to turn the cleaned text into numeric vectors
tfidf = TfidfVectorizer(ngram_range=(1, 2), stop_words=stopwords.words('german'))
# Keep the sparse matrix returned by fit_transform: converting the 1-2-gram
# vocabulary to a dense array needs rows x vocabulary floats of memory, while
# train_test_split and LightGBM both accept scipy sparse input.
features = tfidf.fit_transform(df.combined)
labels = df['productgroup_id']


In [None]:
# hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Train lgb model
# NOTE: train_lgb uses (X_test, y_test) as an evaluation set for early stopping,
# so the test accuracy reported below is not from fully unseen data.
lgb_model = train_lgb(X_train, y_train, X_test, y_test)

In [None]:
# Predict on both splits. The cells below take argmax over each output row,
# i.e. they treat a row as one score per class (presumably class
# probabilities; TODO confirm against the installed LightGBM version).
pred_train = lgb_model.predict(X_train)
pred_test = lgb_model.predict(X_test)

In [None]:
# show which numeric id was assigned to which product group
df[['productgroup_id','productgroup']].drop_duplicates()

In [None]:
# build a frame with the per-class predictions for manual inspection
# Derive the column names from the same factorize() call that produced
# productgroup_id, so column i always carries the name of class id i instead
# of relying on a hand-maintained list in the right order.
class_names = list(df['productgroup'].factorize()[1])
df_test = pd.DataFrame(pred_test, columns=class_names)
df_test['ground_truth'] = y_test.values

In [None]:
# the predicted class is the position of the highest score in each row
best_class_per_row = list(map(np.argmax, pred_test))
pred_one_test = pd.Series(best_class_per_row)
df_test['prediction'] = pred_one_test

In [None]:
# flag the rows where the predicted class equals the actual class
df_test['matched'] = df_test['prediction'].eq(df_test['ground_truth'])

In [None]:
# join back the text features of the test rows
# df_test was built from a bare prediction array and is indexed 0..n-1, while
# y_test still carries the original row labels of df after the shuffled split.
# Joining df directly on the index would attach the first n rows of df rather
# than the rows that were actually predicted, so select the test rows by label
# and re-index them by position before joining.
test_rows = df.loc[y_test.index].reset_index(drop=True)
df_test = df_test.join(test_rows)

In [None]:
# inspect a sample of the correctly classified test rows
df_test[df_test['matched']].head(20)

In [None]:
# report accuracy on both splits; the predicted class is the argmax of each row
pred_one_train = list(map(np.argmax, pred_train))
pred_one_test = list(map(np.argmax, pred_test))

train_accuracy = accuracy_score(y_train, pred_one_train)
test_accuracy = accuracy_score(y_test, pred_one_test)

print(f'Training data accuracy: {train_accuracy}')
print(f'Test data accuracy {test_accuracy}')
