In [497]:
import pandas as pd
import numpy as np

# from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

In [471]:
# Snapshot the installed package versions into requirements.txt.
# NOTE(review): `!pip3` may resolve to a different interpreter than the kernel's;
# `%pip` targets the kernel environment — confirm which one this file should describe.
!pip3 freeze > requirements.txt

In [494]:
# Load the labelled product records; the file is semicolon-delimited.
df = pd.read_csv("../testset_C.csv", sep=";")

In [502]:
# Display the loaded frame to inspect its columns and a sample of rows.
df

Unnamed: 0,id,productgroup,main_text,add_text,manufacturer
0,26229701,WASHINGMACHINES,WAQ284E25,WASCHMASCHINEN,BOSCH
1,16576864,USB MEMORY,LEEF IBRIDGE MOBILE SPEICHERERWEITERUNG FUER I...,PC__1100COMPUTINGMEMORY__1110MEMORYCARDS,LEEF
2,26155618,USB MEMORY,SANDISK 32GB ULTRA FIT USB 3.0,W1370,
3,25646138,BICYCLES,HOLLANDRAD DAMEN 28 ZOLL TUSSAUD 3-GAENGE RH 5...,FAHRRAEDER // SPORTFAHRRAEDER,SCHALOW & KROH GMBH
4,19764614,BICYCLES,DAHON SPEED D7 SCHWARZ ? FALTRAD,SPORTS__30000WHEELED__30070BIKES,DAHON
...,...,...,...,...,...
7995,61028716,BICYCLES,TOPSY KINDERRAD 12 1/2 POLARWEISS O O O 419640,H006W0792344__WERKZEUG_AUTO/FAHRRAD_FAHRRAEDER,SI-ZWEIRAD-VERTRIEBS GMBH
7996,37734138,BICYCLES,CREME ECHO SOLO 16-SPEED WHITE,FAHRRAEDER>>RENNRAEDER>>RENNRAEDER,CREME
7997,17891755,CONTACT LENSES,"ACUVUE 1-DAY MOIST TAGESLINSEN WEICH, 30 STUEC...",HEALTH&PERSONALCARE__3100OPTICS__3101SPHERICCO...,JOHNSON & JOHNSON
7998,42298563,BICYCLES,UNIVEGA TERRENO 1.0 HE MATTBLAUGRAU 45 CM,1_7_4,UNIVEGA


In [410]:
# Share of rows per product group (checks that the classes are balanced).
df["productgroup"].value_counts(normalize=True)

WASHINGMACHINES    0.25
USB MEMORY         0.25
BICYCLES           0.25
CONTACT LENSES     0.25
Name: productgroup, dtype: float64

In [411]:
# Absolute number of rows per product group.
df["productgroup"].value_counts()

WASHINGMACHINES    2000
USB MEMORY         2000
BICYCLES           2000
CONTACT LENSES     2000
Name: productgroup, dtype: int64

In [412]:
# Distinct ids vs. total rows: equal numbers mean every id occurs exactly once.
df["id"].nunique(dropna=False), df.shape[0]

(8000, 8000)

In [413]:
# Lowercase every free-text column so keyword matching is case-insensitive.
for text_column in ("main_text", "add_text", "manufacturer"):
    df[text_column] = df[text_column].str.lower()

# Rows without a manufacturer get an explicit placeholder instead of NaN.
df["manufacturer"] = df["manufacturer"].fillna("unknown")

In [414]:
# Build one searchable string per row. Missing main_text / add_text are treated as
# empty strings: string concatenation with NaN yields NaN, so without this a single
# missing field would discard the row's remaining text.
df["text"] = (
    df["main_text"].fillna("")
    + " "
    + df["add_text"].fillna("")
    + " "
    + df["manufacturer"]
)
# Placeholder for rows whose text is still missing (i.e. manufacturer itself is NaN).
df["text"] = df["text"].fillna("missed")

## Create binary features from category keywords

In [415]:
# Flag rows whose text mentions any USB-storage keyword (1 = at least one match).
df["is_usb"] = (
    pd.concat(
        [
            df["text"].str.contains(keyword)
            for keyword in ("usb", "memory", "datentraeger", "speicher", "storage")
        ],
        axis=1,
    )
    .any(axis=1)
    .astype("int")
)

In [416]:
# How many rows overall carry the USB flag.
df["is_usb"].value_counts()

0    6076
1    1924
Name: is_usb, dtype: int64

In [417]:
# Flag distribution among true USB MEMORY rows (zeros are rows the keywords miss).
df.loc[df["productgroup"] == "USB MEMORY", "is_usb"].value_counts()

1    1919
0      81
Name: is_usb, dtype: int64

In [418]:
# Flag rows whose text mentions washing in German ("wasch") or English ("wash").
# Each keyword is checked once; the repeated checks were redundant.
df["is_washingmachine"] = (
    df["text"].str.contains("wasch")
    | df["text"].str.contains("wash")
).astype("int")

In [419]:
# How many rows overall carry the washing-machine flag.
df["is_washingmachine"].value_counts()

0    6300
1    1700
Name: is_washingmachine, dtype: int64

In [420]:
# Flag distribution among true WASHINGMACHINES rows (zeros are rows the keywords miss).
df.loc[df["productgroup"] == "WASHINGMACHINES", "is_washingmachine"].value_counts()

1    1700
0     300
Name: is_washingmachine, dtype: int64

In [421]:
# Flag rows whose text mentions any contact-lens keyword (1 = at least one match).
df["is_lenses"] = (
    pd.concat(
        [
            df["text"].str.contains(keyword)
            for keyword in (
                "linse",
                "lenses",
                "acuvue",
                "acumed",
                "contact",
                "dailies",
                "optix",
                "medic",
                "myday",
                "vision",
            )
        ],
        axis=1,
    )
    .any(axis=1)
    .astype("int")
)

In [422]:
# How many rows overall carry the contact-lens flag.
df["is_lenses"].value_counts()

0    6135
1    1865
Name: is_lenses, dtype: int64

In [423]:
# Flag distribution among true CONTACT LENSES rows (zeros are rows the keywords miss).
df.loc[df["productgroup"] == "CONTACT LENSES", "is_lenses"].value_counts()

1    1860
0     140
Name: is_lenses, dtype: int64

In [424]:
# Flag rows whose text mentions any bicycle keyword (1 = at least one match).
df["is_bicycle"] = (
    pd.concat(
        [
            df["text"].str.contains(keyword)
            for keyword in (
                "bike",
                "bicycle",
                "fahrraeder",
                "fahrrad",
                "freizeit",
                "sport",
                "berg",
                "city",
                "cross",
            )
        ],
        axis=1,
    )
    .any(axis=1)
    .astype("int")
)

In [425]:
# How many rows overall carry the bicycle flag.
df["is_bicycle"].value_counts()

0    6496
1    1504
Name: is_bicycle, dtype: int64

In [426]:
# Flag distribution among true BICYCLES rows (zeros are rows the keywords miss).
df.loc[df["productgroup"] == "BICYCLES", "is_bicycle"].value_counts()

1    1485
0     515
Name: is_bicycle, dtype: int64

In [466]:
# Literal substrings (matched against the lowercased text) that flag each product group.
# Insertion order defines the order in which the feature columns are added.
_KEYWORDS_BY_FEATURE = {
    "is_usb": ("usb", "memory", "datentraeger", "speicher", "storage"),
    "is_washingmachine": ("wasch", "wash"),
    "is_lenses": (
        "linse", "lenses", "acuvue", "acumed", "contact",
        "dailies", "optix", "medic", "myday", "vision",
    ),
    "is_bicycle": (
        "bike", "bicycle", "fahrraeder", "fahrrad", "freizeit",
        "sport", "berg", "city", "cross",
    ),
}

# Integer class id assigned to each product group label.
_PRODUCTGROUP_IDS = {
    "WASHINGMACHINES": 0,
    "USB MEMORY": 1,
    "BICYCLES": 2,
    "CONTACT LENSES": 3,
}


def _contains_any(text: pd.Series, keywords) -> pd.Series:
    """Return a boolean mask that is True where ``text`` contains any of ``keywords``."""
    hits = pd.Series(False, index=text.index)
    for keyword in keywords:
        # regex=False: keywords are plain substrings, not patterns.
        hits = hits | text.str.contains(keyword, regex=False)
    return hits


def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Normalise the text columns and derive the keyword features.

    The input frame is left untouched; all changes are made on a copy.

    Steps:
      1. Lowercase ``main_text``, ``add_text`` and ``manufacturer``; a missing
         manufacturer becomes ``"unknown"``.
      2. Join the three columns into ``text``. Missing ``main_text`` /
         ``add_text`` count as empty strings, so one missing field does not
         discard the rest of the row's text.
      3. Add one 0/1 column per entry of ``_KEYWORDS_BY_FEATURE``, set to 1
         when ``text`` contains any of that entry's keywords.
      4. If a ``productgroup`` column is present, replace its known labels with
         the ids from ``_PRODUCTGROUP_IDS``; unknown labels pass through
         unchanged. Frames without the column (unlabelled data) are accepted.

    Args:
        df: Frame with ``main_text``, ``add_text`` and ``manufacturer`` string
            columns and, optionally, ``productgroup``.

    Returns:
        A new DataFrame with the normalised columns, ``text``, the four
        ``is_*`` feature columns and (when present) the encoded ``productgroup``.

    Raises:
        KeyError: If one of the three text columns is missing.
    """
    out = df.copy()

    for column in ("main_text", "add_text", "manufacturer"):
        out[column] = out[column].str.lower()
    out["manufacturer"] = out["manufacturer"].fillna("unknown")

    # Concatenating with NaN yields NaN, so fill the optional parts first.
    out["text"] = (
        out["main_text"].fillna("")
        + " "
        + out["add_text"].fillna("")
        + " "
        + out["manufacturer"]
    )

    for feature, keywords in _KEYWORDS_BY_FEATURE.items():
        out[feature] = _contains_any(out["text"], keywords).astype("int")

    if "productgroup" in out.columns:
        # Look each label up explicitly; unknown labels are kept as they are.
        out["productgroup"] = out["productgroup"].map(
            lambda label: _PRODUCTGROUP_IDS.get(label, label)
        )

    return out

In [467]:
# Start again from the raw file and run the full preprocessing in one step.
df = preprocess(pd.read_csv("../testset_C.csv", sep=";"))

In [469]:
# Model inputs: the four binary keyword flags.
features = [
    "is_usb",
    "is_washingmachine",
    "is_lenses",
    "is_bicycle",
]
# Column holding the class label to predict.
target = "productgroup"

In [451]:
# Split the frame into the feature matrix and the label vector.
X, y = df[features], df[target]

In [475]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=66,
)

In [501]:
# Write the held-out labels to disk, without the index column.
y_test.to_csv("testset_labels.csv", index=False)

In [476]:
# Per-class row counts in the training and test splits.
y_train.value_counts(), y_test.value_counts()

(2    1411
 3    1409
 0    1393
 1    1387
 Name: productgroup, dtype: int64,
 1    613
 0    607
 3    591
 2    589
 Name: productgroup, dtype: int64)

In [454]:
import xgboost as xgb

In [455]:
# XGBoost settings: softmax objective for multiclass classification,
# with one class per product group.
params = dict(
    objective="multi:softmax",
    num_class=4,
)

In [478]:
# Train the XGBoost classifier on the training split.
model = xgb.XGBClassifier(**params)
model.fit(X_train, y_train)

In [482]:
# Predict class ids for the held-out rows.
pred = model.predict(X_test)

In [483]:
# Per-class precision / recall / F1 on the test split; printed so the text report keeps its layout.
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       1.00      0.85      0.92       607
           1       1.00      0.97      0.98       613
           2       0.80      1.00      0.89       589
           3       1.00      0.93      0.96       591

    accuracy                           0.94      2400
   macro avg       0.95      0.94      0.94      2400
weighted avg       0.95      0.94      0.94      2400



In [484]:
# Persist the trained model to a JSON file.
model.save_model("task_2_model.json")

In [496]:
# Confusion matrix on the test split: rows are true classes, columns are predicted classes.
confusion_matrix(y_test, pred)

array([[517,   0,  90,   0],
       [  0, 594,  19,   0],
       [  0,   0, 589,   0],
       [  0,   0,  40, 551]])

In [498]:
# Overall fraction of test rows classified correctly.
accuracy_score(y_test, pred)

0.9379166666666666