In [2]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
import string

In [19]:
df_train = pd.read_csv('X_train_update.csv')
y_train = pd.read_csv('Y_train_CVw08PX.csv')

In [4]:
df_train.shape

(84916, 5)

In [5]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,designation,description,productid,imageid
0,0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046
1,1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237
2,2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978
3,3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496
4,4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786


In [6]:
df_train['class']=y_train['prdtypecode']

In [7]:
df_train.head()

Unnamed: 0.1,Unnamed: 0,designation,description,productid,imageid,class
0,0,Olivia: Personalisiertes Notizbuch / 150 Seite...,,3804725264,1263597046,10
1,1,Journal Des Arts (Le) N° 133 Du 28/09/2001 - L...,,436067568,1008141237,2280
2,2,Grand Stylet Ergonomique Bleu Gamepad Nintendo...,PILOT STYLE Touch Pen de marque Speedlink est ...,201115110,938777978,50
3,3,Peluche Donald - Europe - Disneyland 2000 (Mar...,,50418756,457047496,1280
4,4,La Guerre Des Tuques,Luc a des id&eacute;es de grandeur. Il veut or...,278535884,1077757786,2705


In [8]:
def normalize_text(text):
    text = text.lower().strip()
    # Only ascii
    text = text.encode('ascii', 'ignore').decode()
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Get rid of extra spaces
    text = ' '.join(text.split())
    return text

In [9]:
split_share = np.array([0.8])
split_at = (len(df_train) * split_share).astype(int)

iids_per_split = np.array_split(df_train.index, split_at)

split_names = ['train', 'test']
for split_name, iids in zip(split_names, iids_per_split):
    df_train.loc[iids, 'split'] = split_name

In [10]:
df_train['split'].value_counts()

train    67932
test     16984
Name: split, dtype: int64

In [11]:
df_train['class'].isna().sum()

0

In [12]:
df_train=df_train.dropna(subset=['class'])

In [13]:
text_list =[]
for row_id, row in df_train.iterrows():
    text = str(row['designation'])+str(row['description'])
    text_list.append(text)

df_train['text'] = text_list

In [14]:
# Create fasttext .txt files
for split_name in ['test','train']:
    with Path(f"fasttextkids_{split_name}.txt") .open("w") as f:
        lines_to_save = {}
        for row_id, row in df_train.query("split==@split_name").iterrows():
            label, text = str(int(row["class"])), normalize_text(row["text"])
            lines_to_save[row_id] = (label.replace(" ", "_"), text)
            line = "__label__{} {}".format(label.replace(" ", "_"), text)
            f.write(line+"\n")

In [29]:
import fasttext
model_params = {
    'input': "fasttextkids_train.txt",
    'autotuneValidationFile': "fasttextkids_test.txt",
    'autotuneDuration': 500,
   }
# 'dim': 16,'wordNgrams': 2
model_fasttext = fasttext.train_supervised(**model_params)

Progress: 100.0% Trials:   12 Best score:  0.813884 ETA:   0h 0m 0s
Training again with best arguments
Read 5M words
Number of words:  211955
Number of labels: 27
Progress: 100.0% words/sec/thread:  539362 lr:  0.000000 avg.loss:  0.062192 ETA:   0h 0m 0s  3.7% words/sec/thread:  205032 lr:  0.245655 avg.loss:  1.058080 ETA:   0h 4m42s 19.9% words/sec/thread:  264867 lr:  0.204365 avg.loss:  0.253862 ETA:   0h 3m 1s 34.4% words/sec/thread:  379532 lr:  0.167294 avg.loss:  0.154968 ETA:   0h 1m43s 85.2% words/sec/thread:  524388 lr:  0.037820 avg.loss:  0.071173 ETA:   0h 0m17s


In [30]:
# Evaluate fasttext model
model_result = model_fasttext.test("fasttextkids_test.txt")
print(f"{model_result=}")  ## retourne sample, précision, recall.
model_fasttext.save_model("trained_model.bin")

model_result=(16984, 0.813177107866227, 0.813177107866227)


In [24]:
df_test =pd.read_csv('/Users/lucashennecon/Documents/Cours_CS/Infonum/EI/X_test_update.csv')

In [25]:
label_list = []
for row_id, row in df_test.iterrows():
    text = str(row['designation'])+str(row['description'])
    text = normalize_text(text)
    label, _ = model_fasttext.predict(text)
    label = label[0][9:]
    label_list.append(label)

dict = {'label': label_list}
df = pd.DataFrame(dict) 

df.to_csv('label.csv') 




In [26]:
df.head()

Unnamed: 0,label
0,10
1,1160
2,2583
3,2583
4,2522
