In [1]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pickle

# Input CSV that is read into `df` further down in the notebook
filepath_in = 'data/sample_data_for_task1.csv'
# Destination CSV that receives the processed frame (`df_out`) at the end of the notebook
filepath_out = 'data/processed_data.csv'

### Loading and inspecting the dataset

In [3]:
# Read the raw samples from the configured input path and preview the first rows
df = pd.read_csv(filepath_in)
df.head()

Unnamed: 0,text,label
0,zucker fabrik,ft
1,Lebensmittel kommssionierung,ft
2,geländer biegen,mr
3,gebäudeausrüstung technische,ct
4,kürbiskernöl softgels,ft


In [4]:
# Dimensions of the raw frame as (rows, columns); the recorded output shows 37,295 rows
df.shape

(37295, 2)

In [5]:
# 'label' is the target variable.
# Count the samples per class to see how balanced the classes are.
# Note: value_counts() leaves out missing labels unless dropna=False is passed.
df['label'].value_counts()

label
ft     11226
pkg     9617
ct      5061
mr      5016
ch      3688
cnc     2587
Name: count, dtype: int64

In [6]:
# Show how many values are missing in each column, then keep only the rows in which
# both 'text' and 'label' are present (the recorded output shows 100 rows without a label).
# Naming the columns explicitly means a gap in some unrelated column never discards a
# usable sample, and rebinding `df` instead of using inplace=True keeps the cell
# safe to re-run.
display(df.isna().sum())
df = df.dropna(subset=['text', 'label'])

text       0
label    100
dtype: int64

In [7]:
# Learn the label -> integer mapping first, then apply it to the same column.
# The fitted encoder is kept around so the mapping can be reused later.
label_encoder = LabelEncoder().fit(df['label'])
df['encoded_label'] = label_encoder.transform(df['label'])

In [8]:
# Persist the fitted encoder so the same label <-> integer mapping can be loaded elsewhere
with open('label_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(label_encoder, encoder_file)

### Vectorize the texts

In [6]:
# Borrow the word tokenizer of a default CountVectorizer; the vectorizer itself is never fitted.
# build_tokenizer() only splits text into tokens - it does not lowercase them.
vectorizer = CountVectorizer()
tokenizer = vectorizer.build_tokenizer()

# One list of tokens per sample, aligned with the index of `df`
tokenized_text = df['text'].map(tokenizer)

In [13]:
# Word2Vec hyperparameters
embedding_dim = 50   # length of every word vector
window_size = 1      # maximum distance between the current and the predicted word
min_count = 2        # words that occur fewer times than this are ignored
random_seed = 42     # fixed seed so repeated runs start from the same initial vectors

# Word2Vec training is stochastic. A fixed seed combined with a single worker thread
# removes the run-to-run variation caused by random initialisation and thread scheduling.
# NOTE: gensim's documentation adds that, on Python 3, PYTHONHASHSEED must also be fixed
# for fully reproducible runs across interpreter launches.
model = Word2Vec(
    sentences=tokenized_text,
    vector_size=embedding_dim,
    window=window_size,
    min_count=min_count,
    seed=random_seed,
    workers=1,
)


In [14]:
def text_to_embeddings(text, keyed_vectors=None, tokenize=None):
    """Represent a text as the average of the vectors of its known words.

    Parameters
    ----------
    text : str
        The raw text to embed.
    keyed_vectors : optional
        Word-vector lookup supporting ``token in kv``, ``kv[token]`` and
        ``kv.vector_size``. Defaults to the notebook-level ``model.wv``.
    tokenize : callable, optional
        Function turning a text into a sequence of tokens. Defaults to the
        notebook-level ``tokenizer``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``vector_size``: the element-wise mean of the
        vectors of all tokens found in the vocabulary, or an all-zero vector
        when none of the tokens is known.
    """
    # Resolve the defaults at call time so that re-training `model` in a later
    # cell is picked up without redefining this function.
    wv = model.wv if keyed_vectors is None else keyed_vectors
    split_into_tokens = tokenizer if tokenize is None else tokenize

    # Out-of-vocabulary tokens are skipped.
    known_vectors = [wv[token] for token in split_into_tokens(text) if token in wv]

    if not known_vectors:
        # Take the length from the vectors themselves rather than from a separate
        # global, so the fallback always matches the trained model. float32 is the
        # dtype gensim stores word vectors in, which keeps the fallback consistent
        # with the averaged vectors.
        return np.zeros(wv.vector_size, dtype=np.float32)

    return np.mean(known_vectors, axis=0)


In [15]:
# One averaged embedding vector per text, held temporarily in a single object column
df['embeddings'] = df['text'].map(text_to_embeddings)

# Spread the vectors over one numeric column per dimension: emb_0 ... emb_{embedding_dim - 1}
col_list = ['emb_{}'.format(i) for i in range(embedding_dim)]
df[col_list] = pd.DataFrame(df['embeddings'].to_list(), index=df.index)

# The exported frame keeps everything except the raw text and the intermediate vector column
df_out = df.drop(['text', 'embeddings'], axis=1)

In [16]:
# Sanity check: dimensions (rows, columns) of the frame that is about to be exported
df_out.shape

(37195, 51)

In [17]:
# Write the processed frame to the configured output path, leaving out the row index
df_out.to_csv(path_or_buf=filepath_out, index=False)