In [None]:
import pandas as pd

In [None]:
# Load the training split. NOTE(review): the path string looks like a
# placeholder and must point at the real training CSV before running.
df = pd.read_csv('TRAIN-DATASET-PATH')

# Preview the first two records transposed, so every column is shown as a row.
df.iloc[:2].T


## Class Label Encoding

In [None]:
from sklearn.preprocessing import LabelEncoder


# Encode the string labels in 'hazard-category' as integer class ids, stored
# in the new 'hazard_category' column (underscore instead of hyphen).
label_encoder = LabelEncoder()
df['hazard_category'] = label_encoder.fit_transform(df['hazard-category'])

# Build {original label: integer id} so the encoding can be inspected.
encoded_ids = label_encoder.transform(label_encoder.classes_)
label_mapping = {name: code for name, code in zip(label_encoder.classes_, encoded_ids)}

label_mapping.items()

In [None]:
# Encode 'product-category' into the integer column 'product_category'.
# NOTE(review): this refits the same `label_encoder` object, so the classes
# learned from 'hazard-category' are replaced; `label_mapping` is overwritten too.
df['product_category'] = label_encoder.fit_transform(df['product-category'])

# Build {original label: integer id} for the product classes.
product_ids = label_encoder.transform(label_encoder.classes_)
label_mapping = {name: code for name, code in zip(label_encoder.classes_, product_ids)}

label_mapping.items()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF configuration:
#   max_features=3500 -> cap the vocabulary size
#   max_df=0.75       -> drop terms that appear in more than 75% of documents
#   sublinear_tf=True -> use 1 + log(tf) instead of the raw term frequency
tfidf = TfidfVectorizer(
    max_features=3500,
    max_df=0.75,
    sublinear_tf=True,
)

In [None]:
# Fit the vectorizer on the training text (cast to str first) and densify.
# NOTE(review): the vectorizer is fitted on the whole frame before the
# train/test split below, so the held-out rows influence the vocabulary/IDF.
text_as_str = df['text'].astype(str)
features = tfidf.fit_transform(text_as_str).toarray()

# Keep one dense row vector per record alongside the other columns.
df['vector'] = [row for row in features]


## Hazard Category

In [None]:
# Hazard task: inputs are the per-row TF-IDF vectors, targets the encoded hazard ids.
X, y = df['vector'], df['hazard_category']

In [None]:
import numpy as np

# Stack the per-row vectors into one 2-D array.
# Reading from df['vector'] instead of from X keeps this cell re-runnable:
# the earlier `np.stack(X.values)` form fails on a second run because X has
# already been rebound to an ndarray, which has no `.values` attribute.
X = np.stack(df['vector'].values)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; random_state fixes the split.
# X is passed as the ndarray it already is: converting the dense matrix to
# nested Python lists only inflated memory, and every later cell wraps the
# result in np.array(...) again anyway.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)
type(X_train)


In [None]:
# Tabulate how many training rows each hazard class id has.
hazard_train_counts = y_train.value_counts()
ct = hazard_train_counts.reset_index()
print(ct)

In [None]:

# Class id(s) kept out of the first oversampling pass.
# NOTE(review): presumably chosen from the counts printed above — confirm.
minority_classes = [6]

# Compute the membership mask and the array view of the features once.
is_minority = y_train.isin(minority_classes)
X_train_arr = np.array(X_train)

X_majority, y_majority = X_train_arr[~is_minority], y_train[~is_minority]
X_minority, y_minority = X_train_arr[is_minority], y_train[is_minority]


In [None]:

from imblearn.over_sampling import SMOTE

# Stage 1: oversample only the rows outside `minority_classes`, using 7 neighbours.
sm = SMOTE(sampling_strategy='auto', k_neighbors=7, random_state=42)
X_resampled_majority, y_resampled_majority = sm.fit_resample(X_majority, y_majority)

# Re-attach the rows that were held out of stage 1.
X_combined = np.vstack((X_resampled_majority, X_minority))
y_combined = np.concatenate((y_resampled_majority, y_minority))

# Stage 2: oversample the combined set with a single neighbour.
# NOTE(review): k_neighbors=1 is presumably needed because the held-out
# class has very few samples — confirm against the class counts.
sm_minority = SMOTE(sampling_strategy='auto', k_neighbors=1, random_state=42)
X_final_resampled, y_final_resampled = sm_minority.fit_resample(X_combined, y_combined)

# Shuffle with a seeded generator so the training order is reproducible,
# matching the fixed seed (42) used by the split and both SMOTE passes.
shuffled_indices = np.random.default_rng(42).permutation(len(X_final_resampled))

# Shuffle both Xs and ys using the same shuffled indices
Xs_shuffled = X_final_resampled[shuffled_indices]
ys_shuffled = y_final_resampled[shuffled_indices]

Xs = Xs_shuffled
ys = ys_shuffled


In [None]:
# Swap in the resampled, shuffled data as the training set; the held-out
# test rows are only converted to an ndarray.
# NOTE(review): this rebinds X_train / y_train, so the earlier cells that
# call y_train.isin(...) cannot be re-run after this one.
X_train, y_train = np.array(Xs), ys
X_test = np.array(X_test)

In [None]:
# Sanity check: dimensions of the resampled training matrix.
X_train.shape

In [None]:
from xgboost import XGBClassifier
# classification_report is used below but was never imported anywhere in the
# notebook, which made the final print raise NameError.
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Train the hazard classifier on the resampled training set.
classifier = XGBClassifier(verbosity = 2)
classifier.fit(X_train, y_train)

# Evaluate on the held-out split (not resampled).
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print(cm)

print(classification_report(y_test, y_pred))

## Validation

In [None]:
# Load the evaluation CSV and list its columns.
# NOTE(review): the path string looks like a placeholder — point it at the real file.
val_df = pd.read_csv("TEST-DATASET-PATH")
val_df.columns

In [None]:
# Vectorize the evaluation text with the already-fitted vectorizer.
# The text is cast to str exactly as at fit time, so missing/non-string
# entries do not make the vectorizer fail.
val_features = tfidf.transform(val_df["text"].astype(str)).toarray()
val_df['vector'] = list(val_features)

In [None]:
# Stack the per-row evaluation vectors into one 2-D matrix for prediction.
vx = np.stack(val_df['vector'].to_numpy())

In [None]:
# Hazard label strings, indexed by predicted class id.
# NOTE(review): the order must match the integer codes assigned by the label
# encoder for 'hazard-category' — verify against the printed label mapping.
class_names = [
    "allergens",
    "biological",
    "chemical",
    "food additives and flavourings",
    "foreign bodies",
    "fraud",
    "migration",
    "organoleptic aspects",
    "other hazard",
    "packaging defect",
]


In [None]:
xgboost_pred = classifier.predict(vx)

# Decode predicted ids via the training frame itself: it holds both the
# encoded column ('hazard_category') and the original strings
# ('hazard-category'), so the mapping cannot drift from the actual encoding
# and does not depend on the `class_names` variable, which is rebound later
# for the product task.
hazard_id_to_name = dict(zip(df['hazard_category'], df['hazard-category']))
predicted_class_names = [hazard_id_to_name[int(i)] for i in xgboost_pred]


In [None]:
# One-column frame holding the decoded hazard predictions.
predicted_class_df = pd.DataFrame({'hazard-category-xgboost': predicted_class_names})

# Plain-text dump of the frame for a quick visual check.
print(predicted_class_df)

In [None]:
# Write the hazard predictions to disk without the index column.
predicted_class_df.to_csv("submission_xg.csv", index=False)

In [None]:
# classification_report is not imported anywhere else in the notebook, so
# import it here to keep this cell from raising NameError.
from sklearn.metrics import classification_report

# Compare the predicted hazard labels with the 'hazard-category' column of the evaluation file.
print(classification_report(val_df['hazard-category'], predicted_class_df['hazard-category-xgboost']))

## Product Category

In [None]:
# Product task: same TF-IDF vectors as inputs, encoded product ids as targets.
y_p = df['product_category']
X_p = np.stack(df['vector'].values)

# Class distribution over the full training frame.
ct = y_p.value_counts().reset_index()
print(ct)

In [None]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; random_state fixes the split.
# X_p is passed as the ndarray it already is: converting the dense matrix to
# nested Python lists only inflated memory, and the later cells wrap the
# result in np.array(...) again anyway.
X_p_train, X_p_test, y_p_train, y_p_test = train_test_split(X_p, y_p, test_size=0.2, random_state=42)
type(X_p_train)


In [None]:
# Tabulate how many training rows each product class id has.
product_train_counts = y_p_train.value_counts()
ct = product_train_counts.reset_index()
print(ct)

In [None]:

# Product class ids kept out of the first oversampling pass.
# NOTE(review): presumably the rarest classes in the counts printed above — confirm.
minority_classes = [21,6,8,11,7]

X_p_majority = np.array(X_p_train)[~y_p_train.isin(minority_classes)]
y_p_majority = y_p_train[~y_p_train.isin(minority_classes)]
X_p_minority = np.array(X_p_train)[y_p_train.isin(minority_classes)]
y_p_minority = y_p_train[y_p_train.isin(minority_classes)]


from imblearn.over_sampling import SMOTE

# Stage 1: oversample only the rows outside `minority_classes`, using 7 neighbours.
# The labels go to `y_p_resampled_majority` (with the `_p` prefix like every
# other product variable) instead of overwriting the hazard task's
# `y_resampled_majority`.
sm = SMOTE(sampling_strategy='auto', k_neighbors=7, random_state=42)
X_p_resampled_majority, y_p_resampled_majority = sm.fit_resample(X_p_majority, y_p_majority)

# Re-attach the rows that were held out of stage 1.
X_p_combined = np.vstack((X_p_resampled_majority, X_p_minority))
y_p_combined = np.concatenate((y_p_resampled_majority, y_p_minority))

# Stage 2: oversample the combined set with 3 neighbours.
sm_minority = SMOTE(sampling_strategy='auto', k_neighbors=3, random_state=42)
X_p_final_resampled, y_p_final_resampled = sm_minority.fit_resample(X_p_combined, y_p_combined)

# Shuffle with a seeded generator so the training order is reproducible,
# matching the fixed seed (42) used by the split and both SMOTE passes.
shuffled_indices = np.random.default_rng(42).permutation(len(X_p_final_resampled))

# Shuffle both X_ps and y_ps using the same shuffled indices
X_ps_shuffled = X_p_final_resampled[shuffled_indices]
y_ps_shuffled = y_p_final_resampled[shuffled_indices]

X_ps = X_ps_shuffled
y_ps = y_ps_shuffled


In [None]:

# Swap in the resampled, shuffled data as the product training set; the
# held-out rows are only converted to an ndarray.
X_p_train, y_p_train = np.array(X_ps), y_ps
X_p_test = np.array(X_p_test)

# Dimensions of the resampled training matrix.
X_p_train.shape

In [None]:
from xgboost import XGBClassifier
# classification_report is used below but was never imported anywhere in the
# notebook, which made the final print raise NameError.
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Train the product classifier on the resampled training set.
classifier2 = XGBClassifier()
classifier2.fit(X_p_train, y_p_train)

# Evaluate on the held-out split (not resampled).
y_p_pred = classifier2.predict(X_p_test)
cm = confusion_matrix(y_p_test, y_p_pred)
print(cm)

print(classification_report(y_p_test, y_p_pred))

In [None]:

# Product label strings, indexed by predicted class id.
# NOTE(review): this rebinds `class_names` (previously the hazard labels); the
# order must match the integer codes assigned by the label encoder for
# 'product-category' — verify against the printed label mapping.
class_names = [
    "alcoholic beverages",
    "cereals and bakery products",
    "cocoa and cocoa preparations, coffee and tea",
    "confectionery",
    "dietetic foods, food supplements, fortified foods",
    "fats and oils",
    "feed materials",
    "food additives and flavourings",
    "food contact materials",
    "fruits and vegetables",
    "herbs and spices",
    "honey and royal jelly",
    "ices and desserts",
    "meat, egg and dairy products",
    "non-alcoholic beverages",
    "nuts, nut products and seeds",
    "other food product / mixed",
    "pet feed",
    "prepared dishes and snacks",
    "seafood",
    "soups, broths, sauces and condiments",
    "sugars and syrups",
]


In [None]:
xgboost_pred2 = classifier2.predict(vx)

# Decode predicted ids via the training frame itself: it holds both the
# encoded column ('product_category') and the original strings
# ('product-category'), so the mapping cannot drift from the actual encoding
# the way a hand-maintained label list can.
product_id_to_name = dict(zip(df['product_category'], df['product-category']))
predicted_class_names = [product_id_to_name[int(i)] for i in xgboost_pred2]
predicted_class_names

In [None]:
# Attach the product predictions as a second column next to the hazard ones.
# `predicted_class_names` holds product labels at this point because the
# previous cell rebound it; this cell therefore depends on run order.
predicted_class_df['product-category'] = predicted_class_names
predicted_class_df.head()

In [None]:

# Write the combined predictions (hazard and product columns) without the index.
predicted_class_df.to_csv('submission.csv', index=False)