In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

In [5]:
# Load the train and test splits from CSV files in the working directory.
df_train = pd.read_csv(filepath_or_buffer='train.csv')
df_test = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
import re
import string

def trim(transient_tweet_text):
    """Return the text with leading and trailing whitespace removed."""
    stripped_text = transient_tweet_text.strip()
    return stripped_text

def strip_whiteSpaces(transient_tweet_text):
    """Collapse every run of whitespace characters into a single space.

    Whitespace is not removed entirely: leading or trailing runs are
    reduced to one space as well, they are not trimmed.
    """
    return re.sub(r'[\s]+', ' ', transient_tweet_text)

def preprocess_string(text: str) -> str:
    """Normalize a raw text string before vectorization.

    Steps, in order:
      1. delete all ASCII punctuation characters,
      2. delete code-like tokens made of uppercase letters and digits,
      3. lowercase the text,
      4. trim the ends and collapse whitespace runs to a single space.

    Args:
        text: the raw input string.

    Returns:
        The normalized string.
    """
    # Punctuation is deleted, not replaced, so "a-b" becomes "ab".
    no_punctuation = text.translate(str.maketrans('', '', string.punctuation))

    # Remove codes: whole tokens of uppercase letters and digits shaped either
    # letters-digits-letters(-anything) or digits-letters(-anything).
    # Tokens that are only letters followed by digits (e.g. "A52") are kept.
    code_pattern = r'\b([A-Z]+[0-9]+[A-Z]+[A-Z0-9]*|[0-9]+[A-Z]+[0-9]*[A-Z0-9]*)\b'
    no_codes = re.sub(code_pattern, '', no_punctuation)

    # Lowercase, strip the ends, then collapse the gaps left by removed tokens.
    return strip_whiteSpaces(trim(no_codes.lower()))

In [7]:
# Targets: raw labels from the 'kategorija' column.
y_train = df_train['kategorija']
y_test = df_test['kategorija']

# Features: text from the 'naziv' column, normalized with preprocess_string.
X_train = df_train['naziv'].apply(preprocess_string)
X_test = df_test['naziv'].apply(preprocess_string)

In [8]:
# Encode the labels as integer class ids; the encoder is fitted on train only.
labeler = LabelEncoder()
y_train_encoded = labeler.fit_transform(y_train)
y_test_encoded = labeler.transform(y_test)

# TF-IDF over unigrams and bigrams; fitted on the training texts and then
# applied unchanged to the test texts.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_encoded = vectorizer.fit_transform(X_train)
X_test_encoded = vectorizer.transform(X_test)

In [9]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with class weights balanced by class frequency;
# fit() returns the estimator itself, so the call can be chained.
model = LogisticRegression(class_weight='balanced').fit(X_train_encoded, y_train_encoded)

# Predicted (label-encoded) class ids for the test split.
y_pred = model.predict(X_test_encoded)


In [10]:
# Display the predicted (label-encoded) class ids for the test split.
y_pred

array([1, 5, 3, ..., 3, 9, 8])

In [11]:
from sklearn.metrics import f1_score, accuracy_score
import numpy as np

def compute_metrics(predictions, labels):
    """Score predicted class labels against the true labels.

    Args:
        predictions: predicted labels (passed to scikit-learn as ``y_pred``).
        labels: ground-truth labels (passed to scikit-learn as ``y_true``).

    Returns:
        dict with two entries: ``"f1"`` (weighted-average F1 score) and
        ``"accuracy"``.
    """
    return {
        "f1": f1_score(labels, predictions, average="weighted"),
        "accuracy": accuracy_score(labels, predictions),
    }

In [12]:
# Evaluate on the test split; keywords make the (predictions, labels) order explicit.
a = compute_metrics(predictions=y_pred, labels=y_test_encoded)

In [13]:
# Weighted-average F1 score on the test split.
a['f1']
 

0.9817665943794723

In [14]:
# Accuracy on the test split.
a['accuracy']

0.9817493879367906

In [15]:

# Map every encoded class id back to its category name.
# LabelEncoder encodes the label stored at classes_[k] as k, so enumerating
# classes_ gives the full id -> name mapping. Unlike a scan over y_test, this
# covers classes that never occur in the test split (the model can still
# predict them) and does not depend on y_test having a default 0..n-1 index.
key_to_category = dict(enumerate(labeler.classes_.tolist()))

print(key_to_category)

{1: 'bela-tehnika', 5: 'lepota-i-zdravlje', 3: 'it-uredjaji', 8: 'telefonija', 9: 'tv-video-i-foto-tehnika', 4: 'kucni-aparati', 0: 'alati-i-bastenska-oprema', 6: 'pokucstvo', 7: 'sport-i-rekreacija', 2: 'grejanje-i-klimatizacija'}


In [34]:
# Display the class-id -> category-name mapping.
key_to_category


{1: 'bela-tehnika',
 5: 'lepota-i-zdravlje',
 3: 'it-uredjaji',
 8: 'telefonija',
 9: 'tv-video-i-foto-tehnika',
 4: 'kucni-aparati',
 0: 'alati-i-bastenska-oprema',
 6: 'pokucstvo',
 7: 'sport-i-rekreacija',
 2: 'grejanje-i-klimatizacija'}

In [17]:
# Spot check: preprocessed text of test sample 505 and its predicted category,
# printed on separate lines.
print(X_test[505], key_to_category[y_pred[505]], sep='\n')

apple mobilni telefon iphone 14 pro max space black mq9p3sxa
telefonija


In [18]:
import numpy as np
# Count how many test samples were assigned to each predicted class id.
unique, counts = np.unique(y_pred, return_counts=True)
print({class_id: n_predicted for class_id, n_predicted in zip(unique, counts)})

{0: 124, 1: 964, 2: 202, 3: 1233, 4: 663, 5: 303, 6: 199, 7: 240, 8: 293, 9: 272}


In [21]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the test split (rows are encoded class ids).
print(classification_report(y_true=y_test_encoded, y_pred=y_pred))

              precision    recall  f1-score   support

           0       1.00      0.97      0.98       128
           1       1.00      0.99      0.99       974
           2       0.95      0.98      0.97       195
           3       0.98      0.99      0.99      1222
           4       0.99      0.97      0.98       676
           5       0.95      0.99      0.97       290
           6       0.97      0.96      0.97       202
           7       0.98      0.99      0.98       238
           8       0.99      0.99      0.99       294
           9       0.96      0.95      0.95       274

    accuracy                           0.98      4493
   macro avg       0.98      0.98      0.98      4493
weighted avg       0.98      0.98      0.98      4493



In [22]:
import pandas as pd

def check_train_test_overlap(train_path, test_path, key_columns=None):
    """Report how many test rows also occur in the training set.

    Both files are read with ``pd.read_csv``. A test row counts as a
    duplicate when its values in ``key_columns`` exactly match at least one
    training row. Each test row is counted at most once, so the percentage
    never exceeds 100 even if the training file contains repeated rows.

    Args:
        train_path: path of the training CSV file.
        test_path: path of the test CSV file.
        key_columns: column name or list of column names to compare on.
            Defaults to all columns of the training file.

    Returns:
        dict with the keys ``train_samples``, ``test_samples``,
        ``duplicate_samples``, ``duplicate_percentage`` (share of test rows,
        0-100; 0.0 for an empty test file), ``duplicate_columns`` and
        ``duplicate_examples`` (up to three overlapping test rows as record
        dicts, or ``None`` when there is no overlap).
    """
    train = pd.read_csv(train_path)
    test = pd.read_csv(test_path)

    # Compare on every training column unless the caller narrows it down.
    if key_columns is None:
        key_columns = train.columns.tolist()

    # Accept a single column name as well as a list of names.
    merge_keys = [key_columns] if isinstance(key_columns, str) else list(key_columns)

    # De-duplicate the training keys before joining: otherwise a test row that
    # matches k identical training rows shows up k times in the inner join,
    # inflating the duplicate count and the percentage.
    train_keys = train[merge_keys].drop_duplicates()
    duplicates = test.merge(train_keys, how='inner', on=merge_keys)

    n_test = len(test)
    n_duplicates = len(duplicates)

    return {
        'train_samples': len(train),
        'test_samples': n_test,
        'duplicate_samples': n_duplicates,
        # Guard against an empty test file (would otherwise divide by zero).
        'duplicate_percentage': n_duplicates / n_test * 100 if n_test else 0.0,
        'duplicate_columns': key_columns,
        'duplicate_examples': duplicates.head(3).to_dict('records') if n_duplicates > 0 else None,
    }



In [23]:
# Check whether any test row also appears verbatim in the training file.
check_train_test_overlap(train_path='train.csv', test_path='test.csv')

{'train_samples': 18222,
 'test_samples': 4493,
 'duplicate_samples': 0,
 'duplicate_percentage': 0.0,
 'duplicate_columns': ['naziv', 'kategorija'],
 'duplicate_examples': None}

In [27]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Stateless transformer that applies ``preprocess_string`` to each text."""

    def fit(self, X, y=None):
        """Learn nothing and return ``self``; both arguments are ignored."""
        return self

    def transform(self, X):
        """Return a list with ``preprocess_string`` applied to every item of ``X``."""
        return list(map(preprocess_string, X))

# Build the inference pipeline. It reuses the vectorizer and model objects
# that were already fitted above, so raw strings can go straight to predict().
pipeline = Pipeline(steps=[
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', vectorizer),
    ('classifier', model),
])

In [32]:
# Smoke test: run one raw string through the pipeline and translate the
# predicted class id (a one-element array, unwrapped with .item()) to its name.
key_to_category[pipeline.predict(['Bosch Rerna ugradna']).item()]

'bela-tehnika'

In [33]:
import joblib

# Persist the trained pipeline to disk.
# NOTE(review): the pipeline embeds TextPreprocessor, which is defined in this
# notebook, and its predictions are encoded class ids. Whoever loads the file
# presumably needs that class (plus preprocess_string) importable and a copy
# of the id -> category mapping, which is not saved here. Confirm before deploying.
joblib.dump(value=pipeline, filename='pipeline.pkl')

['pipeline.pkl']