In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

from utils import prepropcess_data
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ataka\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ataka\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ataka\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ataka\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
data = pd.read_csv("data/train_40k.csv")
data_test = pd.read_csv("data/val_10k.csv")

df = pd.concat([data, data_test])

df.head()

Unnamed: 0,productId,Title,userId,Helpfulness,Score,Time,Text,Cat1,Cat2,Cat3
0,B000E46LYG,Golden Valley Natural Buffalo Jerky,A3MQDNGHDJU4MK,0/0,3.0,-1,The description and photo on this product need...,grocery gourmet food,meat poultry,jerky
1,B000GRA6N8,Westing Game,unknown,0/0,5.0,860630400,This was a great book!!!! It is well thought t...,toys games,games,unknown
2,B000GRA6N8,Westing Game,unknown,0/0,5.0,883008000,"I am a first year teacher, teaching 5th grade....",toys games,games,unknown
3,B000GRA6N8,Westing Game,unknown,0/0,5.0,897696000,I got the book at my bookfair at school lookin...,toys games,games,unknown
4,B00000DMDQ,I SPY A is For Jigsaw Puzzle 63pc,unknown,2/4,5.0,911865600,Hi! I'm Martine Redman and I created this puzz...,toys games,puzzles,jigsaw puzzles


In [12]:
X = df["Text"]  # Текст отзывов 

encoder1 = LabelEncoder()
encoder2 = LabelEncoder()
encoder3 = LabelEncoder()

y_cat1 = df["Cat1"]
y_cat2 = df["Cat2"]
y_cat3 = df["Cat3"]

# Преобразуем категориальные признаки в числовые
y_cat1_encode = encoder1.fit_transform(y_cat1)
y_cat2_encode = encoder2.fit_transform(y_cat2)
y_cat3_encode = encoder3.fit_transform(y_cat3)

# Создаем словари для каждого столбца
mapping_cat1 = dict(zip(encoder1.transform(encoder1.classes_), encoder1.classes_))
mapping_cat2 = dict(zip(encoder2.transform(encoder2.classes_), encoder2.classes_))
mapping_cat3 = dict(zip(encoder3.transform(encoder3.classes_), encoder3.classes_))

In [13]:
preprocces = pd.DataFrame()
preprocces['text'] = X
preprocces = prepropcess_data(preprocces)

In [14]:
X = preprocces['final_text']
X.head()

0    description photo product need change indicate...
1    great book well thought easily imagine event h...
2    first year teacher teach grade special reading...
3    get book bookfair school look something summer...
4    hi martine redman create puzzle briarpatch use...
Name: final_text, dtype: object

In [15]:
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(X)

In [39]:
import pickle
with open('tfidf_model.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [16]:
X_train, X_test, y_cat1_encode_train, y_cat1_encode_test, y_cat2_encode_train, y_cat2_encode_test, y_cat3_encode_train, y_cat3_encode_test = train_test_split(X_tfidf, y_cat1_encode, y_cat2_encode, y_cat3_encode, test_size=0.2, random_state=42)

In [38]:
import pickle
with open("variables.pickle", "wb") as f:
    pickle.dump([data, mapping_cat1, mapping_cat2, mapping_cat3], f)

# Загрузка
with open("variables.pickle", "rb") as f:
    loaded_data, loaded_map1, loaded_map2, loaded_map3 = pickle.load(f)

print(loaded_data)  # Вывод: [1, 2, 3]
print(loaded_map1)  # Вывод: {'a': 1, 'b': 2}

        productId                                              Title  \
0      B000E46LYG                Golden Valley Natural Buffalo Jerky   
1      B000GRA6N8                                       Westing Game   
2      B000GRA6N8                                       Westing Game   
3      B000GRA6N8                                       Westing Game   
4      B00000DMDQ                  I SPY A is For Jigsaw Puzzle 63pc   
...           ...                                                ...   
39995  B0006IYND6     Japonesque Silver Lipstick Palette Kit 1 piece   
39996  B000A33FZY  Truform 20-30 Below Knee Closed-Toe, Beige, Small   
39997  B000I7D2L4          Zadro Z300 Wall Mountable Fog Free Mirror   
39998  B000KHKKB2        Opalescent Glitter Lotion - 6.3 oz - Liquid   
39999  B000JFC4C8  Pure Purple by Hugo Boss Eau De Parfum Spray 3...   

               userId Helpfulness  Score        Time  \
0      A3MQDNGHDJU4MK         0/0    3.0          -1   
1             unknown  

In [19]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB 

class HierarchicalClassifier:
    def __init__(self):
        self.base_classifier_cat1 = LogisticRegression()
        self.base_classifier_cat2 = LogisticRegression()
        self.base_classifier_cat3 = LogisticRegression()


    def fit(self, X_train, y_train_cat1, y_train_cat2, y_train_cat3):
        # Обучите базовые классификаторы для каждой категории
        print(X_train.shape, y_train_cat1.shape, y_train_cat2.shape, y_train_cat3.shape)
        self.clf_cat1 = self.base_classifier_cat1.fit(X_train, y_train_cat1)
        print(2)
        self.clf_cat2 = self.base_classifier_cat2.fit(X_train, y_train_cat2)
        print(3)
        self.clf_cat3 = self.base_classifier_cat3.fit(X_train, y_train_cat3)
        

    def find_closest_category_tfidf(self, predicted_category, possible_categories):
        # Создаем объект TF-IDF для векторного представления категорий
        tfidf_vectorizer = TfidfVectorizer()
        
        # Преобразуем категории в векторы TF-IDF
        category_vectors = tfidf_vectorizer.fit_transform(possible_categories + [predicted_category])
        
        # Получаем вектор предсказанной категории
        predicted_vector = category_vectors[-1]

        # Вычисляем косинусное расстояние между вектором предсказанной категории 
        # и векторами всех возможных категорий

        similarities = cosine_similarity(predicted_vector, category_vectors[:-1])

        # Находим индекс категории с максимальным сходством
        closest_index = similarities.argmax()

        return possible_categories[closest_index]


    def predict(self, X_test):
        pred_cat1 = []
        pred_cat2 = []
        pred_cat3 = []

        
        for i in X_test:
            # print()
            # CAT 1
            # print('='*3 + 'CAT1' + '='*3)
            predicted_cat1 = self.clf_cat1.predict(i)[0]
            # print(predicted_cat1)
            name_cat1 = mapping_cat1[predicted_cat1]
            # print(name_cat1)

            possible_cat2 = data[data["Cat1"] == name_cat1]["Cat2"].unique()
            # print(possible_cat2)

            # CAT 2
            # print('='*3 + 'CAT2' + '='*3)
            predicted_cat2 = self.clf_cat2.predict(i)[0]
            # print(predicted_cat2)
            name_cat2 = mapping_cat2[predicted_cat2]
            # print(name_cat2)

            if name_cat2 not in possible_cat2 and len(possible_cat2) != 0:
                # print('='*4)
                # print('level2 -', name_cat1)
                # print('before:', name_cat2)
                if len(possible_cat2) == 1:
                    name_cat2 = possible_cat2[0]
                else:
                    name_cat2 = self.find_closest_category_tfidf(name_cat2, possible_cat2)
                # print('after', name_cat2)

                


            possible_cat3 = data[data["Cat2"] == name_cat2]["Cat3"].unique()
            # print(possible_cat3)

            # CAT 3
            # print('='*3 + 'CAT3' + '='*3)
            predicted_cat3 = self.clf_cat3.predict(i)[0]
            # print(predicted_cat3)
            name_cat3 = mapping_cat3[predicted_cat3]
            # print(name_cat3)

            if name_cat3 not in possible_cat3 and len(possible_cat3) != 0:
                # print('='*4)
                # print('level3 -', name_cat2)
                # print('before:', name_cat3)
                if len(possible_cat3) == 1:
                    name_cat3 = possible_cat3[0]
                else:
                    name_cat3 = self.find_closest_category_tfidf(name_cat3, possible_cat3)
                # print('after', name_cat3)

            pred_cat1.append(name_cat1)
            pred_cat2.append(name_cat2)
            pred_cat3.append(name_cat3)

        return pred_cat1, pred_cat2, pred_cat3
        

In [20]:
hierarchical_clf = HierarchicalClassifier()
hierarchical_clf.fit(X_train, y_cat1_encode_train, y_cat2_encode_train, y_cat3_encode_train)

(40000, 39327) (40000,) (40000,) (40000,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


2


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


3


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [37]:
import pickle

# Save the model
with open('hierarchical_clf.pkl', 'wb') as f:
    pickle.dump(hierarchical_clf, f)

# Load the model
with open('hierarchical_clf.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

In [21]:
import joblib

joblib.dump(hierarchical_clf, 'hierarchical_clf.joblib')

['hierarchical_clf.joblib']

In [22]:
loaded_clf = joblib.load('hierarchical_clf.joblib')

In [23]:
y_pred_cat1, y_pred_cat2, y_pred_cat3  = hierarchical_clf.predict(X_test)


In [35]:
y_pred_cat1_load, y_pred_cat2_load, y_pred_cat3_load  = loaded_clf.predict(X_test)


In [31]:
y_cat1_test = pd.Series(y_cat1_encode_test).map(mapping_cat1)
y_cat2_test = pd.Series(y_cat2_encode_test).map(mapping_cat2)
y_cat3_test = pd.Series(y_cat3_encode_test).map(mapping_cat3)



In [36]:
accuracy = accuracy_score(y_cat1_test, y_pred_cat1_load)
precision = precision_score(y_cat1_test, y_pred_cat1_load, average="macro")
recall = recall_score(y_cat1_test, y_pred_cat1_load, average="macro")
f1 = f1_score(y_cat1_test, y_pred_cat1_load, average="macro")

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Accuracy: 0.8329
Precision: 0.8511713719019832
Recall: 0.8170608739401382
F1-score: 0.8318092521531399


In [30]:
accuracy = accuracy_score(y_cat1_test, y_pred_cat1)
precision = precision_score(y_cat1_test, y_pred_cat1, average="macro")
recall = recall_score(y_cat1_test, y_pred_cat1, average="macro")
f1 = f1_score(y_cat1_test, y_pred_cat1, average="macro")

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Accuracy: 0.8329
Precision: 0.8511713719019832
Recall: 0.8170608739401382
F1-score: 0.8318092521531399


In [32]:
accuracy = accuracy_score(y_cat2_test, y_pred_cat2)
precision = precision_score(y_cat2_test, y_pred_cat2, average="macro")
recall = recall_score(y_cat2_test, y_pred_cat2, average="macro")
f1 = f1_score(y_cat2_test, y_pred_cat2, average="macro")

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Accuracy: 0.6167
Precision: 0.6077771157495653
Recall: 0.42596662861272305
F1-score: 0.4579642828397469


  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
accuracy = accuracy_score(y_cat3_test, y_pred_cat3)
precision = precision_score(y_cat3_test, y_pred_cat3, average="macro")
recall = recall_score(y_cat3_test, y_pred_cat3, average="macro")
f1 = f1_score(y_cat3_test, y_pred_cat3, average="macro")

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")

Accuracy: 0.4522
Precision: 0.2770147552067582
Recall: 0.15886742758372305
F1-score: 0.17970972324487536


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [42]:
with open('app/tfidf_model.pkl', 'rb') as f:
    loaded_vectorizer = pickle.load(f)

text='very good game for child'

text_tfidf = loaded_vectorizer.transform([text])

loaded_clf.predict([text_tfidf])

(['toys games'], ['games'], ['board games'])