In [143]:
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import tiktoken

#### Load dataset

In [144]:
# Load the labelled training examples; later cells read its `text` and `label` columns.
df = pd.read_csv("words2.csv")

In [145]:
# Load tiktoken's `cl100k_base` encoding once; `tokenize` uses it to encode and decode text.
encoding = tiktoken.get_encoding("cl100k_base")

#### Tokenizer

In [146]:
def split_pascal_case(input_string):
    """Return the capitalised words of `input_string` joined by single spaces.

    A word is one ASCII uppercase letter followed by any number of ASCII
    lowercase letters. Everything outside such runs (a leading lowercase
    prefix, digits, punctuation) is dropped, so a string without uppercase
    letters yields an empty string.
    """
    pieces = (match.group(0) for match in re.finditer(r'[A-Z][a-z]*', input_string))
    return ' '.join(pieces)

def replace_punctuation_with_space(input_string):
    """Replace every `.`, `!`, `-` and `_` in `input_string` with a space.

    All other characters, including other punctuation, are left unchanged.
    """
    separators = re.compile(r'[\.\!\-\_]')
    return separators.sub(' ', input_string)


def split_string(input_string, chunk_size):
    """Cut `input_string` into consecutive pieces of `chunk_size` characters.

    The last piece is shorter when the length is not a multiple of
    `chunk_size`; an empty string gives an empty list.
    """
    chunks = []
    for start in range(0, len(input_string), chunk_size):
        chunks.append(input_string[start:start + chunk_size])
    return chunks

def clean_text(input_string):
    """Normalise `input_string` and cut it into 3-character chunks.

    The characters `.`, `!`, `-`, `_` are turned into spaces, the text is
    lower-cased, all spaces are removed, and the remainder is split into
    consecutive pieces of three characters (the last one may be shorter).
    """
    lowered = replace_punctuation_with_space(input_string).lower()
    compact = lowered.replace(" ", "")
    return split_string(compact, 3)

def tokenize(input_string):
    """Split a string into the text pieces of its `cl100k_base` tokens.

    The input is normalised first: `.`, `!`, `-` and `_` become spaces, the
    text is lower-cased and all spaces are removed. For example `"addr_full"`
    and `"Addr Full"` are both encoded as `"addrfull"`.

    Args:
        input_string: Text to tokenize.

    Returns:
        A list of strings, one per token produced by the module-level
        `encoding`. Token bytes that are not valid UTF-8 on their own are
        rendered with the U+FFFD replacement character.
    """
    no_symbols = re.sub(r'[\.\!\-\_]', ' ', input_string).lower()
    no_symbols = no_symbols.replace(" ", "")
    tokens = encoding.encode(no_symbols)
    # A single token can hold only part of a multi-byte UTF-8 character
    # (non-ASCII input), in which case a strict decode raises
    # UnicodeDecodeError. Decode leniently so such input still gets tokens.
    return [
        encoding.decode_single_token_bytes(token).decode("utf-8", errors="replace")
        for token in tokens
    ]

In [147]:
# Store each text's length in a new `length` column.
# NOTE(review): this column is not read again in the visible cells — confirm it is still needed.
df["length"] = df['text'].apply(len)

#### Vectorization

In [148]:
# Alternative kept for reference:
# count_vect = CountVectorizer(ngram_range=(1,3), analyzer=clean_text)

# TF-IDF features over the token pieces produced by `tokenize`.
count_vect = TfidfVectorizer(analyzer=tokenize)

# fit() returns the vectorizer itself, so `vectorizer` and `count_vect`
# refer to the same fitted object.
vectorizer = count_vect.fit(df['text'])
X = vectorizer.transform(df['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Dense view of the feature matrix, for inspection only.
tokenized_df = pd.DataFrame(X.toarray(), columns=feature_names)
tokenized_df.head()



Unnamed: 0,1,2,3,[,[l,],ac,ach,ad,add,...,wo,work,working,works,x,y,z,zip,zipcode,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### Label encoding

In [149]:
from sklearn.preprocessing import LabelEncoder

# Distinct label strings, in order of first appearance in the training data.
y_labels = df['label'].drop_duplicates().to_list()

# Fit the encoder on the distinct labels (fit() returns the encoder itself),
# then map every training label to its integer code.
l_encoder = LabelEncoder().fit(y_labels)
y = l_encoder.transform(df['label'])

# Show the label names; the position of a name is its integer code.
l_encoder.classes_

array(['address', 'city', 'country', 'email', 'housenumber', 'lat',
       'location', 'lon', 'opening_hours', 'phone', 'placename',
       'postcode', 'ref', 'state', 'store_url', 'street', 'unknown'],
      dtype='<U13')

#### Model creation

In [150]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [151]:
# Split the data into training and test sets.
# The random split below is disabled: the model is trained on all of
# `words2.csv` and evaluated on a separate test file instead.
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
df_test = pd.read_csv("words2_test.csv")

# The full training matrix and labels are used for fitting.
X_train, y_train = X, y

# Reuse the already-fitted vectorizer and label encoder for the test data.
X_test = vectorizer.transform(df_test['text'])
y_test = l_encoder.transform(df_test['label'])

X_test

<19x336 sparse matrix of type '<class 'numpy.float64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [152]:
# Alternative classifiers that were tried:
# k = 5  # number of neighbours used for classification
# model = KNeighborsClassifier(n_neighbors=k)
# model = RandomForestClassifier()
# model = CategoricalNB()

# Fix the seed: the tree builder breaks ties between equally good splits
# at random, so an unseeded tree can change between runs on the same data.
model = DecisionTreeClassifier(random_state=42)

In [153]:
# Train the model on the training set. Use X_train (not X) so the features
# always match y_train, including when a train/test split is enabled.
model.fit(X_train, y_train)

# Predict labels for the test set.
y_pred = model.predict(X_test)

In [154]:
# Evaluate the predictions against the true test labels.
# Some labels are never predicted (or never occur) in the small test set,
# which leaves their precision/recall undefined. zero_division=0 scores
# those labels as 0 explicitly instead of emitting UndefinedMetricWarning;
# the resulting numbers are the same as with the default setting.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted", zero_division=0)
recall = recall_score(y_test, y_pred, average="weighted", zero_division=0)
f1 = f1_score(y_test, y_pred, average="weighted", zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.8421052631578947
Precision: 0.8578947368421053
Recall: 0.8421052631578947
F1: 0.8236842105263158


  _warn_prf(average, modifier, msg_start, len(result))


#### Model testing 

In [155]:
# Smoke test: vectorize one word, classify it, and show the label name.
sample_features = vectorizer.transform(["country"])
prediction = model.predict(sample_features)
l_encoder.classes_[prediction[0]]

'country'

In [156]:
def predict(word):
    """Return the label name the trained model assigns to a single `word`.

    The word is vectorized with the fitted `vectorizer`, classified by
    `model`, and the resulting class index is looked up in
    `l_encoder.classes_`.
    """
    features = vectorizer.transform([word])
    label_index = model.predict(features)[0]
    return l_encoder.classes_[label_index]

In [157]:
# Spot-check the classifier on a handful of sample words.
params = [
    "addr", "features", "location", "address", "addr_full",
    "state", "monday", "work", "content", "country",
    "website", "webpage", "url",
]

# Print one "<word>: <predicted label>" line per sample.
for row in params:
    print(f"{row}: {predict(row)}")

addr: address
features: location
location: location
address: address
addr_full: address
state: state
monday: opening_hours
work: opening_hours
content: unknown
country: country
website: store_url
webpage: store_url
url: store_url


#### Model saving

In [158]:
import pickle
from pathlib import Path

# Persist the three fitted objects needed at inference time.
# Create the target directory first: open(..., 'wb') does not create missing
# parent directories and would fail with FileNotFoundError on a fresh checkout.
output_dir = Path('model_autoparse')
output_dir.mkdir(parents=True, exist_ok=True)

# NOTE: the vectorizer was built with analyzer=tokenize, and pickle stores
# functions by reference. Code that loads vectorizer.pkl must therefore have
# a compatible `tokenize` (and the `encoding` it uses) available.
artifacts = {
    'model.pkl': model,
    'vectorizer.pkl': vectorizer,
    'label_encoder.pkl': l_encoder,
}
for file_name, artifact in artifacts.items():
    with open(output_dir / file_name, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)