In [108]:
import pandas as pd
import re
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/max/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/max/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [109]:
wn = nltk.WordNetLemmatizer()
wn.lemmatize("founded")

'founded'

In [110]:
df = pd.read_csv("words.csv")

In [111]:
def split_pascal_case(input_string):
    words = re.findall(r'[A-Z][a-z]*', input_string)
    return ' '.join(words)

def replace_punctuation_with_space(input_string):
    return re.sub(r'[\.\!\-\_]', ' ', input_string)


def split_string(input_string, chunk_size):
    return [input_string[i:i+chunk_size] for i in range(0, len(input_string), chunk_size)]

def clean_text(input_string):
    no_symbols = replace_punctuation_with_space(input_string).lower()
    no_symbols = no_symbols.replace(" ", "")
    # tokenized = re.split("\W+", no_symbols.lower())
    tokenized = split_string(no_symbols, 3)
    return tokenized

In [112]:
# count_vect = CountVectorizer(ngram_range=(1,3), analyzer=clean_text)
count_vect = TfidfVectorizer(analyzer=clean_text)
vectorizer = count_vect.fit(df['text'])
X = vectorizer.transform(df['text'])

In [113]:
tokenized_df = pd.DataFrame(X.toarray(), columns=count_vect.get_feature_names())
tokenized_df.head()



Unnamed: 0,1,2,],aba,aco,act,ad,add,ade,ado,...,y,ycl,yco,yop,ys,z,zat,zip,zon,élé
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
from sklearn.preprocessing import LabelEncoder

y_labels = df['label'].drop_duplicates().to_list()
y_labels

['ref',
 'address',
 'housenumber',
 'street',
 'city',
 'state',
 'country',
 'postcode',
 'email',
 'opening_hours',
 'phone',
 'lat',
 'lon',
 'location',
 'unknown']

In [115]:
l_encoder = LabelEncoder()
l_encoder.fit(y_labels)

y = l_encoder.transform(df['label'])
l_encoder.classes_

array(['address', 'city', 'country', 'email', 'housenumber', 'lat',
       'location', 'lon', 'opening_hours', 'phone', 'postcode', 'ref',
       'state', 'street', 'unknown'], dtype='<U13')

In [116]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import CategoricalNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [117]:
# Разделение данных на тренировочный и тестовый наборы
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [118]:
# k = 1  # Количество соседей для классификации
# model = KNeighborsClassifier(n_neighbors=k)
# model = RandomForestClassifier()
model = DecisionTreeClassifier()
# model = CategoricalNB()

In [119]:
# Обучение модели
model.fit(X_train, y_train)

# Предсказание на тестовом наборе
y_pred = model.predict(X_test)

In [120]:
# Оценка точности модели
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average="weighted")
recall = recall_score(y_test, y_pred, average="weighted")
f1 = f1_score(y_test, y_pred, average="weighted")
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1:", f1)

Accuracy: 0.7721518987341772
Precision: 0.8305779312108426
Recall: 0.7721518987341772
F1: 0.7623291434383473


  _warn_prf(average, modifier, msg_start, len(result))


In [121]:
prediction = model.predict(vectorizer.transform(["str"]))
l_encoder.classes_[prediction[0]]

'unknown'

In [122]:
def predict(word):
    prediction = model.predict(vectorizer.transform([word]))
    return l_encoder.classes_[prediction[0]]

In [123]:
params = [
    "addr",
    "features",
    "location",
    "address",
    "addr_full",
    "state",
    "monday",
    "work",
    "content"
]
for row in params:
    print(f"{row}: {predict(row)}")

addr: address
features: unknown
location: location
address: address
addr_full: address
state: state
monday: opening_hours
work: opening_hours
content: phone


In [126]:
import pickle

with open('model_autoparse/model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

with open('model_autoparse/vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

with open('model_autoparse/label_encoder.pkl', 'wb') as label_encoder_file:
    pickle.dump(l_encoder, label_encoder_file)