In [244]:
import re
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from keras.utils import to_categorical
import tiktoken

In [245]:
# Load the labelled examples; later cells read the 'text' and 'label' columns.
df = pd.read_csv("words2.csv")

In [246]:
# Shared tiktoken encoding object; `tokenize` below uses it to encode text
# and to turn token ids back into text pieces.
encoding = tiktoken.get_encoding("cl100k_base")

In [247]:
def clean_text(input_string):
    """Normalise a string and cut it into consecutive 3-character chunks.

    The characters ``.``, ``!``, ``-`` and ``_`` are turned into spaces, the
    text is lower-cased, every space is then removed, and what is left is
    sliced left to right into pieces of three characters (the final piece may
    be shorter).

    Args:
        input_string: Text to normalise and split.

    Returns:
        List of chunks in order; empty when nothing is left after cleaning.
    """
    chunk_size = 3

    # Punctuation is first mapped to a space, then all spaces are dropped, so
    # both punctuation and spaces vanish from the result.
    squashed = re.sub(r'[\.\!\-\_]', ' ', input_string).lower().replace(" ", "")

    chunks = []
    for start in range(0, len(squashed), chunk_size):
        chunks.append(squashed[start:start + chunk_size])
    return chunks

def tokenize(input_string):
    """Split a string into the text pieces of its tiktoken tokens.

    The characters ``.``, ``!``, ``-`` and ``_`` are turned into spaces, the
    text is lower-cased and every space is removed. The result is encoded
    with the module-level ``encoding`` and each token id is converted back to
    its text.

    Args:
        input_string: Text to tokenize.

    Returns:
        List of decoded token strings, one per token id, in order.
    """
    no_symbols = re.sub(r'[\.\!\-\_]', ' ', input_string).lower().replace(" ", "")
    tokens = encoding.encode(no_symbols)
    # A single token may hold only part of a multi-byte UTF-8 character, and a
    # strict decode would then raise UnicodeDecodeError. Decode leniently so
    # non-ASCII input cannot crash the vectorizer.
    return [
        encoding.decode_single_token_bytes(token).decode("utf-8", errors="replace")
        for token in tokens
    ]

In [248]:
# Alternative featurizer (fixed 3-character chunks instead of tiktoken pieces):
# count_vect = CountVectorizer(ngram_range=(1,1), analyzer=clean_text)
count_vect = TfidfVectorizer(analyzer=tokenize)

# `vectorizer` is the fitted instance; later cells reuse it to featurize new text.
vectorizer = count_vect.fit(df['text'])
X = vectorizer.transform(df['text'])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported replacement.
tokenized_df = pd.DataFrame(X.toarray(), columns=count_vect.get_feature_names_out())
tokenized_df.head()
# tokenized_df.to_csv("tokenized.csv")



Unnamed: 0,1,2,3,[,[l,],ac,ach,ad,add,...,wh,work,working,works,x,y,z,zip,zipcode,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [249]:
# Map each distinct label string to an integer class id.
y_labels = df['label'].drop_duplicates().to_list()
l_encoder = LabelEncoder().fit(y_labels)
print(l_encoder.classes_)

# One-hot encode the integer ids so they match the softmax output layer.
y = to_categorical(l_encoder.transform(df['label']))
y.shape

['address' 'city' 'country' 'email' 'housenumber' 'lat' 'location' 'lon'
 'opening_hours' 'phone' 'placename' 'postcode' 'ref' 'state' 'store_url'
 'street' 'unknown']


(256, 17)

In [250]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import gradient_descent_v2
from keras.losses import categorical_crossentropy
from keras.metrics import Accuracy, Precision, accuracy
from sklearn.model_selection import train_test_split

In [251]:
# Hold out a quarter of the rows for evaluation; the fixed seed keeps the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [252]:
# Second dimension of the training matrix = number of input features for the
# first Dense layer.
val, input_n = X_train.shape

# Print the shape of every split, features first and targets second.
for split in (X_train, X_test, y_train, y_test):
    print(split.shape)

(192, 296)
(64, 296)
(192, 17)
(64, 17)


In [253]:
# Classifier with a single hidden layer over the vectorized text features.
hidden_n = 50
# One output unit per one-hot label column, rather than a hard-coded count.
n_classes = y_train.shape[1]

model = Sequential()
model.add(Dense(hidden_n, activation="relu", input_shape=(input_n,)))
# input_shape is only needed on the first layer; later layers infer it.
model.add(Dense(n_classes, activation="softmax"))

optimizer = gradient_descent_v2.SGD(learning_rate=0.3)
model.compile(
    optimizer=optimizer,
    metrics=[
        # keras.metrics.Accuracy() tests exact equality between predictions
        # and targets, which is almost never true for softmax probabilities
        # against one-hot labels. Categorical accuracy compares the argmax of
        # each row instead.
        "categorical_accuracy",
        Precision(),
    ],
    loss=categorical_crossentropy,
)
model.summary()

Model: "sequential_18"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_36 (Dense)            (None, 50)                14850     
                                                                 
 dense_37 (Dense)            (None, 17)                867       
                                                                 
Total params: 15,717
Trainable params: 15,717
Non-trainable params: 0
_________________________________________________________________


In [254]:
# Train for 100 epochs on the training split. The feature matrix is converted
# with .toarray() before it is handed to Keras.
model.fit(X_train.toarray(), y_train, epochs=100)

Epoch 1/100


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<keras.callbacks.History at 0x7fcc704ca0b0>

In [255]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

predictions = model.predict(X_test.toarray())

# Reduce each softmax row and each one-hot target row to a class index in a
# single vectorised call instead of a Python loop per row.
y_pred = np.argmax(predictions, axis=1)
y_test_a = np.argmax(y_test, axis=1)

# Some classes are never predicted on this small test split, so their
# precision is undefined. zero_division=0 yields the same score as the
# default (which also counts them as 0) but without the warning.
print(accuracy_score(y_test_a, y_pred))
print(precision_score(y_test_a, y_pred, average="weighted", zero_division=0))
print(f1_score(y_test_a, y_pred, average="weighted", zero_division=0))

0.609375
0.7238219246031746
0.6323128883500401


  _warn_prf(average, modifier, msg_start, len(result))


In [256]:
# Featurize a single unseen word with the fitted vectorizer and classify it.
word_vect = vectorizer.transform(["addr"]).toarray()
prediction = model.predict(word_vect)

# Only one sample was passed in, so the first row holds its class scores.
class_probs = prediction[0]
print(l_encoder.classes_[class_probs.argmax()])
list(zip(l_encoder.classes_, class_probs))

unknown


[('address', 0.038460538),
 ('city', 0.05982227),
 ('country', 0.011236495),
 ('email', 0.07905212),
 ('housenumber', 0.049412135),
 ('lat', 0.04225049),
 ('location', 0.113736205),
 ('lon', 0.05541592),
 ('opening_hours', 0.06784123),
 ('phone', 0.07810695),
 ('placename', 0.023935769),
 ('postcode', 0.04726912),
 ('ref', 0.059166174),
 ('state', 0.05742465),
 ('store_url', 0.045563933),
 ('street', 0.036713567),
 ('unknown', 0.13459249)]