In [None]:
import numpy as np
import pandas as pd
from google.colab import drive
import scipy.sparse as sp

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import recall_score, precision_score, f1_score

In [None]:
drive.mount('/content/MyDrive')
file = '/content/MyDrive/MyDrive/macula-greek.tsv'

Mounted at /content/MyDrive


In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
data = pd.read_csv(file, sep='\t', keep_default_na=False)

In [None]:
data.head()

Unnamed: 0,xml:id,ref,role,class,type,gloss,text,after,lemma,normalized,strong,morph,person,number,gender,case,tense,voice,mood,degree,domain,ln,frame,subjref,referent
0,n40001001001,MAT 1:1!1,,noun,common,[The] book,Βίβλος,,βίβλος,Βίβλος,976,N-NSF,,singular,feminine,nominative,,,,,033005,33.38,,,
1,n40001001002,MAT 1:1!2,,noun,common,of [the] genealogy,γενέσεως,,γένεσις,γενέσεως,1078,N-GSF,,singular,feminine,genitive,,,,,010002 033003,10.24 33.19,,,
2,n40001001003,MAT 1:1!3,,noun,proper,of Jesus,Ἰησοῦ,,Ἰησοῦς,Ἰησοῦ,2424,N-GSM,,singular,masculine,genitive,,,,,093001,93.169a,,,
3,n40001001004,MAT 1:1!4,,noun,proper,Christ,Χριστοῦ,,Χριστός,Χριστοῦ,5547,N-GSM,,singular,masculine,genitive,,,,,093001,93.387,,,
4,n40001001005,MAT 1:1!5,,noun,common,son,υἱοῦ,,υἱός,υἱοῦ,5207,N-GSM,,singular,masculine,genitive,,,,,010002,10.30,,,


In [None]:
clean_data = data.drop(['xml:id', 'ref', 'role', 'gloss', 'text', 'after', 'strong', 'morph', 'person', 'number', 'gender', 'case', 'tense', 'voice', 'mood', 'degree', 'frame', 'subjref', 'referent'], axis = 1)

In [None]:
clean_data.head()

Unnamed: 0,class,type,lemma,normalized,domain,ln
0,noun,common,βίβλος,Βίβλος,033005,33.38
1,noun,common,γένεσις,γενέσεως,010002 033003,10.24 33.19
2,noun,proper,Ἰησοῦς,Ἰησοῦ,093001,93.169a
3,noun,proper,Χριστός,Χριστοῦ,093001,93.387
4,noun,common,υἱός,υἱοῦ,010002,10.30


In [None]:
vectorizer = CountVectorizer()

# **LEMMAS**

In [None]:
lemma_features = vectorizer.fit_transform(clean_data.lemma)
print(f'shape = {lemma_features.shape}')
print(f'vocabulary:\n {vectorizer.vocabulary_}')
X_train, X_test, y_train, y_test = train_test_split(lemma_features, clean_data.domain,
                                                    test_size = 0.3)
print(f'x_train shape = {X_train.shape}')
print(f'x_test shape = {X_test.shape}')

shape = (92448, 3473)
vocabulary:
 {'βίβλος': 65, 'γένεσις': 161, 'ἰησοῦς': 3152, 'χριστός': 2252, 'υἱός': 2095, 'δαυίδ': 270, 'ἀβραάμ': 2301, 'ἰσαάκ': 3166, 'δέ': 234, 'ἰακώβ': 3140, 'ἰούδας': 3163, 'καί': 614, 'ἀδελφός': 2337, 'αὐτός': 44, 'φάρες': 2099, 'ζάρα': 485, 'ἐκ': 2803, 'θαμάρ': 527, 'ἑσρώμ': 3021, 'ἀράμ': 2564, 'ἀμιναδάβ': 2416, 'ναασσών': 1168, 'σαλμών': 1705, 'βόες': 144, 'ῥαχάβ': 3449, 'ἰωβήδ': 3186, 'ῥούθ': 3457, 'ἰεσσαί': 3149, 'βασιλεύς': 92, 'σολομών': 1785, 'οὐρίας': 1273, 'ῥοβοάμ': 3454, 'ἀβιά': 2297, 'ἀσάφ': 2599, 'ἰωσαφάτ': 3195, 'ἰωράμ': 3190, 'ὀζίας': 3238, 'ἰωαθάμ': 3184, 'ἀχάζ': 2644, 'ἑζεκίας': 2992, 'μανασσῆ': 1044, 'ἀμώς': 2424, 'ἰωσίας': 3194, 'ἰεχονίας': 3151, 'ἐπί': 2900, 'μετοικεσία': 1095, 'βαβυλών': 68, 'μετά': 1089, 'σαλαθιήλ': 1701, 'ζοροβαβέλ': 503, 'ἀβιούδ': 2300, 'ἐλιακίμ': 2841, 'ἀζώρ': 2350, 'σαδώκ': 1698, 'ἀχίμ': 2646, 'ἐλιούδ': 2842, 'ἐλεάζαρ': 2832, 'ματθάν': 1057, 'ἰωσήφ': 3192, 'ἀνήρ': 2443, 'μαρία': 1047, 'ὅς': 3339, 'πᾶς': 1667, 'οὖν': 

In [None]:
lemma_classifier = MultinomialNB()
lemma_classifier.fit(X_train, y_train)

MultinomialNB()

In [None]:
lemma_correct = (y_test == lemma_classifier.predict(X_test)).sum()
print(f'{lemma_correct} correctly classified')
lemma_incorrect = y_test.size - lemma_correct
print(f'{lemma_incorrect} incorrectly classified')
print(f'accuracy = {lemma_classifier.score(X_test, y_test)}')
recall = recall_score(y_test, lemma_classifier.predict(X_test), average='weighted')
print(f'recall = {recall}')
precision = precision_score(y_test, lemma_classifier.predict(X_test), average='weighted')
print(f'precision = {precision}')
f1 = f1_score(y_test, lemma_classifier.predict(X_test), average='weighted')
print(f'f1 = {f1}')

14445 correctly classified
13290 incorrectly classified
accuracy = 0.5208220659816117
recall = 0.5208220659816117


  _warn_prf(average, modifier, msg_start, len(result))


precision = 0.43441438552541295
f1 = 0.42898982251181356


# **CLASS**

In [None]:
class_features = vectorizer.fit_transform(clean_data['class'])
print(f'shape = {class_features.shape}')
print(f'vocabulary:\n {vectorizer.vocabulary_}')
X_train, X_test, y_train, y_test = train_test_split(class_features, clean_data.domain,
                                                    test_size = 0.3)
print(f'x_train shape = {X_train.shape}')
print(f'x_test shape = {X_test.shape}')

shape = (92448, 11)
vocabulary:
 {'noun': 5, 'det': 3, 'conj': 2, 'pron': 8, 'prep': 7, 'adj': 0, 'adv': 1, 'verb': 10, 'ptcl': 9, 'num': 6, 'intj': 4}
x_train shape = (64713, 11)
x_test shape = (27735, 11)


In [None]:
class_classifier = MultinomialNB()
class_classifier.fit(X_train, y_train)

MultinomialNB()

In [None]:
class_correct = (y_test == class_classifier.predict(X_test)).sum()
print(f'{class_correct} correctly classified')
class_incorrect = y_test.size - class_correct
print(f'{class_incorrect} incorrectly classified')
print(f'accuracy = {class_classifier.score(X_test, y_test)}')
recall = recall_score(y_test, class_classifier.predict(X_test), average='weighted')
print(f'recall = {recall}')
precision = precision_score(y_test, class_classifier.predict(X_test), average='weighted')
print(f'precision = {precision}')
f1 = f1_score(y_test, class_classifier.predict(X_test), average='weighted')
print(f'f1 = {f1}')

9955 correctly classified
17780 incorrectly classified
accuracy = 0.35893275644492517
recall = 0.35893275644492517


  _warn_prf(average, modifier, msg_start, len(result))


precision = 0.2299287656169468
f1 = 0.26273616058718846


# **TYPE**

In [None]:
type_features = vectorizer.fit_transform(clean_data['type'])
print(f'shape = {type_features.shape}')
print(f'vocabulary:\n {vectorizer.vocabulary_}')
X_train, X_test, y_train, y_test = train_test_split(type_features, clean_data.domain,
                                                    test_size = 0.3)
print(f'x_train shape = {X_train.shape}')
print(f'x_test shape = {X_test.shape}')

shape = (92448, 9)
vocabulary:
 {'common': 1, 'proper': 7, 'personal': 5, 'relative': 8, 'demonstrative': 2, 'interrogative': 4, 'possessive': 6, 'indefinite': 3, 'adverbial': 0}
x_train shape = (64713, 9)
x_test shape = (27735, 9)


In [None]:
type_classifier = MultinomialNB()
type_classifier.fit(X_train, y_train)

MultinomialNB()

In [None]:
type_correct = (y_test == type_classifier.predict(X_test)).sum()
print(f'{type_correct} correctly classified')
type_incorrect = y_test.size - type_correct
print(f'{type_incorrect} incorrectly classified')
print(f'accuracy = {type_classifier.score(X_test, y_test)}')
recall = recall_score(y_test, type_classifier.predict(X_test), average='weighted')
print(f'recall = {recall}')
precision = precision_score(y_test, type_classifier.predict(X_test), average='weighted')
print(f'precision = {precision}')
f1 = f1_score(y_test, type_classifier.predict(X_test), average='weighted')
print(f'f1 = {f1}')

8390 correctly classified
19345 incorrectly classified
accuracy = 0.3025058590228953
recall = 0.3025058590228953


  _warn_prf(average, modifier, msg_start, len(result))


precision = 0.12461808114223706
f1 = 0.15887760865759407


# **NORMALIZED**

In [None]:
normalized_features = vectorizer.fit_transform(clean_data['normalized'])
print(f'shape = {normalized_features.shape}')
print(f'vocabulary:\n {vectorizer.vocabulary_}')
X_train, X_test, y_train, y_test = train_test_split(normalized_features, clean_data.domain,
                                                    test_size = 0.3)
print(f'x_train shape = {X_train.shape}')
print(f'x_test shape = {X_test.shape}')

shape = (92448, 7521)
vocabulary:
 {'βίβλος': 133, 'γενέσεως': 389, 'ἰησοῦ': 6874, 'χριστοῦ': 4923, 'υἱοῦ': 4596, 'δαυείδ': 582, 'ἀβραάμ': 5033, 'τόν': 4569, 'ἰσαάκ': 6908, 'δέ': 520, 'ἰακώβ': 6843, 'ἰούδαν': 6902, 'καί': 1319, 'τούς': 4477, 'ἀδελφούς': 5130, 'αὐτοῦ': 94, 'φαρές': 4636, 'ζαρά': 1058, 'ἐκ': 6073, 'τῆς': 4585, 'θάμαρ': 1120, 'ἐσρώμ': 6466, 'ἀράμ': 5589, 'ἀμιναδάβ': 5302, 'ναασσών': 2488, 'σαλμών': 3752, 'βόες': 319, 'ῥαχάβ': 7481, 'ἰωβήδ': 6952, 'ῥούθ': 7494, 'ἰεσσαί': 6871, 'βασιλέα': 189, 'σολομῶνα': 3912, 'τοῦ': 4486, 'οὐρίου': 2731, 'ῥοβοάμ': 7488, 'ἀβιά': 5029, 'ἀσάφ': 5681, 'ἰωσαφάτ': 6960, 'ἰωράμ': 6956, 'ὀζείαν': 7056, 'ἰωαθάμ': 6950, 'ἄχαζ': 6007, 'ἐζεκίαν': 6064, 'μανασσῆ': 2193, 'ἀμώς': 5322, 'ἰωσείαν': 6961, 'ἰεχονίαν': 6873, 'ἐπί': 6301, 'μετοικεσίας': 2315, 'βαβυλῶνος': 144, 'μετά': 2305, 'τήν': 4285, 'μετοικεσίαν': 2314, 'σαλαθιήλ': 3748, 'ζοροβαβέλ': 1085, 'ἀβιούδ': 5032, 'ἐλιακείμ': 6176, 'ἀζώρ': 5164, 'σαδώκ': 3745, 'ἀχείμ': 5792, 'ἐλιούδ': 6177, 'ἐλεάζ

In [None]:
normalized_classifier = MultinomialNB()
normalized_classifier.fit(X_train, y_train)

MultinomialNB()

In [None]:
normalized_correct = (y_test == normalized_classifier.predict(X_test)).sum()
print(f'{normalized_correct} correctly classified')
normalized_incorrect = y_test.size - normalized_correct
print(f'{normalized_incorrect} incorrectly classified')
print(f'accuracy = {normalized_classifier.score(X_test, y_test)}')
recall = recall_score(y_test, normalized_classifier.predict(X_test), average='weighted')
print(f'recall = {recall}')
precision = precision_score(y_test, normalized_classifier.predict(X_test), average='weighted')
print(f'precision = {precision}')
f1 = f1_score(y_test, normalized_classifier.predict(X_test), average='weighted')
print(f'f1 = {f1}')

13199 correctly classified
14536 incorrectly classified
accuracy = 0.47589688119704343
recall = 0.47589688119704343


  _warn_prf(average, modifier, msg_start, len(result))


precision = 0.44353752283532133
f1 = 0.37538243250478565


# **COMBINED**

In [None]:
combined_features = sp.hstack([lemma_features, class_features, type_features, normalized_features]
                     , format = 'csr')

In [None]:
X_train, X_test, y_train, y_test = train_test_split(combined_features, clean_data.domain,
                                                    test_size = 0.3)

In [None]:
combined_classifier = MultinomialNB()
combined_classifier.fit(X_train, y_train)

MultinomialNB()

In [None]:
print(f'x_test shape = {X_test.shape}')

x_test shape = (27735, 11014)


In [None]:
combined_correct = (y_test == combined_classifier.predict(X_test)).sum()
print(f'{combined_correct} correctly classified')
combined_incorrect = y_test.size - combined_correct
print(f'{combined_incorrect} incorrectly classified')
print(f'accuracy = {combined_classifier.score(X_test, y_test)}')
recall = recall_score(y_test, combined_classifier.predict(X_test), average='weighted')
print(f'recall = {recall}')
precision = precision_score(y_test, combined_classifier.predict(X_test), average='weighted')
print(f'precision = {precision}')
f1 = f1_score(y_test, combined_classifier.predict(X_test), average='weighted')
print(f'f1 = {f1}')

16529 correctly classified
11206 incorrectly classified
accuracy = 0.5959617811429602
recall = 0.5959617811429602


  _warn_prf(average, modifier, msg_start, len(result))


precision = 0.5487655945380399
f1 = 0.5412370218925308
