In [1]:
# Import Libraries
import re
import numpy as np
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from collections import OrderedDict, Counter
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.svm import LinearSVC
from sklearn.datasets import make_classification
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, roc_auc_score

# Extract Transactions

In [2]:
DREHEM_IDS = 'clean_drehem_ids.txt'
QUEEN_ARCHIVES_IDS = 'queen_archives_pids.txt'
QUEEN_OIP_IDS = 'oip_pids.txt'

labels = {}
labels["domesticated_animal"] = ["ox", "cow", "sheep", "goat", "lamb"] # account for plural
labels["wild_animal"] = ["bear", "gazelle", "mountain"] # account for "mountain animal" and plural
labels["dead_animal"] = ["[die]"] # find "die" before finding domesticated or wild
labels["leather_object"] = ["boots", "sandals"]
labels["precious_object"] = ["copper", "bronze", "silver", "gold"]
labels["wool"] = ["wool"]
labels["queens_archive"] = []

class Transaction:
    def __init__(self, p_id):
        self.p_id = p_id
        self.lines = list()
        self.lemmas = OrderedDict() # Maps Sumerian text to its lemmatized form
        self.label = {} # Maps label to List of defining text
        self.sumerian_lemmas = []
        
    # Create mapping of Sumerian text to its lemmatized form
    def get_lemmatization(self):
        first_line = 0
        for i, s in enumerate(self.lines):
            if s.startswith("1."):
                  first_line = i
                  break
        while first_line < len(self.lines)-1:
            if self.lines[first_line] and self.lines[first_line][0].isnumeric() and self.lines[first_line+1].startswith("#lem"):
                self.lemmas[self.lines[first_line]] = self.lines[first_line+1]
                first_line += 2
            else:
                first_line += 1
                
        return self.lemmas
    
    # Get Sumerian lemmatized text only
    def get_sumerian_lemma(self):
        #print(item.sumerian_lemmas)
        item.sumerian_lemmas = []
        for k, v in self.lemmas.items():
            #print(v)
            result = re.findall(" .*\[[a-z]+\]", v)
            if len(result) == 0:
                continue
            lemmas = [s[:s.index("[")].strip() for s in result[0].split(";") if re.search("\[", s)]
            self.sumerian_lemmas += lemmas
        return self.sumerian_lemmas
    
    # Find the most likely label
    def set_label(self):
        def find_label(label, line, found) :
            for val in labels[label]:
                if val in line: 
                    if label in found.keys():
                        found[label].append(line)
                    else:
                        found[label] = [line]
                    return True
        found = {}
        for line in self.lines:
            label = None
            # Priority 1: Check for dead animal
            if find_label("dead_animal", line, found): continue
            # Priority 2: Check for wild animal
            if find_label("wild_animal", line, found): continue
            # Priority 3: Check for domesticated animal
            if find_label("domesticated_animal", line, found): continue
            # Priority 4: Check leather, wool, or precious object
            if find_label("leather_object", line, found): continue
            if find_label("precious_object", line, found): continue
            if find_label("wool", line, found): break
        # If none match, label as "Unknown"
        if len(found.keys()) == 0:
            found["Unknown"] = [self.lines]
        self.label = found
        return found
            
    
# Read ORACC files to find transactions with p_ids in `ids`
def read_files(subdir, ids, reverse=False):
    transactions = list()
    for i in range(1, 16):
        file_name = ""
        if i < 10:
            file_name += subdir + "p00" + str(i) + ".atf"
        else:
            file_name += subdir + "p0" + str(i) + ".atf"
        
        curr_transaction = None
        
        with open(file_name, encoding="utf8") as file:
            print("Opening:", file_name)
            for line in file:
                line = line.strip()
                if line.startswith('&P'):
                    p_id = line.split()[0][1:]
                    if (not reverse and p_id in ids):
                        ids.remove(p_id)
                        if curr_transaction:
                            transactions.append(curr_transaction)
                        transaction = Transaction(p_id)
                        curr_transaction = transaction
                    elif (reverse and p_id not in ids and len(transactions) <= 200):
                        if curr_transaction:
                            transactions.append(curr_transaction)
                        transaction = Transaction(p_id)
                        curr_transaction = transaction
                    else:
                        if curr_transaction:
                            transactions.append(curr_transaction)
                        curr_transaction = None
                else:
                    if curr_transaction:
                        curr_transaction.lines.append(line)
        
        if curr_transaction:
            transactions.append(curr_transaction)
    
    #print(ids)
    #assert len(ids) == 0
    print("Number of transactions:", len(transactions))
    return transactions

# Return the IDs of docs to annotate
def get_drehem_ids(file):
    lst = list()
    with open(file, encoding="utf8") as f:
        for line in f:
            if line.startswith("P"):
                line = line.strip()
                lst.append(line)
    return lst

In [3]:
list_drehem_ids = get_drehem_ids(DREHEM_IDS)
list_queen_ids = get_drehem_ids(QUEEN_ARCHIVES_IDS)
list_oip_queen_ids = get_drehem_ids(QUEEN_OIP_IDS)
#complete_list = list_drehem_ids + list_queen_ids + list_oip_queen_ids
# list_more_data = get_drehem_ids("more_training_data.txt")
# more_training_data = read_files("raw-data/", list_more_data)
# text = []
# with open('more_training_data2.txt', 'w', encoding="utf8") as f:
#     for item in more_training_data:
#         f.write(item.p_id+"\n")
#         item.get_lemmatization()
#         for i in item.lemmas.keys():
#             f.write(i+"\n")
#         f.write("\n")
#all_transactions = read_files("raw-data/", complete_list)
non_queen_list = read_files("raw-data/", list_drehem_ids)
queen_training_list = read_files("raw-data/", list_queen_ids)
queen_test_set = read_files("raw-data/", list_oip_queen_ids)
#more_training_data = read_files("raw-data/", complete_list, True)

Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf
Opening: raw-data/p006.atf
Opening: raw-data/p007.atf
Opening: raw-data/p008.atf
Opening: raw-data/p009.atf
Opening: raw-data/p010.atf
Opening: raw-data/p011.atf
Opening: raw-data/p012.atf
Opening: raw-data/p013.atf
Opening: raw-data/p014.atf
Opening: raw-data/p015.atf
Number of transactions: 429
Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf
Opening: raw-data/p006.atf
Opening: raw-data/p007.atf
Opening: raw-data/p008.atf
Opening: raw-data/p009.atf
Opening: raw-data/p010.atf
Opening: raw-data/p011.atf
Opening: raw-data/p012.atf
Opening: raw-data/p013.atf
Opening: raw-data/p014.atf
Opening: raw-data/p015.atf
Number of transactions: 275
Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf

In [7]:
# Populate training and data set

for item in queen_training_list:
    item.get_lemmatization()
    item.set_label()
    
for item in non_queen_list:
    item.get_lemmatization()
    item.set_label()
    
for item in queen_test_set:
    item.get_lemmatization()
    item.set_label()
    
            
training_data = []
training_labels = []
test_data = []
test_labels = []

for item in queen_training_list[:175]:
    training_data.append(" ".join(item.get_sumerian_lemma()))
    training_labels.append("queen")
    
for i in range(len(non_queen_list)):
    if i < 350:
        training_data.append(" ".join(non_queen_list[i].get_sumerian_lemma()))
        training_labels.append("not queen")
    else:
        test_data.append(" ".join(non_queen_list[i].get_sumerian_lemma()))
        test_labels.append("not queen")
        
for item in queen_test_set:
    test_data.append(" ".join(item.get_sumerian_lemma()))
    test_labels.append("queen")
    
for item in queen_training_list[175:]:
    test_data.append(" ".join(item.get_sumerian_lemma()))
    test_labels.append("queen")

print(len(training_data))
print(len(training_labels))
print(len(test_data))
print(len(test_labels))

#print(training_data)
# print(test_labels)

525
525
299
299


# Multinomial Naive Bayes Classifer
For classifying queen's archives transactions


<b>Accuracy</b>: 
(# true positives + # true negatives) / total #<br><br>
<b>Recall</b>:
true positives / (true positives + false positives) <br>
High recall means that an algorithm returned most of the relevant results <br><br>
<b>Precision</b>:
true positives / (true positives + false negatives) <br>
High precision means that an algorithm returned substantially more relevant results than irrelevant ones

In [11]:
# Bag of Words model
count_vect = CountVectorizer(analyzer = "word",
                                          tokenizer = None,    
                                          preprocessor = None,
                                          ngram_range = (1, 1),
                                          binary = False,
                                          strip_accents='unicode')

print(training_data[0:5])

X_train_counts = count_vect.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.
print(X_train_counts.shape)
print(X_train_counts)

# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)
# print(X_train_tfidf)

# Classifier
bag_of_words_classifier = MultinomialNB().fit(X_train_tfidf, training_labels)

# Predict
X_new_counts = count_vect.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = bag_of_words_classifier.predict(X_new_tfidf)

# for doc, category in zip(docs_new, predicted):
#     print('%r => %s' % (doc, category))
# print(predicted)
    
print("Accuracy: ", np.mean(predicted == test_labels))
print("Recall: ", str(metrics.recall_score(test_labels, predicted, ["queen", "not queen"], average="macro")))
print("Precision: ", str(metrics.precision_score(test_labels, predicted, ["queen", "not queen"], average="macro")))

['udunita u kir ur itud uzud kir ur udunita mašgal itud akiti sadug u mu e du', 'gud udu u mu.DU kurušda dab itud mu us hulu', 'sila maš sila mu.DU dab itud mu hulu', 'udu maš mu.DU dab itud mu hulu', 'udu niga sadug kag eš udu niga sadug kag ŋipar udu u itud ud zal ziga šag itud mu us hulu']
(525, 365)
  (0, 54)	1
  (0, 209)	1
  (0, 254)	1
  (0, 7)	1
  (0, 204)	1
  (0, 339)	1
  (0, 146)	2
  (0, 326)	2
  (0, 161)	2
  (0, 316)	2
  (1, 127)	1
  (1, 330)	1
  (1, 40)	1
  (1, 181)	1
  (1, 315)	1
  (1, 111)	1
  (1, 54)	1
  (1, 209)	2
  (1, 146)	1
  (2, 200)	1
  (2, 277)	2
  (2, 127)	1
  (2, 40)	1
  (2, 54)	1
  (2, 209)	2
  :	:
  (523, 155)	4
  (523, 313)	4
  (523, 226)	12
  (523, 40)	4
  (523, 315)	4
  (523, 209)	4
  (523, 204)	4
  (523, 146)	4
  (524, 305)	4
  (524, 199)	4
  (524, 98)	4
  (524, 74)	4
  (524, 286)	4
  (524, 187)	20
  (524, 131)	4
  (524, 75)	4
  (524, 289)	4
  (524, 363)	4
  (524, 205)	4
  (524, 155)	4
  (524, 255)	4
  (524, 313)	4
  (524, 315)	28
  (524, 209)	8
  (524, 146)

In [16]:
# Bigram Model
bigram_vectorizer = CountVectorizer(analyzer = "word",
                                    tokenizer = None,
                                    preprocessor = None,
                                    ngram_range = (2, 2),
                                    strip_accents='unicode')

# Train
X_train_counts = bigram_vectorizer.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.
print(X_train_counts.shape)

# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# Classifier
bigram_classifier = MultinomialNB().fit(X_train_tfidf, training_labels)

# Predict
X_new_counts = bigram_vectorizer.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

bigram_multinomial_nb_prediction = bigram_classifier.predict(X_new_tfidf)

print("Accuracy: ", np.mean(bigram_multinomial_nb_prediction == test_labels))
print("Recall: ", str(metrics.recall_score(test_labels, bigram_multinomial_nb_prediction, ["queen", "not queen"], average="macro")))
print("Precision: ", str(metrics.precision_score(test_labels, bigram_multinomial_nb_prediction, ["queen", "not queen"], average="macro")))

(475, 2049)
(475, 2049)
Accuracy:  0.813753581662
Recall:  0.85807860262
Precision:  0.824324324324


In [18]:
# Trigram Model
trigram_vectorizer = CountVectorizer(analyzer = "word",
                                    tokenizer = None,
                                    preprocessor = None,
                                    ngram_range = (3, 3),
                                    strip_accents='unicode')

# Train
X_train_counts = trigram_vectorizer.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.
print(X_train_counts.shape)

# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# Classifier
trigram_classifier = MultinomialNB().fit(X_train_tfidf, training_labels)

# Predict
X_new_counts = trigram_vectorizer.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

trigram_prediction = trigram_classifier.predict(X_new_tfidf)

print("Accuracy: ", np.mean(trigram_prediction == test_labels))
print("Recall: ", str(metrics.recall_score(test_labels, trigram_prediction, ["queen", "not queen"], average="macro")))
print("Precision: ", str(metrics.precision_score(test_labels, trigram_prediction, ["queen", "not queen"], average="macro")))

(475, 3829)
(475, 3829)
Accuracy:  0.776504297994
Recall:  0.827711062591
Precision:  0.800303454715


In [78]:
# Unigram and Bigram Model
uni_and_bigram_vectorizer = CountVectorizer(analyzer = "word",
                                            tokenizer = None,
                                            preprocessor = None,
                                            binary = False,
                                            ngram_range = (1,2),
                                            strip_accents='unicode')

# Train
X_train_counts = uni_and_bigram_vectorizer.fit_transform(training_data) # Learn the vocabulary dictionary and return term-document matrix.
print(X_train_counts.shape)

# Get TF-IDF
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# Classifier
uni_and_bigram_classifier = MultinomialNB(0.5).fit(X_train_tfidf, training_labels)

# Predict
X_new_counts = uni_and_bigram_vectorizer.transform(test_data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

uni_and_bigram_prediction = uni_and_bigram_classifier.predict(X_new_tfidf)

print("Accuracy: ", np.mean(uni_and_bigram_prediction == test_labels))
print("Recall: ", str(metrics.recall_score(test_labels, uni_and_bigram_prediction, ["queen", "not queen"], average="macro")))
print("Precision: ", str(metrics.precision_score(test_labels, uni_and_bigram_prediction, ["queen", "not queen"], average="macro")))

(525, 2739)
(525, 2739)
Accuracy:  0.90635451505
Recall:  0.928250863061
Precision:  0.868684366951


# Clustering

GOAL: find more commodity labels in the set of non-queen data

DBSCAN Model:
Density-Based Spatial Clustering of Applications with Noise. Finds core samples of high density and expands clusters from them. Good for data which contains clusters of similar density

In [15]:
# Build DBSCAN model with Tf-idf vectorizer
tfidfvec = TfidfVectorizer(ngram_range=(1,2), min_df = 0.0, max_df = 1.0, decode_error = "ignore")

# Run DBSCAN
trans_list = []
trans_dict = {}

for trans in non_queen_list:
    trans_dict[" ".join([lemma[2:] for lemma in trans.lemmas])] = trans
    trans_list.append(" ".join([lemma[2:] for lemma in trans.lemmas]))
    
X1 = tfidfvec.fit_transform(trans_list).toarray()
# (1,2),1.0 = 9, (1,2),1.1 = 6, (1,3),1.2 = 7
db1 = DBSCAN(eps=1.0, min_samples=len(trans_list)/100).fit(X1)  # Higher eps => More leniency to be same cluster
core_samples_mask = np.zeros_like(db1.labels_, dtype=bool)
core_samples_mask[db1.core_sample_indices_] = True

labels1 = db1.labels_
n_clusters_ = len(set(labels1)) - (1 if -1 in labels1 else 0) # Number of clusters in labels
print('Estimated number of clusters: %d' % n_clusters_)

Estimated number of clusters: 10


In [28]:
# Print clusters
clusters = {}
for c, i in enumerate(labels1):
    if i == -1:
        continue
    elif i in clusters:
        clusters[i].append(trans_list[c] )
    else:
        clusters[i] = [trans_list[c]]

i= 0
for c in clusters:
    print("Cluster", i)
    print("=========","\n")
    for trans in clusters[c]:
        print(trans_dict[trans].p_id)
        print(trans)
    print()
    i += 1
    
# Brief analysis: number of clusters depends on eps and ngram range

Cluster 0

P100041
 6(diš) udu  kišib₃ lu₂-{d}suen  ki ab-ba-kal-la-ta  ba-zi#  {d}šu-{d}suen  lugal kal-ga  lugal uri₅{ki}-ma  lugal an ub-da limmu₂-ba  ur-ku₃-nun-na#  dub-sar#  dumu [...]  ARAD₂-[zu]
P100189
 2(diš) udu niga  1(diš) sila₄ ga  ba-uš₂  u₄ 5(diš)-kam  ki lu₂-dingir-ra-ta  ur-nigar{gar}  šu ba-ti  iti šu-eš₅-ša  mu ki-maš{ki} u₃ hu-ur₅-ti{ki} ba-hul
P100190
 1(diš) sila₄  1(diš) sila₄ {d}nin-lil₂  mu-kuₓ(DU) nu-i₃-da  zabar-dab₅ maškim  5(diš) u₈ 2(diš) udu 1(diš) maš₂  ba-uš₂ e₂-kišib₃-ba-še₃  [u₄] 3(u) [la₂ 1(diš)?]-kam  ki na-sa₆-ta [ba]-zi  iti ezem-{d}šul-gi  mu us₂-sa ki-maš{ki} ba-hul
P100191
 1(diš) maš₂-gal niga 4(diš)? udu  ba-uš₂  u₄ 2(u) 4(diš)-kam  ša₃ unu{ki}-ga  ki lu₂-dingir-ra-ta  ur-nigar{gar}  šu ba-ti  iti u₅-bi₂-gu₇  mu# [{d}]amar#-{d}suen lugal
P100211
 1(diš) udu a? x saga?  1(diš) udu niga  1(diš) maš₂-gal niga  1(diš) udu  2(diš) sila₄ ga  1(diš) kir₁₁ ga  ba-uš₂  u₄ 1(u) 1(diš)-kam  ki lu₂-dingir-ra-ta  ur-nigar{gar}  šu ba-ti  iti še-sag₁₁-ku₅

# Predict

In [19]:
# Read all files to get all Drehem transactions
def read_files(subdir, ids, reverse=False):
    transactions = list()
    for i in range(1, 16):
        file_name = ""
        if i < 10:
            file_name += subdir + "p00" + str(i) + ".atf"
        else:
            file_name += subdir + "p0" + str(i) + ".atf"
        
        curr_transaction = None
        
        with open(file_name, encoding="utf8") as file:
            print("Opening:", file_name)
            for line in file:
                line = line.strip()
                if line.startswith('&P'):
                    p_id = line.split()[0][1:]
                    #print(p_id)
                    if (not reverse and p_id in ids):
                        ids.remove(p_id)
                        if curr_transaction:
                            transactions.append(curr_transaction)
                        transaction = Transaction(p_id)
                        curr_transaction = transaction
                    elif (reverse and p_id not in ids and len(transactions) <= 200):
                        if curr_transaction:
                            transactions.append(curr_transaction)
                        transaction = Transaction(p_id)
                        curr_transaction = transaction
                    else:
                        if curr_transaction:
                            transactions.append(curr_transaction)
                        curr_transaction = None
                else:
                    if curr_transaction:
                        curr_transaction.lines.append(line)
        
        if curr_transaction:
            transactions.append(curr_transaction)
    
    #print(ids)
    #assert len(ids) == 0
    print("Number of transactions:", len(transactions))
    return transactions

# Return the IDs of docs to annotate
def get_drehem_ids(file):
    lst = list()
    with open(file, encoding="utf8") as f:
        for line in f:
            line = line.strip()
            lst.append("P" + line)
    return lst

In [20]:
all_ids = get_drehem_ids("drehem_p_ids.txt")
print(all_ids[:10])
all_transactions = read_files("raw-data/", all_ids)

['P125693', 'P131063', 'P103742', 'P118642', 'P337724', 'P212008', 'P103154', 'P105823', 'P390986', 'P115492']
Opening: raw-data/p001.atf
Opening: raw-data/p002.atf
Opening: raw-data/p003.atf
Opening: raw-data/p004.atf
Opening: raw-data/p005.atf
Opening: raw-data/p006.atf
Opening: raw-data/p007.atf
Opening: raw-data/p008.atf
Opening: raw-data/p009.atf
Opening: raw-data/p010.atf
Opening: raw-data/p011.atf
Opening: raw-data/p012.atf
Opening: raw-data/p013.atf
Opening: raw-data/p014.atf
Opening: raw-data/p015.atf
Number of transactions: 14594


In [28]:
data = []
mapping = {}

for t in all_transactions:
    t.get_lemmatization()
    lemma = " ".join(t.get_sumerian_lemma())
    data.append(lemma)
    mapping[lemma] = t
    
# Predict
X_new_counts = uni_and_bigram_vectorizer.transform(data)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

uni_and_bigram_prediction = uni_and_bigram_classifier.predict(X_new_tfidf)

In [70]:
for doc, category in zip(data[:100], uni_and_bigram_prediction[:100]):
    print('%r => %s' % (doc, category))

'udu kišib ki lugal kalag lugal lugal an anubda limmu dubsar dumu arad udu kišib ki lugal kalag lugal lugal an anubda limmu dubsar dumu arad udu kišib ki lugal kalag lugal lugal an anubda limmu dubsar dumu arad udu kišib ki lugal kalag lugal lugal an anubda limmu dubsar dumu arad' => not queen
'udu niga sila ga uš ud ki šu teŋ itud mu u hulu udu niga sila ga uš ud ki šu teŋ itud mu u hulu udu niga sila ga uš ud ki šu teŋ itud mu u hulu udu niga sila ga uš ud ki šu teŋ itud mu u hulu' => not queen
'sila sila mu.DU zabardab maškim u udu maš uš ekišibak ud lal ki itud mu us hulu sila sila mu.DU zabardab maškim u udu maš uš ekišibak ud lal ki itud mu us hulu sila sila mu.DU zabardab maškim u udu maš uš ekišibak ud lal ki itud mu us hulu sila sila mu.DU zabardab maškim u udu maš uš ekišibak ud lal ki itud mu us hulu' => not queen
'mašgal niga udu uš ud šag ki šu teŋ itud mu lugal mašgal niga udu uš ud šag ki šu teŋ itud mu lugal mašgal niga udu uš ud šag ki šu teŋ itud mu lugal mašgal niga 

In [29]:
# Percentange of Queen transactions
print(len([i for i in uni_and_bigram_prediction if i == "queen"])/len(uni_and_bigram_prediction))

0.029532684664930794


In [73]:
queens = [data[i] for i in range(len(uni_and_bigram_prediction)) if uni_and_bigram_prediction[i] == "queen"]

with open("predicted_queen.txt", 'w', encoding="utf8") as f:
    for q in queens:
        qu = mapping[q]
        f.write(qu.p_id+"\n")
        for line in qu.lemmas.keys():
            f.write(line+"\n")
        f.write("\n")

# Support Vector Machine
Good for classification and when you have small datasets (<1000 points)

In [79]:
from sklearn.linear_model import SGDClassifier

text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42))])

text_clf.fit(training_data, training_labels)
predicted = text_clf.predict(test_data)
np.mean(predicted == test_labels)

0.90301003344481601

In [80]:
predict_v2 = text_clf.predict(data)

In [81]:
print(len([i for i in predict_v2 if i == 'queen'])/len(predict_v2))
print(predict_v2[:10])
#print(all_labels)

0.07503083458955735
['not queen' 'not queen' 'not queen' 'not queen' 'not queen' 'not queen'
 'not queen' 'not queen' 'not queen' 'not queen']
