In [1]:
import numpy as np
import pandas as pd
import re

from collections import Counter, OrderedDict
from functools import reduce

from sklearn.linear_model import LogisticRegression

In [2]:
labels = {}
labels['domesticated_animal'] = ['ox', 'cow', 'sheep', 'goat', 'lamb', '~sheep', 'equid'] # account for plural
labels['wild_animal'] = ['bear', 'gazelle', 'mountain'] # account for 'mountain animal' and plural
labels['dead_animal'] = ['die'] # find 'die' before finding domesticated or wild
labels['leather_object'] = ['boots', 'sandals']
labels['precious_object'] = ['copper', 'bronze', 'silver', 'gold']
labels['wool'] = ['wool', '~wool']
# labels['queens_archive'] = []

In [3]:
data = pd.read_csv('../bdtns_raw_data/filtered.csv', index_col=0)
data.loc[:, 'pn'] = data.loc[:, 'id_text'].str[-6:].astype(int)
data = data.set_index(['pn', 'id_line']).sort_index()
extracted = data.loc[:, 'lemma'].str.extract(r'(\S+)\[(.*)\](\S+)')
data = pd.concat([data, extracted], axis=1)
data = data.dropna()
data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lemma,id_text,id_word,label,0,1,2
pn,id_line,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
100041,3,6(diš)[]NU,epsd2/admin/u3adm/P100041,P100041.3.1,o 1,6(diš),,NU
100041,3,udu[sheep]N,epsd2/admin/u3adm/P100041,P100041.3.2,o 1,udu,sheep,N
100041,4,kišib[seal]N,epsd2/admin/u3adm/P100041,P100041.4.1,o 2,kišib,seal,N
100041,4,lu₂-{d}suen[]PN,epsd2/admin/u3adm/P100041,P100041.4.2,o 2,lu₂-{d}suen,,PN
100041,5,ki[place]N,epsd2/admin/u3adm/P100041,P100041.5.1,o 3,ki,place,N


In [4]:
for archive in labels.keys():
    data.loc[data.loc[:, 1].str.contains('|'.join([re.escape(x) for x in labels[archive]])), 'archive'] = archive

data.loc[:, 'archive'] = data.loc[:, 'archive'].fillna('')

data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,lemma,id_text,id_word,label,0,1,2,archive
pn,id_line,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
100041,3,6(diš)[]NU,epsd2/admin/u3adm/P100041,P100041.3.1,o 1,6(diš),,NU,
100041,3,udu[sheep]N,epsd2/admin/u3adm/P100041,P100041.3.2,o 1,udu,sheep,N,domesticated_animal
100041,4,kišib[seal]N,epsd2/admin/u3adm/P100041,P100041.4.1,o 2,kišib,seal,N,
100041,4,lu₂-{d}suen[]PN,epsd2/admin/u3adm/P100041,P100041.4.2,o 2,lu₂-{d}suen,,PN,
100041,5,ki[place]N,epsd2/admin/u3adm/P100041,P100041.5.1,o 3,ki,place,N,


In [5]:
def get_set(df):
    
    d = {}

    seals = df[df['label'].str.contains('seal')]
    df = df[~df['label'].str.contains('seal')]

    for x in df[2].unique():
        d[x] = set(df.loc[df[2] == x, 0])

    d['SEALS'] = {}
    for x in seals[2].unique():
        d['SEALS'][x] = set(seals.loc[seals[2] == x, 0])

    return d

In [6]:
get_set(data.loc[100041])

{'NU': {'6(diš)'},
 'N': {'ki', 'kišib', 'udu'},
 'PN': {'ab-ba-kal-la-ta', 'lu₂-{d}suen'},
 'V/i': {'zig'},
 'SEALS': {'TN': {'{d}šu-{d}suen'},
  'N': {'an', 'anubda', 'arad', 'dubsar', 'dumu', 'lugal'},
  'V/i': {'kalag'},
  'GN': {'uri₅{ki}-ma'},
  'NU': {'limmu'},
  'PN': {'ur-ku₃-nun-na'},
  'NA': {'x'}}}

In [7]:
archives = pd.DataFrame(data.groupby('pn').apply(lambda x: set(x['archive'].unique()) - set(['']))).rename(columns={0: 'archive'})
archives.loc[:, 'set'] = data.reset_index().groupby('pn').apply(get_set)
archives.loc[:, 'archive'] = archives.loc[:, 'archive'].apply(lambda x: {'dead_animal'} if 'dead_animal' in x else x)
archives.head()

Unnamed: 0_level_0,archive,set
pn,Unnamed: 1_level_1,Unnamed: 2_level_1
100041,{domesticated_animal},"{'NU': {'6(diš)'}, 'N': {'kišib', 'udu', 'ki'}..."
100189,{dead_animal},"{'NU': {'1(diš)', '2(diš)', '5(diš)-kam'}, 'N'..."
100190,{dead_animal},"{'NU': {'2(diš)', '3(u)', '5(diš)', '1(diš)', ..."
100191,{dead_animal},"{'NU': {'1(diš)', '2(u)', '4(diš)', '4(diš)-ka..."
100211,{dead_animal},"{'NU': {'1(diš)', '1(diš)-kam', '2(diš)', '1(u..."


In [8]:
def get_line(d, pos='N'):

    if 'N' in d:
        return pd.DataFrame({x: [1] for x in d[pos]})
    else:
        return pd.DataFrame()

In [9]:
sparse = pd.concat(archives.loc[:, 'set'].apply(get_line, pos='N').values, ignore_index=True)
sparse = sparse.fillna(0)
sparse = sparse.join(archives.loc[:, 'archive'].reset_index()).set_index('pn')
sparse.loc[sparse.loc[:, 'archive'].apply(lambda x: 'domesticated_animal' in x), 'domesticated_animal'] = 1
sparse.loc[:, 'domesticated_animal'] = sparse.loc[:, 'domesticated_animal'].fillna(0)
sparse.head()

Unnamed: 0_level_0,kišib,udu,ki,sila,ud,itud,ga,šu,mu,mu.DU,...,ekaskalak,usaŋ,siʾilla,egizid,gara,ŋa,enkud,in,archive,domesticated_animal
pn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100041,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,{domesticated_animal},1.0
100189,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,{dead_animal},0.0
100190,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,{dead_animal},0.0
100191,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,{dead_animal},0.0
100211,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,{dead_animal},0.0


In [10]:
known = sparse.loc[sparse['archive'].apply(len) == 1, :]
unknown = sparse.loc[(sparse['archive'].apply(len) == 0) | (sparse['archive'].apply(len) > 1), :]

In [11]:
clf = LogisticRegression(random_state=42, solver='lbfgs', max_iter=200)
clf.fit(known.loc[:, 'udu':'in'], known.loc[:, 'domesticated_animal'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=200,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [12]:
clf.score(known.loc[:, 'udu':'in'], known.loc[:, 'domesticated_animal'])

0.7482644914960084

In [13]:
unknown.loc[:, 'domesticated_animal'] = pd.Series(clf.predict(unknown.loc[:, 'udu':'in']), index=unknown.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [14]:
archives.loc[100041, 'set']

{'NU': {'6(diš)'},
 'N': {'ki', 'kišib', 'udu'},
 'PN': {'ab-ba-kal-la-ta', 'lu₂-{d}suen'},
 'V/i': {'zig'},
 'SEALS': {'TN': {'{d}šu-{d}suen'},
  'N': {'an', 'anubda', 'arad', 'dubsar', 'dumu', 'lugal'},
  'V/i': {'kalag'},
  'GN': {'uri₅{ki}-ma'},
  'NU': {'limmu'},
  'PN': {'ur-ku₃-nun-na'},
  'NA': {'x'}}}