In [3]:
import os, re

import pandas as pd
from pandas import DataFrame, Series

In [4]:
# Resolve the current user's home directory so the paths below are not tied to one account.
hdir = os.path.expanduser('~')

# Pahlavi corpus directory (note the trailing slash, which is part of the value).
pah_path = f"{hdir}/Dropbox/Active_Directories/Digital_Humanities/Corpora/pahlavi_corpus/"

# Directory of the pickled / tokenized / cleaned corpora; the corpus CSV is read from here.
pickle_path = f"{hdir}/Dropbox/Active_Directories/Digital_Humanities/Corpora/pickled_tokenized_cleaned_corpora"

In [5]:
# Load the corpus table from the CSV kept in the pickle directory.
df_pahcorp = pd.read_csv(
    os.path.join(pickle_path, 'pahlavi_corpus.csv')
)

In [21]:
# Spot-check five randomly chosen rows; no random_state is passed, so the rows differ on every run.
df_pahcorp.sample(5)

Unnamed: 0,title,line,index,token
319335,PT 04,110,18,hazār
43765,Dēnkard10,3.112.8,16,abzār
53381,Dēnkard10,3.154.4,6,ayāsag
198776,Wizirgerd ī Dēnīg,11.5,7,ud
376762,BUNDAHIŠN,27.98,3,kār


In [33]:
tok = 'hazār'

# Keep only the rows whose token is exactly `tok` and record, for each hit, the
# position (within the same title and line) where the following token would sit.
# That position is then looked up in the full corpus keyed by (title, line, index);
# the join is a left join, so a hit with no successor gets NaN in `token_next`.
sel = (
    df_pahcorp.loc[df_pahcorp['token'] == tok]
    .assign(index_next=lambda hits: hits['index'] + 1)
    .join(
        df_pahcorp.set_index(['title', 'line', 'index'])['token'].rename('token_next'),
        on=['title', 'line', 'index_next'],
    )
)

# Per title: how often each token directly follows `tok`.
sel.groupby('title')['token_next'].value_counts()
# Without the per-title breakdown: sel['token_next'].value_counts()


title                              token_next
ARDĀ WIRĀZ                       wāz           1
BUNDAHIŠN                         100           1
                                   sardag        1
                                   srūw          1
                                   sāl           1
Dādestān ī Mēnōy Xrad         sāl           1
Dēnkard 1                         (?)           1
                                   mard          1
Dēnkard 3                         tā            1
                                   ēk-iz-ēw      1
Dēnkard 6                         gām           1
Dēnkard 8                         būd           1
Kār                               ārāst         1
Mādayān ī hazār dādestān     dinār         1
                                   dādestān      1
Mādayān ī hazār dādestān II  dādestān      1
PT 02                              ud            1
PT 04                              drahm         1
                                   k

----

## Key Word in Context

In [7]:
def index_kwic (term, corpus=None):
    """Return the corpus rows whose token matches ``term``.

    ``term`` is passed to ``Series.str.match``: ``.str`` is the pandas accessor
    that applies string methods element-wise, and ``.match`` treats ``term`` as
    a regular expression that must match from the START of each token
    (``re.match`` semantics). ``'ham'`` therefore also selects ``'hamon'``;
    end the pattern with ``$`` to require a whole-token match.

    Parameters
    ----------
    term : str
        Regular expression tested against every token.
    corpus : pandas.DataFrame, optional
        Frame with a ``token`` column to search. Defaults to the notebook-level
        ``df_pahcorp``.

    Returns
    -------
    pandas.DataFrame
        The matching rows, keeping their original index labels.
    """
    if corpus is None:
        corpus = df_pahcorp

    # na=False: for a missing token str.match yields NaN, and a mask containing
    # NaN cannot be used for boolean indexing. Count missing tokens as non-matches.
    hit_mask = corpus['token'].str.match(term, na=False)
    return corpus[hit_mask]
    
    
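# TODO: add regex functionality (note: Series.str.match already interprets the term as a regular expression, anchored at the start of the token)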

In [8]:
def kwic_pah (term):
    """Print each hit for ``term`` together with the full line it occurs in.

    For every row returned by ``index_kwic(term)``, the tokens sharing the
    hit's title and line are put in ``index`` order, joined with single spaces
    and printed under a ``title: line`` heading. A line containing several
    matching tokens is printed once per match. Nothing is returned.
    """
    # iterrows() yields (index label, row) pairs; only the row is needed here.
    for _, hit in index_kwic(term).iterrows():
        title, line = hit["title"], hit["line"]

        # Boolean mask over the whole corpus that selects the hit's line.
        same_line = (df_pahcorp['title'] == title) & (df_pahcorp['line'] == line)

        # Sort explicitly instead of relying on the stored row order.
        ordered = df_pahcorp.loc[same_line].sort_values("index")
        text = " ".join(ordered["token"])

        print(f'{title}: {line}\n{text}\n')

        # TODO: colour-code the matched term in the output (e.g. with the
        # termcolor package, which has to be installed separately).



        
# iterrows(): iterates over a DataFrame row by row, yielding (index label, row) pairs in which each row is a pandas Series keyed by column name
        

In [9]:
#df_pahcorp["token"] == "afsōn"

In [37]:
#kwic_pah ("hām.*$")

In [11]:
# Conditional frequency

# Two problems: (1) find where the words are; (2) find which words are next to a particular location.

# Collect the tokens at positions i-1 and i+1. Counter is a class in the collections module that takes a list (or any iterable) and turns it into a frequency dictionary.
## A column is technically a Series, which can work like a list.

In [12]:
# Index labels of every corpus row whose token matches 'hāmōn' (matched from the token start).
indexed = list(index_kwic('hāmōn').index)
length = len(indexed)

# One (previous, hit, next) triple of index labels per hit.
cd_index = [(hit - 1, hit, hit + 1) for hit in indexed]
    
    
    
# Next step: use these triples (with collections.Counter?) to count the frequency of the terms immediately before and after each hit, i.e. the rows at positions 0 and 2 of each tuple

In [13]:
# Show the (previous, hit, next) index-label triples.
cd_index

[(233989, 233990, 233991),
 (240938, 240939, 240940),
 (430296, 430297, 430298),
 (435883, 435884, 435885),
 (510282, 510283, 510284),
 (529918, 529919, 529920)]

In [14]:
#for x in cd_index:
#    print (cd_index[x][1])

In [15]:
# Middle element of the second triple: the index label of the second hit itself.
cd_index[1][1]

240939

In [16]:
# Row at position 240938, the first element of cd_index[1] (the row just before the second hit).
# NOTE(review): iloc is positional while cd_index holds index labels; the two only
# coincide while df_pahcorp keeps its default 0..n-1 index — confirm.
df_pahcorp.iloc[240938]

title    Pahlavi Rivāyat
line                31c.7
index                  18
token                  ka
Name: 240938, dtype: object

## Frequency

In [17]:
# Token frequency table: one row per distinct token, most frequent first.
# The top-level pd.value_counts() is deprecated, and newer pandas releases name
# the result of Series.value_counts() differently ('count', indexed by 'token'),
# which would change the column names after reset_index(). Naming the axis and
# the counts explicitly keeps the columns the later cells rely on:
#   'index' -> the token string, 'token' -> how many times it occurs.
freq_dic = (
    df_pahcorp['token']
    .value_counts()
    .rename_axis('index')
    .rename('token')
    .reset_index()
)

In [18]:
# Spot-check five random rows of the frequency table; no random_state is passed, so the rows differ on every run.
freq_dic.sample(5)

Unnamed: 0,index,token
41966,yezī,1
12933,zanīhist,2
9685,weh-gōwišnīh,2
11635,dahēd},2
11392,gandīh,2


In [19]:
# Compiled pattern: "sar", then any single character, then "y" (matches e.g. "sarāy").
search_term = re.compile(r"sar.y")

In [20]:
# Flag the frequency-table rows whose token contains the pattern anywhere;
# na=False counts a missing token as "no match" so the mask stays boolean.
query_mask = freq_dic["index"].str.contains(search_term, na=False)

# Keep the flagged rows and preview the first five.
query = freq_dic.loc[query_mask]
query.head()

# TODO: turn this into a function that just returns the top hits

Unnamed: 0,index,token
7461,čang-sarāy,3
10421,win-sarāy,2
15553,nāy-sarāy,1
30487,(sarāyspar-wāzīg,1
31068,sarāy,1
