In [7]:
import os, re

import pandas as pd
from pandas import DataFrame, Series

In [8]:
# Resolve the current user's home directory; every path below hangs off it.
hdir = os.path.expanduser('~')

# Directory holding the Pahlavi corpus (note the trailing slash).
pah_path = f"{hdir}/Dropbox/Active_Directories/Digital_Humanities/Corpora/pahlavi_corpus/"

# Directory holding the tokenized, cleaned corpus exports read further down.
pickle_path = f"{hdir}/Dropbox/Active_Directories/Digital_Humanities/Corpora/pickled_tokenized_cleaned_corpora"

In [9]:
# Load the corpus table from CSV; later cells read its `title`, `line`,
# `index` and `token` columns.
df_pahcorp = pd.read_csv(
    os.path.join(pickle_path, r'pahlavi_corpus.csv')
)

In [10]:
# Show five randomly chosen rows as a quick look at the loaded corpus.
# No random_state is set, so the rows differ on every run.
df_pahcorp.sample(5)

Unnamed: 0,title,line,index,token
327458,PT12,49,8,ud
40433,Dēnkard10,3.95.3,43,nē
131849,Zādspram,32.1,3,handāzag
229397,Pahlavi Rivāyat,8g.1,43,xwēš
121694,Zādspram,murw ī andarwāy-wāzišn [D7 452.11,12,ī


----

## Key Word in Context

In [11]:
def index_kwic (term, corpus=None):
    
    """Return the rows of the corpus whose token matches ``term``.

    Parameters
    ----------
    term : str
        Regular expression applied with ``Series.str.match``. The match is
        anchored at the start of each token but not at its end, so ``'hām'``
        also selects ``'hāmōn'``; append ``'$'`` to require a whole-token match.
    corpus : pandas.DataFrame, optional
        Frame with a ``token`` column to search. Defaults to the notebook-level
        ``df_pahcorp``.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``corpus``, keeping their original row labels.
    """
    
    if corpus is None:
        corpus = df_pahcorp
    
    # `.str` is the pandas accessor that applies a string method to every
    # element of the column; `.match` is the pandas method that behaves like
    # re.match on each token.
    # na=False treats missing tokens as "no match". Without it the mask holds
    # NaN for those rows and boolean indexing raises a ValueError.
    mask = corpus['token'].str.match(term, na=False)
    return corpus[mask]
    
    
# add regex functionality: 

In [12]:
def kwic_pah (term):
    
    """Print every corpus line that contains a token matching ``term``.

    For each row returned by ``index_kwic(term)``, all rows of ``df_pahcorp``
    sharing that row's ``title`` and ``line`` are ordered by ``index``, their
    tokens are joined with single spaces, and the result is printed under a
    ``title: line`` heading. A line is printed once per matching token it
    contains. Nothing is returned.
    """
    
    hits = index_kwic(term)
    
    # Only the title and line of each hit are needed, so walk those two
    # columns directly. DataFrame.iterrows() would instead yield
    # (row label, row) pairs with each row rebuilt as a Series, which is
    # needless work here.
    for title, line in zip(hits["title"], hits["line"]):
        
        filtered = df_pahcorp[(df_pahcorp['title']==title)&(df_pahcorp['line']==line)]
        # equivalent syntax: df_pahcorp.query('title == @title and line == @line')
        
        filtered = filtered.sort_values("index")
        # probably already sorted, but better to be on the safe side
        
        # Drop missing tokens before joining: " ".join raises TypeError on NaN.
        text = " ".join(filtered["token"].dropna().astype(str))
        
        print(f'{title}: {line}\n{text}\n')
        
        # task: figure out how to color code results; termcolor package, has to be installed
        

In [13]:
#df_pahcorp["token"] == "afsōn"

In [14]:
#kwic_pah ("hām.*$")

In [15]:
# Conditional frequency

# two problems: (1) find where the words are; (2) find what words are next to a particular location

# collect the tokens at positions i-1 and i+1; Counter is a class in the collections module that takes an iterable (e.g. a list) and turns it into a frequency dictionary
## A column is technically a series, which can work like a list.

In [16]:
# Row labels of every corpus row that index_kwic returns for 'hāmōn'.
indexed = list(index_kwic('hāmōn').index)
length = len(indexed)

# One (label - 1, label, label + 1) triple per hit.
cd_index = [(hit - 1, hit, hit + 1) for hit in indexed]

# now need to use this (and counter module?) to count frequency of the terms immediately before and after, i.e. positions 0 and 2 in the tuple

In [17]:
# Display the collected (previous, hit, next) row-label triples.
cd_index

[(233989, 233990, 233991),
 (240938, 240939, 240940),
 (430296, 430297, 430298),
 (435883, 435884, 435885),
 (510282, 510283, 510284),
 (529918, 529919, 529920)]

In [18]:
#for x in cd_index:
#    print (cd_index[x][1])

In [19]:
# Middle element of the second triple, i.e. the row label of the second hit.
cd_index[1][1]

240939

In [20]:
# 240938 is the first element of cd_index[1] in the output above, i.e. the
# position just before the second hit.
# NOTE(review): iloc is positional, while cd_index holds row labels; the two
# coincide only while df_pahcorp keeps its default integer index (confirm).
df_pahcorp.iloc[240938]

title    Pahlavi Rivāyat
line                31c.7
index                  18
token                  ka
Name: 240938, dtype: object

## Frequency

In [21]:
# Token frequency table: one row per distinct token, most frequent first.
# Column "index" holds the token text and column "token" holds its count;
# later cells read freq_dic["index"], so both names are pinned explicitly.
# Since pandas 2.0 value_counts() names its result "count" and its index after
# the source column, so a bare reset_index() would not give these names.
# Series.value_counts() also replaces the deprecated top-level pd.value_counts().
freq_dic = (
    df_pahcorp["token"]
    .value_counts()
    .rename_axis("index")
    .rename("token")
    .reset_index()
)

In [22]:
# Peek at five random rows of the frequency table (differs on every run).
freq_dic.sample(5)

Unnamed: 0,index,token
38505,[š́ātō.manå,1
33931,[M183],1
11820,[M763],2
35907,anāg-kāmīh,1
5158,guwāgīh,5


In [23]:
# Compiled pattern used to search the frequency table: "sar", then any single
# character ("."), then "y" (e.g. it matches the "sarāy" in the output below).
search_term = re.compile(r"sar.y")

In [24]:
# Flag the frequency-table rows whose token text contains a match for
# search_term anywhere in the string; missing values count as no match.
query_mask = freq_dic["index"].str.contains(search_term, na=False)

# Keep only the flagged rows and show the first five.
query = freq_dic.loc[query_mask]
query.head(5)

# turn into a function, just return top hits

Unnamed: 0,index,token
8058,čang-sarāy,3
12076,win-sarāy,2
14998,(sarāyspar-wāzīg,1
15080,kennār-sarāy,1
15084,sarāy,1


## Conditional Frequency

In [25]:
def confreq (term, group=False, corpus=None):
    """Count the tokens that immediately follow ``term`` within the same line.

    Parameters
    ----------
    term : str
        Token to look up; compared for exact equality with the ``token`` column.
    group : bool, default False
        If true, counts are broken down by ``title`` and nothing is pruned.
    corpus : pandas.DataFrame, optional
        Frame with ``title``, ``line``, ``index`` and ``token`` columns.
        Defaults to the notebook-level ``df_pahcorp``.

    Returns
    -------
    pandas.Series
        Counts of following tokens, most frequent first. With ``group`` the
        Series is indexed by (title, token_next). Without it, tokens seen only
        once are dropped when more than five tokens occur more than once;
        otherwise every following token is kept.
        The pruned case used to come back as a list of (token, count) tuples;
        it is now a Series like the other cases.
    """
    if corpus is None:
        corpus = df_pahcorp

    sel = corpus[corpus['token']==term].copy()
    # Position of the token directly after each occurrence of `term`.
    sel['index_next'] = sel['index'] + 1
    # Look up the token at (title, line, index_next). An occurrence at the end
    # of a line has no such row and gets NaN, which value_counts() ignores.
    sel = sel.join(
        corpus.set_index(['title', 'line', 'index'])['token'].rename('token_next'),
        on=['title', 'line', 'index_next']
    )

    if group:
        return sel.groupby('title')['token_next'].value_counts()

    # If there are only 1-frequency results, it will still show them;
    # but if there are enough higher frequency results, it will omit the 1-frequency results.
    result = sel['token_next'].value_counts()
    frequent = result[result > 1]
    if len(frequent) > 5:
        result = frequent
    # improvement: create a list of omitted words (e.g. ud, ī, etc.), and make a flag1=False
    # optional argument to omit them.

    return result
    

In [26]:
# Counts of the tokens following 'pid', broken down by title (group=True).
confreq ('pid', True)

title                 token_next
ARDĀ WIRĀZ          ud            3
BUNDAHIŠN            ud            3
                      ī             2
                      bē            1
                      ēn            1
                                   ..
Zand_i_Fragard_i_Jud  xwēš          1
Zādspram             ud            4
                      abar          2
                      pus           1
                      wēnēd         1
Name: token_next, Length: 228, dtype: int64

In [27]:
# Scratch version of the next-token lookup for a single token.
tok = 'hazār'

# Select the rows for `tok`, add the position of the following token, then
# pull in the token found at (title, line, index_next) as `token_next`.
sel = (
    df_pahcorp.loc[df_pahcorp['token'] == tok]
    .assign(index_next=lambda hits: hits['index'] + 1)
    .join(
        df_pahcorp.set_index(['title', 'line', 'index'])['token'].rename('token_next'),
        on=['title', 'line', 'index_next'],
    )
)

# simpler: sel['token_next'].value_counts()
