In [2]:
#block 1st
#export stem-word list
#Resources
# Description: 30,862 stems and their 200-dimensional frequencies (ordered list of the 200 dimensions is available at http://st2.zargan.com/public/resources/turkish/top_200_suffix_sequences_attached_to_nouns.zip
# Column 1: stem, Columns 2-201: frequency of the relevant morphological form
# Delimiter: tab, Encoding: UTF-8
# Source: BOUN Corpus (Sak, H., Güngör, T., & Saraçlar, M. (2008). Turkish language resources: Morphological parser, morphological disambiguator and web corpus. In Advances in natural language processing (pp. 417-427). Springer Berlin Heidelberg.)
# Copyright: Zargan Ltd.

import pandas as pd
#word_forms_stems_and_frequencies_full
data = pd.read_csv('corpus.txt', sep="\t", header=None)
data.columns=["derivates","stems","structure","frequency"]
data

Unnamed: 0,derivates,stems,structure,frequency
0,çöğürü,çöğür,Noun+A3sg+P3sg+Nom,1
1,çöğüründen,çöğür,Noun+A3sg+P3sg+Abl,2
2,çöğür,çöğür,Noun+A3sg+Pnon+Nom,23
3,çöğürcü,çöğürcü,Noun+A3sg+Pnon+Nom,3
4,çöğürden,çöğür,Noun+A3sg+Pnon+Abl,1
...,...,...,...,...
1337893,zurnaya,zurna,Noun+A3sg+Pnon+Dat,23
1337894,zurnayla,zurna,Noun+A3sg+Pnon+Ins,160
1337895,zurnazen,zurnazen,Noun+A3sg+Pnon+Nom,4
1337896,zurnazenler,zurnazen,Noun+A3pl+Pnon+Nom,2


In [5]:
#block 2nd
#extract stems from the data
stems_0=data["stems"].unique().tolist()
#number of stem list
print("stems_0",len(stems_0))
#stems_0 39843 stems

stems_0 39843


In [6]:
#block 3rd
#method to show whether a word in stem list, or not
def isin_stem_list(word,stem_list):
    for stem in stem_list:
        if stem==word:
            return True
    
    return False


In [30]:
#block 4th
print("stems_0",len(stems_0))
#cleaning the stem_list

# Turkish language is an agglutinative language. So many "derived forms" can be created by using one stem and many suffixes.
#These "derived forms" are semantically related to each other. 
#So the main goal for this procedure is to find the stem semantically involving more words.
#This issue is discussed in  2nd question  in 1stStep_turkish_stems_ReadMe.txt


stems_1=[]
#Cleaning 1st step
#remove one and two letters-stems (due to large size of semantic scope and thereby causing biased prediction)  

for stem in stems_0:
    if len(stem)>2:
        stems_1.append(stem)
        
        
#re-adding an exception ev (home)
stems_1.append("ev")
stems_1.remove("evlek")
stems_1.remove("ypk")

print("stems_1",len(stems_1))

#Alternatively
#remove two and three-letters-stems (due to large size of semantic scope and thereby causing biased prediction)
#stems_1
#for stem in stems_0:
    #if len(stem)>3:
        #stems_2.append(stem)
        
#Cleaning 2nd step
#remove some "derived form" from stem list

stems_2=[]

for stem in stems_1:
    
    #for two letters suffixes
    
    #suffix -çi (and other variants -çı,-çu,-çü) like -er suffix in english
    #suffix -ev (-home) (ex: dikim (sewing-tailoring) +ev(i) (-home)= dikimevi (sewing-tailoring workshop))
    #suffix -li (and other variants -lı,-lu,-lü), like -ian or -er suffix in english (ex: dublin-li (dublin-er))
    #suffix -la (and other variant -le) like bomba (bomb) and bomba-la-mak (to bomb)
    #p21, p22,p3 and q2 boolean expressions for two-letters suffixes
    p21=(stem.endswith("çı") or stem.endswith("çi") or stem.endswith("çu") or stem.endswith("çü") or stem.endswith("ev") )
    p22=(stem.endswith("lı") or stem.endswith("li") or stem.endswith("lu") or stem.endswith("lü"))
    p23=(stem.endswith("la") or stem.endswith("le") )
    
    #stem without suffix -çi,-çı,-çu,-çü,-li,-lı,-lu, -lü, -la and -le  is meaningful namely is it in stem list ?    
    q2=isin_stem_list(stem[:-2],stems_1)
    
    
    #for three letters suffixes
    
    #suffix -lik (and other variants -lık,-luk,-lük) like -dom,-ness suffix (ex: kral (king) + -dom(-lık)=krallık (kingdom))
    
    p31=(stem.endswith("lik") or stem.endswith("lık") or stem.endswith("lük") or stem.endswith("luk"))
    #suffix -len (and other variant -lan) making noun verb form like buz (ice) and buzlan-mak (to frost)
    p32=(stem.endswith("len") or stem.endswith("lan"))
    
    #suffix -siz (and other variant -sız,-suz and -süz) -less in english ex: tat-sız (taste-less)
    p33=(stem.endswith("siz") or stem.endswith("sız") or stem.endswith("suz") or stem.endswith("süz"))
    
    
    #stem without suffix -lik,-lık,-luk,-lük,-lan,-len is meaningful namely is it in stem list ?  
    q3=isin_stem_list(stem[:-3],stems_1)
    
    #for five letters suffixes
    
    #combination suffix -çi+-lik=-çilik (and other variants -çılık,-çuluk,-çülük)
    p51=(stem.endswith("çilik") or stem.endswith("çılık") or stem.endswith("çülük") or stem.endswith("çuluk"))
    #combination suffix -ci+-lik=-çilik (and other variants -cılık,-culuk,-cülük)
    p52=(stem.endswith("cilik") or stem.endswith("cılık") or stem.endswith("cülük") or stem.endswith("culuk"))
    #stem without suffix -lik,-lık,-luk,-lük,-lan,-len is meaningful namely is it in stem list ?  
    q5=isin_stem_list(stem[:-5],stems_1)
    
    #combination suffix -siz+-lik=-sizlik (and other variants -sızlık,-suzluk,-süzlük)
    #-siz and -lik  (-less and -ness in english respectively), ex: akıl-sız-lık (wit-less-ness)
    p6=(stem.endswith("sizlik") or stem.endswith("sızlık") or stem.endswith("suzluk") or stem.endswith("süzlük"))
    #stem without suffix -lik,-lık,-luk,-lük,-lan,-len is meaningful namely is it in stem list ?  
    q6=isin_stem_list(stem[:-6],stems_1)
    
    #derived form detection
    p=(p21 or p22 or p23 or p31 or p32 or p33 or p51 or p52 or p6) 
    
    #stem without suffixes mentioned above is meaningful ?         
    q=(q2 or q3 or q5 or q6)
    
    #abbrevations including "."  must be excluded
    r=stem.find(".") !=-1
    
    
    if not((p and q) or r ) :
        stems_2.append(stem)
      
      
#re-adding exceptions
stems_2.append("ölçü") 
stems_2.append("cemev")
stems_2.append("pertev")
stems_2.append("aşev")
stems_2.append("genelev")
stems_2.append("türev")
stems_2.append("huzurev") 
stems_2.append("gözlük")
stems_2.append("evlen")
stems_2.append("evli")

#excluding some exceptions
#stems_2.remove("simitis")
#stems_2.remove("kalemis")
#stems_2.remove("evç")



print("stems_2",len(stems_2))

print(isin_stem_list("simitçilik",stems_2))



        

        
        
        
        
        
      

stems_0 39843
stems_1 39641
stems_2 31838
False


In [7]:
#block 5th
#test of cleaned stems
print(isin_stem_list("ölçü",stems_2))
print(isin_stem_list("simitçi",stems_1))
print(isin_stem_list("simitçilik",stems_2))
print(isin_stem_list("simitçi",stems_2))
print(isin_stem_list("simit",stems_2))
print(isin_stem_list("simitçilik",stems_2))

True
True
False
False
True
False


In [5]:
#block 5th

#save stemps_2
cleaned_stems=stems_2
import pickle
with open("cleaned_stems.txt", "wb") as fp:
       pickle.dump(cleaned_stems, fp)

#load stemps
#with open("cleaned_stems.txt", "rb") as fp:
    #cleaned_stems = pickle.load(fp)

In [6]:
#block 6th

#for tests
# method to get stem from stem_list with maximum length involved by the word
def get_max_stem_core(word,stem_list):
    max_stem=word
    #flag indicating whether stem, having more than one letters and maximum length, from cleaned stem list
    #if flag=0, not found in stem_list, if flag=1, found in stem_list
    flag=0
    i=2
    for stem in stem_list:
        if (word[0:i]==stem[0:i] and word.find(stem) !=-1):
        #if (word[0:i]==stem[0:i]) and len(stem)<len(word)):
            max_stem=stem
            flag=1
            i=i+1
        
            
            
   
        
  
          
    return [max_stem,flag]

#method to reconsruct derived form in "Final-obstruent devoicing", mentioned 3rd question  in 1stStep_turkish_stems_ReadMe.txt
#by controlling  5 cases shown in letter list in the method by accessing index i
def reconstruct(word,i):
    original=["p","ç","t","k","k"]
    not_original=["b","c","d","g","ğ"]
    if (word.find(not_original[i])!=-1):
        reversed_word=word[::-1]
        k=reversed_word.index(not_original[i])
        n=len(word)-k-1
        reconstructed_list=list(word)
        reconstructed_list[n]=original[i]
        reconstructed_word="".join(reconstructed_list)                
    else:
        reconstructed_word=word  
    
    
        
    return reconstructed_word
    





#method to get stem with maximum length with using reconstructed derived form the word in case "Final-obstruent devoicing"
def get_max_stem(word,stem_list):
    result=get_max_stem_core(word,stem_list)
    max_stem=result[0]
    flag=result[1]
    if flag==1:
        #obivously no need reconstruction for the word
        return max_stem
    else:
        i=0
        #reconstruct word until find convenient stem
        while(i<5 and flag==0):
            reconst_word=reconstruct(word,i)
            temp_result=get_max_stem_core(reconst_word,stem_list)
            #whether found, or not ?
            flag=temp_result[1]
            i=i+1
        if flag==1:
            #max_stem of reconstructed word is found in stem_list
            max_stem=temp_result[0]
            
        else:
            # in spite of reconstructed forms of word, covenient stem could not be found
            max_stem=word
            
            
            
        
        
            
            
   
        
  
          
    return max_stem




    

In [8]:
#block 7th

#test get_max_stems
print(get_max_stem("galatasaraylılar",cleaned_stems))
print(get_max_stem("simitçiydim",cleaned_stems))
print(get_max_stem("ölçümüz",cleaned_stems))
print(get_max_stem("sultanahmetteyim",cleaned_stems))
print(get_max_stem("jübileniz",cleaned_stems))
print(get_max_stem("köpeğim",cleaned_stems))
print(get_max_stem("kitabım",cleaned_stems))
print(get_max_stem("bombaladılar",cleaned_stems))
print(get_max_stem("evliyiz",cleaned_stems))
print(get_max_stem("mgönsmad m",cleaned_stems))
print(get_max_stem("umudun",cleaned_stems))
print(get_max_stem("ağaç",cleaned_stems))
print(get_max_stem("örneğin",cleaned_stems))
print(get_max_stem("şimşeğin",cleaned_stems))
print(get_max_stem("umumiyetle",cleaned_stems))
print(get_max_stem("umumiyetle",cleaned_stems))
print(get_max_stem("kitapsızlar",cleaned_stems))
print(get_max_stem("yıllık",cleaned_stems))
print(get_max_stem("düşerek",cleaned_stems))



galatasaray
simit
ölçüm
sultanahmet
jübile
köpek
kit
bomba
evli
mgönsmad m
umu
ağa
örneğin
şimşek
umumiyet
umumiyet
kitap
yıl
kitap


In [36]:
#block 8th
#application for understanding suffixes

#example 1
#process for exclude stems ending with -la
i=0
for stem in stems_0:
    #stem.endswith("la") : stem ends with -la ?
    #isin_stem_list(stem[:-2],stems_0): stem without -la is in stem_list, that is to stay, meaningful ?
    if stem.endswith("la") and isin_stem_list(stem[:-2],stems_0) :
        print(stem)
        i=i+1
      

    #number of these stems
print(i)

630


In [7]:
#block 9th
#example 2
i=0
for stem in stems_0:
    #stem.endswith("lik") : stem ends with -la ?
    #isin_stem_list(stem[:-3],stems_0): stem without -la is in stem_list, that is to stay, meaningful ?
    if stem.endswith("sız") and isin_stem_list(stem[:-3],stems_0) :
        print(stem)
        i=i+1
      
    #number of these stems        
print(i)
    


281


In [24]:
#block 10th
#example 3
#process for exclude stems ending with -çuluk
i=0
for stem in stems_0:
    #stem.endswith("çuluk") : stem ends with -la ?
    #isin_stem_list(stem[:-5],stems_0): stem without -la is in stem_list, that is to stay, meaningful ?
    if stem.endswith("çuluk") and isin_stem_list(stem[:-5],stems_0) :
        print(stem)
        i=i+1
      
    #number of these stems        
print(i)

çözümsüzlük
ödünsüzlük
ölçüsüzlük
ölçütsüzlük
ölümsüzlük
örgütsüzlük
alkolsüzlük
dönüşsüzlük
düşsüzlük
gönülsüzlük
görüşsüzlük
görgüsüzlük
güçsüzlük
gürültüsüzlük
golsüzlük
hükümsüzlük
hoşgörüsüzlük
köksüzlük
kültürsüzlük
pürüzsüzlük
süssüzlük
sütsüzlük
tahammülsüzlük
usulsüzlük
yönsüzlük
yüzsüzlük
26


In [26]:
#block 11th
#example 4
#process for exclude stems ending with -süzlük
i=0
for stem in stems_0:
    #stem.endswith("süzlük") : stem ends with -la ?
    #isin_stem_list(stem[:-6],stems_0): stem without -la is in stem_list, that is to stay, meaningful ?
    if stem.endswith("süzlük") and isin_stem_list(stem[:-6],stems_0) :
        print(stem)
        i=i+1
      
    #number of these stems        
print(i)

çözümsüzlük
ödünsüzlük
ölçüsüzlük
ölçütsüzlük
ölümsüzlük
örgütsüzlük
alkolsüzlük
dönüşsüzlük
düşsüzlük
gönülsüzlük
görüşsüzlük
görgüsüzlük
güçsüzlük
gürültüsüzlük
golsüzlük
hükümsüzlük
hoşgörüsüzlük
köksüzlük
kültürsüzlük
pürüzsüzlük
süssüzlük
sütsüzlük
tahammülsüzlük
usulsüzlük
yönsüzlük
yüzsüzlük
26
