# Important !!
## Project Dependencies
**zemberek jar file is required**

**jpype library is required**

In [1]:
from jpype import JString, JClass, getDefaultJVMPath, shutdownJVM, startJVM, addClassPath, java
from typing import List
import jpype.imports
from jpype.types import *
import os
import string
import random
from random import randint

# CreateCorpus Class
This class consists of a constructor and 6 methods:
## In Constructor
The path of the Zemberek JAR is put on the Java class path and the JVM is started. The **TurkishMorphology**, **AnalysisFormatters** and **WordAnalysis** classes are then loaded from Zemberek.
## read_filenames
This method takes the path of the news dataset as an argument and returns the paths of all files ending in `txt` found in its subfolders.
## clean_text
This method splits the text into whitespace-separated words, lowercases them and strips punctuation from them.
## read_files
This method reads all files returned by read_filenames and returns their cleaned words as a single list.
## analyze_word
This method extracts the stem of the given word and determines its type (part of speech) with the help of Zemberek.
## reproduce_noun and reproduce_verb
These methods generate new word forms from the given stem.

In [2]:
class CreateCorpus(object):
    """Read a folder of text files into a word list and analyse words with Zemberek.

    The constructor starts the JVM (through JPype) with the Zemberek JAR on
    the class path and creates a default ``TurkishMorphology`` instance. The
    remaining methods read and clean the corpus, split a word into stem and
    type, and generate inflected forms of noun and verb stems.
    """

    # str.lower() turns 'I' into 'i' and U+0130 into 'i' + U+0307 (combining
    # dot). Turkish expects 'I' -> dotless i (U+0131) and U+0130 -> 'i', so
    # these two letters are mapped by hand before lowercasing.
    _TURKISH_UPPER_I = str.maketrans({'I': '\u0131', '\u0130': 'i'})

    # Deletes every ASCII punctuation character.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    # Morpheme ids combined by reproduce_verb; '' means "no polarity morpheme".
    _VERB_POLARITIES = ('', 'Neg')
    _VERB_TENSES = ('Imp', 'Aor', 'Past', 'Prog1', 'Prog2', 'Narr', 'Fut')
    _VERB_PERSONS = ('A1sg', 'A2sg', 'A3sg', 'A1pl', 'A2pl', 'A3pl')

    def __init__(self, zemberek_path):
        """Start the JVM if needed and load the Zemberek classes.

        Args:
            zemberek_path: Path of the Zemberek JAR, passed to the JVM as
                ``java.class.path``.
        """
        # startJVM raises when a JVM is already running, which made re-running
        # the notebook cell (or creating a second instance) fail. When a JVM
        # is already up, its existing class path is used as-is.
        if not jpype.isJVMStarted():
            startJVM(
                getDefaultJVMPath(),
                '-ea',
                f'-Djava.class.path={zemberek_path}',
                convertStrings=False,
            )
        self.TurkishMorphology = JClass('zemberek.morphology.TurkishMorphology')
        self.AnalysisFormatters = JClass('zemberek.morphology.analysis.AnalysisFormatters')
        self.WordAnalysis = JClass('zemberek.morphology.analysis.WordAnalysis')
        self.morphology = self.TurkishMorphology.createWithDefaults()

    def read_filenames(self, datapath):
        """Return the paths of all files ending in ``txt`` one level below ``datapath``.

        Args:
            datapath: Dataset folder, resolved against the current working
                directory. Its sub-folders are scanned; entries that are not
                directories are skipped.

        Returns:
            A list of file paths, in sorted folder / file name order.
        """
        root = os.path.join(os.getcwd(), datapath)
        found = list()
        for entry in sorted(os.listdir(root)):
            folder = os.path.join(root, entry)
            # A stray file next to the category folders would make
            # os.listdir(folder) raise NotADirectoryError.
            if not os.path.isdir(folder):
                continue
            for name in sorted(os.listdir(folder)):
                if name.endswith("txt"):
                    found.append(os.path.join(folder, name))
        return found

    def clean_text(self, text):
        """Split ``text`` on whitespace, lowercase each word and drop punctuation.

        Lowercasing follows Turkish rules for the letter I (see
        ``_TURKISH_UPPER_I``). A token made only of punctuation becomes an
        empty string and is kept in the result.

        Returns:
            A list of cleaned words, one per whitespace-separated token.
        """
        return [
            token.translate(self._TURKISH_UPPER_I).lower().translate(self._PUNCTUATION_TABLE)
            for token in text.split()
        ]

    def read_files(self, filenames):
        """Read every file in ``filenames`` and return all cleaned words in one list.

        Files are decoded as UTF-8 (a leading BOM is dropped) and undecodable
        bytes are ignored.
        """
        words = list()
        for path in filenames:
            # The context manager closes the file even when read() raises.
            with open(path, 'rt', errors="ignore", encoding='utf-8-sig') as handle:
                words += self.clean_text(handle.read())
        return words

    @staticmethod
    def _parse_long_format(formatted):
        """Extract ``[stem, type]`` from the text of a ``formatLong()`` result.

        The second space-separated field is taken, cut at the first '+', and
        split at ':' into stem and type; the type is cut at the first '|'.
        Returns ``()`` when the text does not have that shape.
        """
        fields = formatted.split(' ')
        if len(fields) < 2:
            return ()
        stem_and_type = fields[1].split('+')[0].split(':')
        if len(stem_and_type) < 2:
            return ()
        return [stem_and_type[0], stem_and_type[1].split('|')[0]]

    def analyze_word(self, word):
        """Return ``[stem, type]`` for ``word`` based on its first Zemberek analysis.

        Returns:
            A two-element list, or ``()`` when Zemberek yields no analysis or
            the formatted analysis cannot be parsed.
        """
        analyses = list(self.morphology.analyze(JString(word)))
        if not analyses:
            return ()
        return self._parse_long_format(str(analyses[0].formatLong()))

    def reproduce_noun(self, word):
        """Generate surface forms of the noun ``word``.

        A morphology instance is built whose lexicon is ``word``; forms are
        generated for every combination of number (A3sg, A3pl), possessive
        (P1sg, P2sg, P3sg) and case (Dat, Loc, Abl).

        Returns:
            A list of generated surface forms as Python strings.
        """
        numbers = [JString('A3sg'), JString('A3pl')]
        possessives = [JString('P1sg'), JString('P2sg'), JString('P3sg')]
        cases = [JString('Dat'), JString('Loc'), JString('Abl')]

        morphology = self.TurkishMorphology.builder().setLexicon(word).disableCache().build()
        # NOTE(review): .get(0) assumes the lexicon has at least one item
        # matching `word`; otherwise the Java call raises.
        item = morphology.getLexicon().getMatchingItems(word).get(0)

        surfaces = list()
        for number_m in numbers:
            for possessive_m in possessives:
                for case_m in cases:
                    for result in morphology.getWordGenerator().generate(item, number_m, possessive_m, case_m):
                        surfaces.append(str(result.surface))
        return surfaces

    def _generate_verb_forms(self, lexicon_entry, stem):
        """Generate verb surfaces for ``stem`` with a lexicon built from ``lexicon_entry``.

        Every polarity x tense x person combination is tried; combinations
        for which the generator returns nothing are skipped.
        """
        morphology = self.TurkishMorphology.builder().setLexicon(lexicon_entry).disableCache().build()
        generator = morphology.getWordGenerator()

        surfaces = list()
        for polarity in self._VERB_POLARITIES:
            for tense in self._VERB_TENSES:
                for person in self._VERB_PERSONS:
                    seq = java.util.ArrayList()
                    for morpheme in (polarity, tense, person):
                        if morpheme:  # the empty polarity adds no morpheme
                            seq.add(JString(morpheme))
                    for result in generator.generate(JString(stem), seq):
                        surfaces.append(str(result.surface))
        return surfaces

    def reproduce_verb(self, word):
        """Generate conjugated surface forms of the verb stem ``word``.

        The lexicon is first built from ``word + 'mak'``; if that produces no
        forms, it is rebuilt from ``word`` itself and generation is retried.

        Returns:
            A list of generated surface forms (possibly empty).
        """
        # NOTE(review): 'mak' is appended regardless of vowel harmony
        # (front-vowel stems take 'mek'); confirm how Zemberek's lexicon
        # parser treats such entries before changing this.
        surfaces = self._generate_verb_forms(word + 'mak', word)
        if len(surfaces) == 0:
            surfaces = self._generate_verb_forms(word, word)
        return surfaces

**If the dataset is in a different path, change the `news_path` variable**

**If the Zemberek JAR file is in a different path, change the `zemberek_path` variable**

**`read_doc` holds the words read from the dataset, lowercased and cleared of punctuation**

In [3]:
# Paths of the news dataset folder and of the Zemberek JAR file.
news_path, zemberek_path = '1150haber', 'zemberek-full.jar'

# Start Zemberek, list the dataset's text files, then load their cleaned words.
create_corpus = CreateCorpus(zemberek_path)
filenames = create_corpus.read_filenames(datapath=news_path)
read_doc = create_corpus.read_files(filenames=filenames)

**`analyzed_words` is a list of inner lists of the form `[word, type]`**

In [4]:
# Keep only the tokens for which analyze_word returned a [word, type] pair.
analyzed_words = [
    analysis
    for analysis in map(create_corpus.analyze_word, read_doc)
    if len(analysis) == 2
]

**This part is optional: run it if you need to examine the analyzed words**

In [5]:
def write_text(content,filename):
    """Write ``content`` to ``filename`` as one ``word,type`` line per entry.

    Args:
        content: Iterable of sequences whose first two items are strings
            (the word and its type).
        filename: Path of the file to create or overwrite.

    The file is written as UTF-8 explicitly: the words contain Turkish
    letters, and the platform default encoding may not be able to encode
    them.
    """
    with open(filename,'w',encoding='utf-8') as f:
        for entry in content:
            f.write(entry[0] + ',' + entry[1] + '\n')

# Dump the analyzed [word, type] pairs to a file for manual inspection.
write_text(content=analyzed_words, filename='new_corpusV2')

In [44]:
class Generate(object):
    """Pick words and build sentences whose letter values add up to a target.

    Each lowercase Turkish letter has a value from 1 ('a') to 29 ('z'); the
    "size" of a word is the sum of the values of its letters (see
    ``word_size``).
    """

    # Class-level fallback so ``Generate.word_size_dictionary`` still resolves.
    # ``__init__`` gives every instance its own dict, so two generators no
    # longer write into one shared mapping.
    word_size_dictionary = dict()

    # Letter -> value table used by word_size; other characters count as 0.
    _ALPHABET = {'a':1,'b':2,'c':3,'ç':4,'d':5,'e':6,'f':7,
                 'g':8,'ğ':9,'h':10,'ı':11,'i':12,'j':13,
                 'k':14,'l':15,'m':16,'n':17,'o':18,'ö':19,
                 'p':20,'r':21,'s':22,'ş':23,'t':24,'u':25,
                 'ü':26, 'v':27, 'y':28, 'z':29}

    # Word-type tag -> name of the pool the word is collected into.
    # 'Det' entries are used in the "Pronoun" slot of the sentences.
    _TAG_TO_POOL = {
        'Noun': 'noun', 'Verb': 'verb', 'Adv': 'adv', 'Adj': 'adj',
        'Num': 'num', 'Nu': 'num', 'Det': 'pron', 'Conj': 'conj',
    }

    # Sentence templates: (slots, verb label, extra print args before the
    # verb's value). A slot is (label printed before the word, pool the word
    # is drawn from); slots are in sentence order and the verb always comes
    # last. Labels are kept verbatim, including their uneven spacing, so the
    # printed report is unchanged.
    _TEMPLATES = (
        # 50 < value < 350
        ((("Pronoun :", 'pron'), ("Noun :", 'noun')),
         "Verb :", ()),
        # 350 <= value < 450
        ((("Pronoun :", 'pron'), ("Adjective :", 'adj'), ("Noun :", 'noun')),
         "Verb :", ()),
        # 450 <= value < 550
        ((("Pronoun :", 'pron'), ("Number :", 'num'), ("Adjective: ", 'adj'),
          ("Noun :", 'noun')),
         "Verb :", ()),
        # 550 <= value <= 650
        ((("Pronoun : ", 'pron'), ("Number : ", 'num'), ("Adjective : ", 'adj'),
          ("Noun : ", 'noun'), ("Adverb : ", 'adv')),
         "Verb : ", (" ",)),
        # 650 < value < 800
        ((("Pronoun :", 'pron'), ("Number :", 'num'), ("Adjective :", 'adj'),
          ("Noun :", 'noun'), ("Conj :", 'conj'), ("Noun :", 'noun'),
          ("Adverb :", 'adv')),
         "Verb :", ()),
    )

    def __init__(self):
        self.word_size_dictionary = dict()
        print("Generate object created")

    def word_size(self,word):
        """Return the sum of the letter values of ``word``.

        Characters missing from the letter table (uppercase letters, digits,
        punctuation, ...) contribute 0.
        """
        return sum(self._ALPHABET.get(letter, 0) for letter in word)

    def generate_word(self,word_len):
        """Return a random word from ``word_size_dictionary`` whose size is ``word_len``.

        Raises:
            IndexError: If no stored word has that size.
        """
        candidates = [word for word, size in self.word_size_dictionary.items()
                      if size == word_len]
        if not candidates:
            # random.choice([]) raised a bare IndexError here; keep the
            # exception type but say what is wrong.
            raise IndexError(f"no word with size {word_len} in word_size_dictionary")
        return random.choice(candidates)

    def _group_by_type(self, arr):
        """Collect the words of ``arr`` (``[word, type]`` pairs) into pools by type."""
        pools = {name: list() for name in
                 ('noun', 'verb', 'adj', 'adv', 'num', 'pron', 'conj')}
        for entry in arr:
            pool = self._TAG_TO_POOL.get(entry[1])
            if pool is not None:
                pools[pool].append(entry[0])
        return pools

    def _select_template(self, value):
        """Return the sentence template for ``value``, or None if no range matches."""
        if 50 < value < 350:
            return self._TEMPLATES[0]
        if 350 <= value < 450:
            return self._TEMPLATES[1]
        if 450 <= value < 550:
            return self._TEMPLATES[2]
        if 550 <= value <= 650:
            return self._TEMPLATES[3]
        if 650 < value < 800:
            return self._TEMPLATES[4]
        return None

    def create_sentence(self,arr,sentence_len,max_attempts=100000):
        """Print a random sentence whose word sizes add up to ``sentence_len``.

        Words for every non-verb slot of the template chosen by
        ``sentence_len`` are drawn at random; the attempt succeeds when some
        verb has exactly the remaining value. The chosen words, their sizes
        and the sentence are printed.

        Args:
            arr: Iterable of ``[word, type]`` pairs.
            sentence_len: Target total value; must be above 50 and below 800.
            max_attempts: Number of random draws before giving up
                (``None`` retries forever, as the unbounded loop used to).

        Returns:
            None. Results and problems are reported with ``print``.
        """
        pools = self._group_by_type(arr)
        value = sentence_len

        if 50 >= value or value >= 800:
            print("This program can create sentences with a total value between 800 and 50!")
            return

        template = self._select_template(value)
        if template is None:
            # Only reachable for values that fail every comparison (e.g. NaN).
            return
        slots, verb_label, verb_extra = template

        # Indexing an empty pool raised IndexError, and an empty verb pool
        # looped forever; report the problem instead.
        needed = [pool for _, pool in slots] + ['verb']
        missing = sorted({pool for pool in needed if not pools[pool]})
        if missing:
            print("Cannot create a sentence: no words available for:", ", ".join(missing))
            return

        # First verb (in input order) for every size, so each attempt is a
        # single lookup instead of a scan over all verbs.
        verb_by_size = dict()
        for verb in pools['verb']:
            verb_by_size.setdefault(self.word_size(verb), verb)

        attempts = 0
        while max_attempts is None or attempts < max_attempts:
            attempts += 1
            # random.choice never indexes past the end, unlike
            # randint(0, len(pool)), whose upper bound is inclusive.
            picks = [random.choice(pools[pool]) for _, pool in slots]
            sizes = [self.word_size(word) for word in picks]
            remaining = value - sum(sizes)
            if remaining < 0:
                # The drawn words already exceed the target; draw again.
                continue
            verb = verb_by_size.get(remaining)
            if verb is None:
                continue

            for (label, _), word, size in zip(slots, picks, sizes):
                print(label, word, size)
            print(verb_label, verb, *verb_extra, remaining)
            print("-->", *picks, verb)
            return

        print(f"No sentence with total value {value} found after {max_attempts} attempts; "
              "re-run or change the value.")



In [45]:
generate = Generate()
# Record the size of every corpus word so generate_word can look words up by size.
generate.word_size_dictionary.update(
    (token, generate.word_size(token)) for token in read_doc
)

Generate object created


**If you want to generate a word with a different size, change the `word_size` variable**

In [46]:
# Size (letter-value sum) the randomly chosen word must have.
word_size = 100
generate.generate_word(word_len=word_size)

'setlere'

**This part checks the size of a word**

In [47]:
# Size of a sample word (sum of its letter values).
generate.word_size(word='okan')

50

**If you want to generate sentences with a different total value, change the `sentence_value` variable**

**If it takes too long, stop this cell and re-run it**

In [61]:
# Total value the words of the generated sentence must add up to.
sentence_value = 250
generate.create_sentence(arr=analyzed_words, sentence_len=sentence_value)

Pronoun : bu 27
Noun : konferans 123
Verb : göster 100
--> bu konferans göster
