In [None]:
# pip install nltk  (`nltk.tokenize` is a submodule of the `nltk` package, not a separately installable package)

: 

In [1]:
# Manually defined set of Kannada stopwords; tokens found in this set are
# dropped by the stopword-removal step.
kannada_stopwords = {
    "ಅದು",
    "ಈ",
    "ಅವರ",
    "ನಾನು",
    "ನಮ್ಮ",
    "ನೀವು",
    "ಮತ್ತು",
    "ಹಾಗು",
}

In [2]:
# Tokenize the sentence so that the stopwords can be removed

def tokenize_kannada(text):
    """Split text into tokens on whitespace, treating punctuation as a separator.

    Each character of the punctuation set (the danda plus common ASCII
    punctuation) is converted to a space before splitting, so punctuation
    never remains attached to a token.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: The tokens in their original order; an empty list when the
        text contains nothing but whitespace and punctuation.
    """
    separators = "।!?.,;:'\"()-"
    # One translation table mapping every separator character to a space.
    to_space = str.maketrans(separators, " " * len(separators))
    return text.translate(to_space).split()

In [4]:
def remove_stopwords_kannada(text, stopword_list):
    """Return ``text`` with every stopword token removed.

    The text is tokenized with ``tokenize_kannada``; tokens that are members
    of ``stopword_list`` are discarded and the rest are re-joined with single
    spaces.

    Args:
        text: The string to filter.
        stopword_list: Container of stopwords; only membership tests
            (``in``) are performed on it.

    Returns:
        str: The surviving tokens joined by a single space.
    """
    kept = []
    for token in tokenize_kannada(text):
        if token in stopword_list:
            continue  # stopword: drop it
        kept.append(token)
    return " ".join(kept)


In [5]:
# Demo: remove the stopwords from a sample Kannada sentence and show the result
text = "ನಾನು ಶಾಲೆಗೆ ಹೋಗಿದ್ದೇನೆ ಮತ್ತು ನಾನು ಓದುತ್ತಿದ್ದೇನೆ"
processed_text = remove_stopwords_kannada(
    text=text,
    stopword_list=kannada_stopwords,
)
print(f"Processed Text: {processed_text}")


Processed Text: ಶಾಲೆಗೆ ಹೋಗಿದ್ದೇನೆ ಓದುತ್ತಿದ್ದೇನೆ


In [None]:
# A similar manual approach can be used for part-of-speech (POS) tagging too

In [6]:
# Hand-built lookup table mapping a Kannada word to its part-of-speech label.
pos_tags = dict(
    [
        ("ನಾನು", "PRON"),            # pronoun
        ("ಓದುತ್ತಿದ್ದೇನೆ", "VERB"),     # verb
        ("ಶಾಲೆ", "NOUN"),            # noun
        ("ಮತ್ತು", "CONJ"),           # conjunction
        ("ಅವರು", "PRON"),            # pronoun
    ]
)

In [7]:
def pos_tag_kannada(text, tag_dict):
    """Tag every token of ``text`` by direct lookup in ``tag_dict``.

    Args:
        text: The sentence to tag; it is split with ``tokenize_kannada``.
        tag_dict: Mapping from word to POS label; it is queried with ``.get``.

    Returns:
        list[tuple[str, str]]: ``(token, tag)`` pairs in token order. A token
        with no entry in ``tag_dict`` is tagged ``"UNK"`` (unknown).
    """
    return [
        (word, tag_dict.get(word, "UNK"))  # "UNK" = no entry in the table
        for word in tokenize_kannada(text)
    ]


In [8]:
# Example Kannada sentence
text = "ನಾನು ಶಾಲೆಗೆ ಹೋಗಿದ್ದೇನೆ"
# Keep the result under its own name. Assigning it back to `pos_tags` would
# replace the lookup dictionary with a list, and re-running this cell would
# then fail with AttributeError because a list has no .get() method.
dict_tagged_tokens = pos_tag_kannada(text, pos_tags)
print(f"POS Tags: {dict_tagged_tokens}")


POS Tags: [('ನಾನು', 'PRON'), ('ಶಾಲೆಗೆ', 'UNK'), ('ಹೋಗಿದ್ದೇನೆ', 'UNK')]


In [9]:
# Based on common suffix patterns in the language, the part of speech of a word can sometimes be guessed
def guess_pos_kannada(word):
    """Guess a POS label for ``word`` from its suffix.

    The verb suffixes are checked first, then the noun suffixes.

    Args:
        word: A single token.

    Returns:
        str: ``"VERB"`` or ``"NOUN"`` when a known suffix matches, otherwise
        ``"UNK"``.
    """
    verb_suffixes = ("ತ್ತಿದ್ದೇನೆ", "ಬಹುದೇ")
    noun_suffixes = ("ಗೆ", "ದಲ್ಲಿ")

    # str.endswith accepts a tuple and matches if any one suffix fits.
    if word.endswith(verb_suffixes):
        return "VERB"
    if word.endswith(noun_suffixes):
        return "NOUN"
    return "UNK"

def pos_tag_with_rules(text):
    """Tag every token of ``text`` using the suffix rules only.

    Args:
        text: The sentence to tag; it is split with ``tokenize_kannada``.

    Returns:
        list[tuple[str, str]]: ``(token, tag)`` pairs in token order, where
        each tag is whatever ``guess_pos_kannada`` returns for that token.
    """
    tagged = []
    for token in tokenize_kannada(text):
        tagged.append((token, guess_pos_kannada(token)))
    return tagged


In [10]:
# Example sentence for the suffix-rule tagger
text = "ನಾನು ಶಾಲೆಗೆ ಹೋಗುತ್ತಿದ್ದೇನೆ"
# Use a dedicated name for the rule-based result instead of rebinding
# `pos_tags`, which names the word -> tag lookup dictionary elsewhere in this
# notebook; a list stored under that name breaks dictionary-based tagging
# whenever that cell is run again.
rule_tagged_tokens = pos_tag_with_rules(text)
print(f"POS Tags: {rule_tagged_tokens}")


POS Tags: [('ನಾನು', 'UNK'), ('ಶಾಲೆಗೆ', 'NOUN'), ('ಹೋಗುತ್ತಿದ್ದೇನೆ', 'VERB')]
