First of all, we import the relevant libraries. Besides the wikipedia library, we import devtrans for transliterating Devanagari text into Latin characters.

In [1]:
import wikipedia as wiki
import devtrans
import string
import re
import pandas as pd

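Now we set some parameters which can be easily changed to try new strategies for gathering the data. Since Devanagari text will be converted into Latin characters, the accepted characters are the ASCII letters plus the space, the danda (।) and the full stop.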

In [2]:
# Characters kept during cleaning: ASCII letters, the space, the danda and the full stop.
accepted_chars = "".join([string.ascii_letters, " ", "।", "."])
# Number of random article titles requested for each language.
num_articles = 1500
# Language codes handed to wiki.set_lang, one download run per code.
langs = ["sa", "hi", "en", "ne"]
# A sentence is kept only if its length lies strictly between these two bounds.
min_sent_len, max_sent_len = 20, 200

In [None]:
def drop_rows_by_value(df, n_rows, col, value):
    """Cap the rows whose ``col`` equals ``value`` at the first ``n_rows``.

    Args:
        df: Frame to trim.
        n_rows: How many matching rows to keep.
        col: Column compared against ``value``.
        value: Value whose rows are capped.

    Returns:
        A new frame holding the first ``n_rows`` matching rows followed by
        every row whose ``col`` differs from ``value``. Original index
        labels are carried over unchanged.
    """
    is_match = df[col] == value
    is_other = df[col] != value
    kept = df[is_match][0:n_rows]
    # Matching rows come first, so they end up at the top of the result.
    return pd.concat([kept, df[is_other]])

def balance_entries(df, col):
    """Downsample ``df`` so every value of ``col`` occurs equally often.

    The target size is the count of the rarest value in ``col``; for each
    value only its first occurrences, up to that size, are kept. Grouping
    is done on ``col`` itself, so the function works for any column name
    and any set of values. Kept rows stay in their original relative order
    and keep their index labels.

    Args:
        df: Frame to balance.
        col: Name of the column holding the class labels.

    Returns:
        The balanced frame. An empty frame is returned as-is.
    """
    counts = df[col].value_counts()
    # Nothing to balance; min() over no counts would otherwise raise.
    if counts.empty:
        return df
    least = int(counts.min())
    # head() on the groupby keeps the first `least` rows of each group.
    return df.groupby(col, sort=False).head(least)

In [None]:
def process(text, lang):
    """Clean an article's text and split it into sentences of usable length.

    Text tagged "sa", "hi" or "ne" is first run through ``devtrans.dev2hk``
    and any "ळ" still present afterwards is removed. Every character not in
    ``accepted_chars`` is then dropped, the result is split on the danda and
    the full stop, and only pieces whose length lies strictly between
    ``min_sent_len`` and ``max_sent_len`` are kept.

    Args:
        text: Raw article content.
        lang: Language code of the article.

    Returns:
        A list of cleaned sentence strings (possibly empty).
    """
    if lang in ("sa", "hi", "ne"):
        text = devtrans.dev2hk(text).replace("ळ", "")
    # Filter `text` itself: for "en" it is the untouched input, otherwise
    # the transliterated string produced above.
    cleaned = "".join([char for char in text if char in accepted_chars])
    # Raw string so that `\.` reaches the regex engine as an escaped dot.
    return [
        sentence
        for sentence in re.split(r"।|\.", cleaned)
        if min_sent_len < len(sentence) < max_sent_len
    ]

def download_articles(lang, how_many):
    """Fetch random Wikipedia articles in ``lang`` and return their sentences.

    Sets the wikipedia library's language, asks for ``how_many`` random
    titles, and runs each page's content through ``process``. Pages that
    raise ``PageError`` or ``DisambiguationError`` are reported with a
    printed message and skipped.

    Args:
        lang: Language code passed to ``wiki.set_lang`` and to ``process``.
        how_many: Number of random titles to request.

    Returns:
        A tuple ``(articles, tags)``: the list of sentences from all
        retrieved pages and a list of the same length in which every
        entry is ``lang``.
    """
    wiki.set_lang(lang)
    topics = wiki.random(how_many)
    # wikipedia.random hands back a bare title string instead of a list when
    # a single title is returned; wrap it so we do not loop over characters.
    if isinstance(topics, str):
        topics = [topics]
    # NOTE(review): the MediaWiki API appears to cap the number of random
    # titles per request, so fewer than `how_many` may come back - confirm.

    articles = []
    for topic in topics:
        try:
            content = wiki.WikipediaPage(topic).content
        except (wiki.PageError, wiki.DisambiguationError):
            print("Unable to retrieve content for: " + topic)
            continue
        articles += process(content, lang)

    # One tag per sentence, all equal to the requested language.
    tags = [lang] * len(articles)
    return (articles, tags)

In [None]:
# final_sentences collects one (sentences, tags) tuple per language;
# sents and tags are the flattened, parallel lists built from them.
final_sentences, final_tags = [], []
sents, tags = [], []

for lang in langs:
    downloaded = download_articles(lang, num_articles)
    final_sentences.append(downloaded)

# Walk each language's sentences together with their tags.
for lang_sentences, lang_tags in final_sentences:
    for sentence, tag in zip(lang_sentences, lang_tags):
        sents.append(sentence)
        tags.append(tag)

In [None]:
# Two-column frame of sentences and their language tags, balanced per tag.
zipped = dict(Sentence=sents, Tag=tags)
df = pd.DataFrame(zipped)
df = balance_entries(df, "Tag")

# First column (sentences) is the input, second column (tags) the target.
X = df.iloc[:, 0]
y = df.iloc[:, 1]

In [None]:
from sklearn.model_selection import train_test_split
from sklearn import feature_extraction
from sklearn import pipeline
from sklearn import linear_model
from sklearn import metrics

In [None]:
# Hold out 20% of the rows for evaluation, using a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# TF-IDF over character n-grams of length 1 to 3.
vectorizer = feature_extraction.text.TfidfVectorizer(
    analyzer="char",
    ngram_range=(1, 3),
)

# The vectorizer feeds a logistic-regression classifier with default settings.
pipe_1r_r13 = pipeline.Pipeline(
    steps=[
        ("vectorizer", vectorizer),
        ("clf", linear_model.LogisticRegression()),
    ]
)
pipe_1r_r13.fit(X_train, y_train)

y_predicted = pipe_1r_r13.predict(X_test)

# Accuracy on the held-out rows, expressed as a percentage.
acc = 100 * metrics.accuracy_score(y_test, y_predicted)

In [None]:
# Ad-hoc sentences to classify with the fitted pipeline.
new_test = [
    "This is a string",
    "mujhe mAluM hai ki tumhArI",
    "lakSyatAvacchedakAvacchinnaM na bhavati",
    "pramAnalakSaNaM syAt",
    "My name is something or other",
    "These things are all nice",
    "pramANaprameyasaMzayaprayojanadRSTAntAdi",
    "Dave went to sleep at hai hai hai merA",
    "pratiyogitA kA nAma kyA zuklatvam iti hai",
]

new_pred = pipe_1r_r13.predict(new_test)
print(new_pred)
