In [42]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from huggingface_hub import hf_hub_download
import joblib

In [52]:
def LoadLatinDependencies(vectorizer_id: str, model_id: str, repo_id: str = "MohamedAmineLayachi/North_Latin_Version"):
  """
    Download and deserialize the Latin-script detection vectorizer and model.

    Args:
      vectorizer_id: filename of the serialized vectorizer inside the hub repository.
      model_id: filename of the serialized classifier inside the hub repository.
      repo_id: Hugging Face hub repository holding both files. Defaults to the
        repository this project was trained in.

    Returns:
      A ``(vectorizer, model)`` tuple of the deserialized objects.
  """
  # NOTE: joblib.load is pickle-based and can execute arbitrary code while
  # deserializing, so `repo_id` must only point at a trusted repository.
  vect = joblib.load(hf_hub_download(repo_id=repo_id, filename=vectorizer_id))
  model = joblib.load(hf_hub_download(repo_id=repo_id, filename=model_id))
  return vect, model

In [14]:
def LoadArabicDependencies(model_id: str, r_labels=False, only_pipeline=False):
  """
    Load the Arabic-script dialect detection model from the Hugging Face hub.

    Args:
      model_id: identifier handed to ``from_pretrained`` for both the tokenizer
        and the sequence-classification model.
      r_labels: when truthy (and ``only_pipeline`` is falsy), also return the
        label metadata dictionary.
      only_pipeline: when truthy, return only the text-classification pipeline.
        Takes precedence over ``r_labels``.

    Returns:
      - ``pipe`` if ``only_pipeline`` is truthy;
      - ``(pipe, tokenizer, model, label_info)`` if ``r_labels`` is truthy, where
        ``label_info`` has the keys ``'labels'``, ``'label_id'`` and ``'id_label'``;
      - ``(pipe, tokenizer, model)`` otherwise.
  """
  labels = ['algeria', 'tunisia', 'morocco', 'egypt']
  label2id = {label: idx for idx, label in enumerate(labels)}
  id2label = {idx: label for idx, label in enumerate(labels)}
  tokenizer = AutoTokenizer.from_pretrained(model_id)
  model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=len(labels),
    id2label=id2label,
    label2id=label2id,
  )
  pipe = pipeline('text-classification', model=model, tokenizer=tokenizer)

  # Branch on truthiness rather than identity with True/False so that every
  # flag combination yields a result instead of silently falling through to None.
  if only_pipeline:
    return pipe
  if r_labels:
    return pipe, tokenizer, model, {'labels': labels, 'label_id': label2id, 'id_label': id2label}
  return pipe, tokenizer, model

In [12]:
def LatinPrediction(text: str, vectorizer, model):
  """
    Classify a single text written in Latin characters.

    The text is wrapped in a one-element list, passed through
    ``vectorizer.transform`` and the result is handed to ``model.predict``.

    Returns:
      Whatever ``model.predict`` returns for that one-item batch.
  """
  # The vectorizer receives a batch, so the lone text is wrapped in a list.
  return model.predict(vectorizer.transform([text]))

In [13]:
def ArabicPrediction(text: str, pipeline, show_score=False):
  """
    Classify a single text written in Arabic characters.

    Args:
      text: the text to classify.
      pipeline: callable invoked as ``pipeline(text)``; its result is indexed
        as ``result[0]['label']`` when only the label is requested.
      show_score: when truthy, return the raw pipeline output instead of just
        the label of its first entry.
  """
  result = pipeline(text)
  return result if show_score else result[0]['label']

In [44]:
class DialectClassifier:
  """
    This is the DialectClassifier, a wrapper object for all the models used in this project.

    On construction it loads the Latin-script vectorizer and classifier and the
    Arabic-script text-classification pipeline, then exposes one prediction
    method per script.
  """
  def __init__(self):
    # Latin-script side: serialized vectorizer + classifier fetched by filename.
    self.LatinVect, self.LatinModel = LoadLatinDependencies("North_Latin_CountVectorizer.joblib", "North_Latin_MNB_Classifier.joblib")
    # Arabic-script side: only the ready-to-call pipeline is kept.
    self.ArabicPipe = LoadArabicDependencies("Oelbourki/northafrica-arabizi-dialect-classifier", only_pipeline=True)

  def predictArabic(self, text):
    """
      Classify text written in Arabic characters.

      Prints the predicted label and also returns it, so the result can be
      used programmatically instead of only being read from stdout.
    """
    prediction = ArabicPrediction(text, self.ArabicPipe)
    print(prediction)
    return prediction

  def predictLatin(self, text):
    """
      Classify text written in Latin characters.

      Prints the first element of the model's prediction and also returns it,
      so the result can be used programmatically instead of only being read
      from stdout.
    """
    prediction = LatinPrediction(text, self.LatinVect, self.LatinModel)
    # The prediction covers a one-item batch; its single entry is the result.
    label = prediction[0]
    print(label)
    return label

In [49]:
# Build the classifier once; its constructor loads the Latin vectorizer/model
# and the Arabic pipeline.
DC = DialectClassifier()

In [50]:
# Classify a word written in Arabic script; the method prints the predicted label.
DC.predictArabic('لباس')

morocco


In [51]:
# Classify a word written in Latin script; the method prints the predicted label.
DC.predictLatin('labass')

morocco
