In [2]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import joblib

In [3]:
from collections import defaultdict

In [4]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**NOTE:** This is trained with a dataset that containes text only in the Moroccan and Tunisian Dialects.

In [6]:
### LOADING THE LATIN DATASET
data = pd.read_csv('NorthLatinData.csv', encoding='utf-8')

In [7]:
data.head()

Unnamed: 0,text,class
0,to9tel.,tunisia
1,sedd femmek,morocco
2,knt kanfakar fchi 7aja aktar itara,morocco
3,fnihaya dyal simana idan,morocco
4,L animateur rahou masset. Brabi bedlouh,tunisia


In [8]:
def clean(text: str):
  """
    clean takes text as input and removes punctuation and special characters.
  """
  text = text.replace('\n', '')
  text = text.translate(str.maketrans({':': '', '/': '', ')': '', '(': '', '!': '', '?': '', '_': '',
                                       '.': '', ',':'', ';': '', '<': '', '>': ''}))
  text = text.lower()
  return text

In [9]:
tokens = defaultdict(list)
## CONSTRUCTING THE DICTIONARY THAT EXCLUDES STOP WORDS
for i in range(data.shape[0]):
  line = data['text'][i]
  label = data['class'][i]
  line = clean(line)
  line_tokens = [t for t in word_tokenize(line) if not t in ['w', 'o', 'f', '3', 'fi', 'l']]
  tokens[label].extend(line_tokens)

In [10]:
print(tokens)

defaultdict(<class 'list'>, {'tunisia': ['to9tel', 'animateur', 'rahou', 'masset', 'brabi', 'bedlouh', 'a7ssen', '9aney', 'touness', 'maset', 'laset', 'amin', 'tounsia', 'ma5yeb', 'wejhek', 'm3alem', 'sami', 'wallah', 'kthba', 'nty', 'hassen', 'doss', 'mn', 'ahsn', 'etwensa', 'i', 'ghnehom', 'yaajbni', 'keep', 'on', 'chmeta', 'nabara', 'mala', '5orda', 'ourorou', 'bravo', 'walid', 'tounsi', 'sid', 'rjal', 'pfff', 'miskina', 'weld', 'l7ram', 'mshny', 'lfirst', 'bravo', 'karim', 'bravo', 'bravo', 'mamstoooooo', 'lotfi', 'masa7', 'wejhek', 'karim', 'maste', 'barchaaaaaaaaaaaaaa', 'bravo', '3al', '7iiiit', 'mariem', 'mat7chemch', '5amjaa', 'hhh', '7afletha', 'rak', 'laset', 'ena', 'n7ebha', 'choufli', 'sa5if', 'tfouh', 'alikom', 'hhhhhhhhhhhhh', 'saadia', 'mosbah', 'lol', 'masrouka', 'fekra', 'men', 'aand', 'mikeposnervevo', 'ni', 'took', 'a', 'pill', 'in', 'ibiza', 'm3aaalem', 'weli', '3ala', 'lseno', 'y9olou', 'rabi', 'yechfilek', 'el', 'wled', 'a7la', 'moumathla', 'tounsiya', 'barcha', 

In [11]:
### FREQUENCY DISTRIBUTION OF WORDS IN BOTH DIALECTS
for class_label, class_tokens in tokens.items():
  print(class_label)
  fd = FreqDist(class_tokens)
  print(fd.most_common(20))

tunisia
[('bravo', 382), ('masit', 130), ('maset', 83), ('mala', 82), ('walid', 63), ('rabi', 62), ('ya', 59), ('la', 57), ('de', 55), ('rak', 53), ('m3alem', 49), ('3la', 49), ('et', 49), ('les', 47), ('y', 46), ('mouch', 45), ('tounes', 45), ('yeser', 45), ('le', 43), ('wlh', 43)]
morocco
[('wach', 768), ('dyal', 541), ('ghadi', 517), ('ana', 504), ('chi', 494), ('3la', 395), ('mn', 393), ('hadchi', 366), ('walakin', 344), ('bzaf', 257), ('rah', 239), ('had', 236), ('bach', 227), ('lik', 206), ('nta', 204), ('daba', 204), ('m3a', 191), ('kan', 188), ('ah', 172), ('machi', 172)]


In [12]:
def clean_data(data_frame):
  """
  clean_data takes a data_frame (pandas.Dataframe) and the feature vector (which is text in this case) and cleans it using the clean(text) function. Then, the function assembles the feature vector
  with its correspoding label (morocco, tunisia) in a csv file that can be used to train the model.
  """
  new_df = [['text', 'class']]
  for i in range(data.shape[0]):
    line = data['text'][i]
    label = data['class'][i]
    line = clean(line)
    new_df.append([line, label])

  new_df = pd.DataFrame(new_df)
  new_df.to_csv('NorthLDC.csv', index=False, header=False, encoding='utf-8')

clean_data(data)

In [13]:
new_data = pd.read_csv('NorthLDC.csv', encoding='utf-8')

In [14]:
new_data

Unnamed: 0,text,class
0,to9tel,tunisia
1,sedd femmek,morocco
2,knt kanfakar fchi 7aja aktar itara,morocco
3,fnihaya dyal simana idan,morocco
4,l animateur rahou masset brabi bedlouh,tunisia
...,...,...
12940,gollia wach nta 9add tdir khdmtk bkoll ti9a,morocco
12941,mskin slm mr,tunisia
12942,bravo mo9ded,tunisia
12943,wach kadan bli hada imkan il3ab chi dawr fhadc...,morocco


In [15]:
new_data = new_data.dropna()

In [16]:
X_train, X_test, y_train, y_test = train_test_split(new_data['text'], new_data['class'], test_size=0.2)

In [17]:
### IMPORTING THE MODEL AND THE VECTORIZERS
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [18]:
### IMPORTING THE METRICS TO EVALUATE THE MODEL
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [19]:
### BUILDING A COUNT VECTORIZER AND APPLYING IT ON THE TRAINING SET
v = CountVectorizer()
vtrain = v.fit_transform(X_train)

In [20]:
### BUILDING AND TRAINING THE NAIVE-BAYES MODEL USING TWO APPROACHES, ONE WITH MULTINOMIAL AND THE OTHER WITH BERNOULLI
MNBC = MultinomialNB().fit(vtrain, y_train)
BNBC = BernoulliNB().fit(vtrain, y_train)

In [21]:
def evaluate(title, model, vectorizer, X_test, y_test):
  """
    evaluete is a function that used for model evaluation.
  """
  print(title)
  vtest = vectorizer.transform(X_test)
  y_hat = model.predict(vtest)

  precision = precision_score(y_test, y_hat, pos_label="morocco")
  recall = recall_score(y_test, y_hat, pos_label="morocco")
  accuracy = accuracy_score(y_test, y_hat)
  f1 = f1_score(y_test, y_hat, pos_label="morocco")

  print(f"Precision = {precision}, Recall = {recall}, Accuracy = {accuracy}, F1 = {f1}")

In [22]:
evaluate("Multinomial NB Train", MNBC, v, X_train, y_train)
evaluate("Multinomial NB Test", MNBC, v, X_test, y_test)
evaluate("Bernoulli NB Train", BNBC, v, X_train, y_train)
evaluate("Bernoulli NB Test", BNBC, v, X_test, y_test)

Multinomial NB Train
Precision = 0.9948035487959442, Recall = 0.9987275734826314, Accuracy = 0.9950743673942438, F1 = 0.996761699155502
Multinomial NB Test
Precision = 0.9359582542694497, Recall = 0.9959616355376073, Accuracy = 0.9447663190421012, F1 = 0.9650281242357543
Bernoulli NB Train
Precision = 0.9023998162820072, Recall = 1.0, Accuracy = 0.9179061232373962, F1 = 0.948696281989377
Bernoulli NB Test
Precision = 0.8620539599651871, Recall = 1.0, Accuracy = 0.8775589030513712, F1 = 0.925917270390278


In [23]:
### IMPORTING THE PICKLE LIBRARY TO SERIALIZE THE MODEL AND THE VECTORIZER
import joblib

In [24]:
### SERIALIZING THE MODEL AND THE VECTORIZER
joblib.dump(MNBC, open("North_Latin_MNB_Classifier.joblib", 'wb'))
joblib.dump(v, open("North_Latin_Vectorizer.joblib", 'wb'))

In [26]:
### LOADING THE MODEL AND THE VECTORIZER
vect = joblib.load(open("North_Latin_Vectorizer.joblib", 'rb'))
model = joblib.load(open("North_Latin_MNB_Classifier.joblib", 'rb'))

In [27]:
### A TEST PREDICTION
model.predict(vect.transform(["labass"]))

array(['morocco'], dtype='<U7')