# 1 : Import required dependencies

In [None]:
import pandas as pd 
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the Punkt tokenizer data that nltk.word_tokenize (used in the tokenization cell) depends on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# 2: Discover your data

In [None]:
# Read the preprocessed corpus; the later cells use its `text` and `dialect` columns.
data = pd.read_csv("/content/drive/MyDrive/preprocessed_data.csv")
# NOTE(review): rows with missing values are NOT dropped here (a dropna() call was left disabled) — confirm the CSV has no NaN `text` entries.


Take a 5,000-row random sample so the model can first be prototyped on a small subset before training on the full dataset.

In [None]:
# Draw 5,000 rows at random; random_state pins the draw so the sample is repeatable.
sampled_data = data.sample(n=5000, random_state=42)

In [None]:
# Preview the first rows of the sample.
sampled_data.head()

Unnamed: 0.1,Unnamed: 0,text,dialect
25652,25652,يمكن تستغرب شء,2
52025,52025,خلصتي ولسه بتسقطي دي هوايه عندك بقي,0
5368,5368,صارت معايا مره,2
37970,37970,ومال تماره راه اغلب كانو الرباط شرو تماره بحكم...,3
117693,117693,الله يلعن السياسه يللي بتبعد الشرفاء بالبلد هي...,1


In [None]:
# Number of labelled rows in the full dataset.
data["dialect"].shape

(147642,)

# 2: Text representation

### 1: TF-IDF 
TF-IDF is the preferred method in this situation because it represents each sentence as a single weighted vector rather than treating its words in isolation: every word is weighted by how often it occurs in the sentence and how rare it is across the whole corpus. Note that it relies on frequency of occurrence only and does not capture a word's contextual meaning.

In [None]:
# Learn a TF-IDF vocabulary from every row of `text` and encode those rows with it.
# NOTE(review): `X` is fitted on the full dataset (before the train/test split) and is not
# referenced by the later cells visible here, which build their own TfidfVectorizer inside a Pipeline.
vectorizer = TfidfVectorizer(lowercase=True)
X = vectorizer.fit_transform(data["text"])

### 2: Word2Vec

In [None]:
## tokenization
# Split each sampled sentence into a list of word tokens with NLTK.
tokenized_sentences = [nltk.word_tokenize(text) for text in sampled_data['text']]


In [None]:
## word2vec
import numpy as np
from gensim.models import Word2Vec

# Train 100-dimensional embeddings on the tokenized sample. min_count=1 keeps every
# token in the vocabulary, so each model.wv[word] lookup below succeeds.
model = Word2Vec(tokenized_sentences, vector_size=100, window=5, min_count=1, workers=4)

# Represent each sentence by the mean of its word vectors. A sentence that tokenizes
# to an empty list would divide by zero, so it is mapped to a zero vector instead.
sentence_vectors = []
for tokens in tokenized_sentences:
    if tokens:
        vector = sum([model.wv[word] for word in tokens]) / len(tokens)
    else:
        vector = np.zeros(model.wv.vector_size, dtype=model.wv.vectors.dtype)
    sentence_vectors.append(vector)


# 3: MODEL





In [None]:
# SPLIT DATA
# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable.
# NOTE(review): the split is not stratified, although the per-class supports in the reports
# below are imbalanced — consider passing stratify=data['dialect'].
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(data["text"], data['dialect'], test_size=0.2, random_state=42)


### SVC 
Training on the entire dataset took approximately 90 minutes.


In [None]:
# Build the pipeline: TF-IDF vectorization >> linear SVC
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

# `gamma` is a coefficient of the 'rbf', 'poly' and 'sigmoid' kernels only; it has
# no effect on a linear kernel, so it is not passed here.
model = SVC(kernel='linear', C=1)

pipeline = Pipeline([
    ('tfidf',  TfidfVectorizer()),
    ('svc', model)
])
# Fit the vectorizer and the classifier on the raw training text in one step.
pipeline.fit(train_X, train_y)

## LightGBM

In [None]:
# Build the pipeline: TF-IDF vectorization >> LightGBM classifier
from sklearn.pipeline import Pipeline
import lightgbm as lgb
# NOTE: this rebinds the global `model`; the SVC pipeline built earlier keeps its own reference to the SVC estimator.
model = lgb.LGBMClassifier()

lightgbm_pipeline = Pipeline([
    ('tfidf',  TfidfVectorizer()),

    ('lightgbm', model)
])
lightgbm_pipeline.fit(train_X, train_y)

# 4: Evaluation

## 1: evaluate SVC

In [None]:
# evaluation
from sklearn.metrics import classification_report
# Predict dialect labels for the held-out test sentences with the SVC pipeline.
predictions = pipeline.predict(test_X)

### Evaluation on 5000 samples

In [None]:
# NOTE(review): the report shown under this cell has a total support of 1000, i.e. it comes from a
# run on the 5000-row sample; the split cell as written uses the full `data`, so re-running
# the notebook will not reproduce this output.
print(classification_report(test_y, predictions))


              precision    recall  f1-score   support

           0       0.73      0.90      0.80       398
           1       0.78      0.66      0.72       182
           2       0.64      0.72      0.68       244
           3       0.79      0.45      0.57        85
           4       0.86      0.33      0.48        91

    accuracy                           0.72      1000
   macro avg       0.76      0.61      0.65      1000
weighted avg       0.73      0.72      0.71      1000



### Evaluation on the whole data 

In [None]:
# Per-class precision/recall/F1 of the SVC pipeline on the held-out test split.
print(classification_report(test_y, predictions))


              precision    recall  f1-score   support

           0       0.82      0.92      0.86     11525
           1       0.86      0.82      0.84      5607
           2       0.78      0.82      0.80      7251
           3       0.90      0.64      0.75      2298
           4       0.85      0.57      0.68      2848

    accuracy                           0.82     29529
   macro avg       0.84      0.75      0.79     29529
weighted avg       0.82      0.82      0.82     29529



## 2: Evaluate LightGBM
Training took about 2 minutes on the whole dataset, but in production the results are less accurate than the SVC's.

In [None]:
# evaluation
# Predict dialect labels for the same held-out test sentences with the LightGBM pipeline.
lightgbm_predictions = lightgbm_pipeline.predict(test_X)

In [None]:
# Per-class precision/recall/F1 of the LightGBM pipeline on the held-out test split.
print(classification_report(test_y, lightgbm_predictions))


              precision    recall  f1-score   support

           0       0.77      0.87      0.82     11525
           1       0.86      0.69      0.77      5607
           2       0.62      0.77      0.69      7251
           3       0.88      0.56      0.68      2298
           4       0.82      0.48      0.60      2848

    accuracy                           0.75     29529
   macro avg       0.79      0.67      0.71     29529
weighted avg       0.77      0.75      0.75     29529



# 5: Save the model

In [None]:
import pickle

# Persist the fitted SVC pipeline (vectorizer + classifier) to Google Drive.
# The context manager flushes and closes the file handle even if pickling fails.
with open("/content/drive/MyDrive/SVC_model.pkl", "wb") as model_file:
    pickle.dump(pipeline, model_file)


Test the saved model

In [None]:
# Mapping from the model's numeric dialect label to a country name.
_DIALECT_COUNTRIES = {
    0: "Egypt",
    1: "Lebanon",
    2: "Libya",
    3: "Morocco",
    4: "Sudan",
}


def dialect_predict(model_path, sentence):
    """Predict which country's dialect a sentence is written in.

    The pickled model is loaded from disk on every call.

    Args:
        model_path: Path to a pickled model object exposing ``predict``.
        sentence: A single sentence (string) to classify.

    Returns:
        The country name for the predicted label: "Egypt", "Lebanon",
        "Libya", "Morocco" or "Sudan".

    Raises:
        ValueError: If the model returns a label outside 0-4 (previously any
            unknown label was silently reported as "Sudan").
    """
    # SECURITY: pickle.load can execute arbitrary code; only load model files
    # from a trusted source.
    with open(model_path, "rb") as model_file:
        model = pickle.load(model_file)

    # predict() works on a batch, so wrap the sentence and unwrap the single label.
    label = model.predict([sentence])[0]
    try:
        return _DIALECT_COUNTRIES[int(label)]
    except (KeyError, TypeError, ValueError):
        raise ValueError(f"Unexpected dialect label: {label!r}") from None


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Location of the pickled SVC pipeline written by the save cell.
model_path = "/content/drive/MyDrive/SVC_model.pkl"

In [None]:
# Smoke-test the saved model on one example sentence.
dialect_predict(model_path,"ماما زمانها جايه")

'Egypt'