# Data Importing & Cleaning

In [None]:
# Read the Arabic idioms CSV from the notebook's input directory into `data`
# and display the resulting frame.
import pandas as pd

data = pd.read_csv(
    '../input/arabic-idioms-dataset/Dataset.csv',
    sep=',',
)
data

In [None]:
# How many rows carry each distinct value of the 'class' column.
data['class'].value_counts()

In [None]:
# How many rows carry each distinct value of the 'target' column (the label
# used by every model below).
data['target'].value_counts()

# Data Modeling

In [None]:
from sklearn.model_selection import train_test_split

# First experiment: 'class' is the only feature (selected with a list so X
# stays a one-column DataFrame) and 'target' is the label.
y = data['target']
X = data[['class']]

# Hold out 33% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

for _caption, _split in (('Training Data Shape:', X_train),
                         ('Testing Data Shape: ', X_test)):
    print(_caption, _split.shape)

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic-regression model trained on the training split.
# The fit call is the cell's last expression, so the fitted estimator is displayed.
lr_model = LogisticRegression(solver='lbfgs')
lr_model.fit(X=X_train, y=y_train)

In [None]:
from sklearn import metrics

# Score the logistic-regression model on the held-out rows.
predictions = lr_model.predict(X_test)

# Printed in order: confusion matrix, per-class report, overall accuracy.
for _score in (metrics.confusion_matrix,
               metrics.classification_report,
               metrics.accuracy_score):
    print(_score(y_test, predictions))

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the same split; fit() returns the estimator itself,
# so construction and training are chained.
nb_model = MultinomialNB().fit(X_train, y_train)
predictions = nb_model.predict(X_test)

# Printed in order: confusion matrix, per-class report, overall accuracy.
# `metrics` is the sklearn module imported in an earlier cell.
for _score in (metrics.confusion_matrix,
               metrics.classification_report,
               metrics.accuracy_score):
    print(_score(y_test, predictions))

In [None]:
from sklearn.svm import SVC

# Support-vector classifier on the same split; fit() returns the estimator itself.
svc_model = SVC(gamma='auto').fit(X_train, y_train)
predictions = svc_model.predict(X_test)

# Printed in order: confusion matrix, per-class report, overall accuracy.
# `metrics` is the sklearn module imported in an earlier cell.
for _score in (metrics.confusion_matrix,
               metrics.classification_report,
               metrics.accuracy_score):
    print(_score(y_test, predictions))

In [None]:
from sklearn.model_selection import train_test_split

# Second experiment: the raw 'idiom' text is the input, 'target' is still the label.
# This rebinds X / y and the four split variables used by the earlier models.
y = data['target']
X = data['idiom']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Label frequencies in the training and test splits, shown side by side as a
# tuple so the balance of the two splits can be compared.
y_train.value_counts(), y_test.value_counts()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Step 1 of the manual TF-IDF route: fit a CountVectorizer on the training
# idioms only and transform them into a count matrix.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=X_train)

# Shape of the resulting training count matrix.
X_train_counts.shape

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

# Step 2 of the manual TF-IDF route: pass the count matrix through a
# TfidfTransformer fitted on the training counts.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# Shape of the transformed training matrix.
X_train_tfidf.shape

In [None]:
# Preview the TF-IDF matrix as a table (one row per training idiom, one column
# per vocabulary term).
# pd.DataFrame(<SciPy sparse matrix>) does not expand the matrix into feature
# columns -- it ends up as a single object column holding one sparse row per
# cell -- so build the frame with the sparse accessor instead. This keeps the
# data sparse rather than densifying it.
pd.DataFrame.sparse.from_spmatrix(X_train_tfidf).head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# One-step alternative to CountVectorizer + TfidfTransformer. It must be fed the
# raw training text (X_train), not the count matrix; the result replaces the
# X_train_tfidf computed in the previous step.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(raw_documents=X_train)

# Shape of the TF-IDF training matrix.
X_train_tfidf.shape

In [None]:
from sklearn.svm import LinearSVC

# Linear SVM trained directly on the TF-IDF training matrix.
# The fit call is the cell's last expression, so the fitted estimator is displayed.
clf = LinearSVC()
clf.fit(X=X_train_tfidf, y=y_train)

In [None]:
from sklearn import metrics
from sklearn.pipeline import Pipeline

# Bundle vectorisation and classification so that raw idiom strings can be passed
# straight to fit() / predict().
# The pipeline gets its own LinearSVC instance: Pipeline does not clone its steps,
# so handing it the already-fitted `clf` would silently refit that shared object
# whenever the pipeline is fitted.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])
text_clf.fit(X_train, y_train)

# Evaluate on the held-out idioms; the pipeline vectorises X_test with the
# vocabulary learned from X_train before classifying.
predictions = text_clf.predict(X_test)
print(metrics.confusion_matrix(y_test, predictions))

In [None]:
# Per-class classification report for the text pipeline's test-set predictions.
print(metrics.classification_report(y_test,predictions)) 

In [None]:
# Overall accuracy of the text pipeline on the test split.
print(metrics.accuracy_score(y_test,predictions))

In [None]:
# Spot-check the pipeline on one phrase ("the black market"). predict() takes a
# list of documents and returns one label per document, hence the [0].
text_clf.predict(['السوق السوداء'])[0]

# Feature Engineering

In [None]:
# Display the frame as it stands before any engineered feature is added.
data

In [None]:
from nltk.tokenize import word_tokenize

# Engineered feature "N": the number of tokens word_tokenize() produces for
# each idiom. Note that this adds the column to `data` in place.
data["N"] = data["idiom"].map(lambda idiom: len(word_tokenize(idiom)))
data

In [None]:
# Re-split the idiom text with a different hold-out fraction (25%) and seed,
# then rebuild the features and the classifier on the new training split.
# train_test_split, CountVectorizer, TfidfTransformer, TfidfVectorizer and
# LinearSVC are imported in earlier cells.
X = data['idiom']
y = data['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=70
)

# Two-step route: raw term counts, then TF-IDF weighting of those counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# One-step route on the raw text. This rebinds X_train_tfidf, so the classifier
# below is trained on the TfidfVectorizer output, not on the two-step result.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

# Only the last expression of a cell is displayed, so the bare `.shape` lines
# that used to sit between these steps never showed anything and were dropped.
clf = LinearSVC()
clf.fit(X_train_tfidf, y_train)

In [None]:
from sklearn import metrics

# TF-IDF + linear SVM pipeline, trained on the current (25% hold-out) split of
# raw idiom text. Pipeline, TfidfVectorizer and LinearSVC come from earlier cells.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])
pipeline.fit(X_train, y_train)

# Confusion matrix of the pipeline's predictions on the held-out idioms.
predictions = pipeline.predict(X_test)
print(metrics.confusion_matrix(y_test, predictions))

In [None]:
import csv
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional,SimpleRNN
import nltk

In [None]:
# Settings for the Keras text-preprocessing steps that follow.
vocab_size, embedding_dim, max_length = 5000, 64, 200
oov_tok = '<OOV>'        # placeholder token for out-of-vocabulary words
training_portion = 0.8   # fraction of rows that goes to the training slice

# Inputs (idiom text) and labels, taken straight from the frame.
articles, labels = data['idiom'], data['target']

# Both lengths should match: one label per idiom.
len(labels), len(articles)

In [None]:
# Peek at the entry with index label 0: its target and its idiom text.
labels[0],articles[0]

In [None]:
# Sequential (unshuffled) split: the first `training_portion` of the rows is the
# training set and the remainder is the validation set.
train_size = int(training_portion * len(articles))

train_articles, validation_articles = articles.iloc[:train_size], articles.iloc[train_size:]
train_labels, validation_labels = labels.iloc[:train_size], labels.iloc[train_size:]

In [None]:
# Report the size of each slice; article and label counts should agree pairwise.
for _name, _slice in (
    ('train_articles', train_articles),
    ('train_labels', train_labels),
    ('validation_articles', validation_articles),
    ('validation_labels', validation_labels),
):
    print(_name, len(_slice))

In [None]:
# Build the word -> integer index from the training idioms only, so the
# validation slice does not contribute to the vocabulary.
tokenizer = Tokenizer(
    oov_token=oov_tok,
    num_words=vocab_size,
)
tokenizer.fit_on_texts(train_articles)

# Keep a handle on the learned mapping for inspection.
word_index = tokenizer.word_index

In [None]:
# Display the word -> integer mapping learned by the tokenizer.
word_index

In [None]:
# Encode one sample sentence ("The boy wrote the homework.") with the fitted
# tokenizer to inspect the integer ids it produces.
tokenizer.texts_to_sequences(['كتب الولد الواجب.'])


In [None]:
# Convert every training idiom into its list of integer word ids.
train_sequences = tokenizer.texts_to_sequences(train_articles)
# Number of encoded training sequences (one per training idiom).
len(train_sequences)

In [None]:
# Integer ids of the first training idiom.
train_sequences[0]


In [None]:
# Bring every training sequence to the fixed length `max_length`; the padding and
# truncation sides are left at the library defaults.
train_padded = pad_sequences(train_sequences, maxlen=max_length,)
# Number of padded sequences and the length of the first one.
len(train_padded),len(train_padded[0])

In [None]:
# Encode and pad the validation idioms with the tokenizer fitted on the training
# slice and the same `max_length` as the training data.
validation_sequences = tokenizer.texts_to_sequences(validation_articles)
validation_padded = pad_sequences(validation_sequences, maxlen=max_length)
# Distinct label values present in the full dataset.
set(labels)