In [1]:
import pandas as pd
import re
from google.colab import drive
import string
import pickle
import numpy as np
import gensim

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

from keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

from numpy.core.fromnumeric import shape
from keras.layers import Dense, Input, Dropout, Flatten, Embedding, CuDNNLSTM, LSTM
from keras.models import Sequential

In [2]:
# Mount Google Drive at /gdrive (Colab) so the dataset and model files
# referenced under /gdrive/MyDrive in this notebook are reachable.
drive.mount('/gdrive')

Mounted at /gdrive


In [3]:
# Location of the pickled dialect dataset on the mounted Drive;
# run_processing() opens this file and reads it with pickle.load.
DATASET_PATH = '/gdrive/MyDrive/text_dialect_dataset.pkl'

In [None]:
class WORD2VECTOR:
  """Small wrapper around a gensim Word2Vec model.

  It trains or loads the model, caches a ``word -> vector`` dict, builds an
  embedding matrix aligned with a tokenizer's ``word_index`` and averages
  word vectors into one vector per sentence.

  NOTE(review): ``size``/``iter``/``index2word``/``syn0``/``vocab`` are the
  gensim 3.x names; gensim 4 renamed them -- confirm the installed version.
  """

  def __init__(self, vector_size, iterations):
    """Store hyper-parameters; no model exists until create_model/load_model.

    Args:
      vector_size: dimensionality of each word vector.
      iterations: value passed to gensim as ``iter`` when training.
    """
    # Size of vector that represents the word
    self.vector_size = vector_size
    self.iterations = iterations
    self.word2vec_model = None
    self.w2v_dict = None
    self.num_of_unique_words = None
    self.gensim_weight_matrix = None

  # Build word2vec model
  def create_model(self, data):
    """Train a Word2Vec model on ``data`` and refresh the cached properties.

    Args:
      data: the corpus handed to ``gensim.models.Word2Vec`` (the notebook
        passes a collection of token lists).

    Returns:
      The trained gensim model (also stored on ``self.word2vec_model``).
    """
    self.word2vec_model = gensim.models.Word2Vec(
      data, min_count=3, size=self.vector_size, iter=self.iterations)
    self.initialize_w2v_properties()
    return self.word2vec_model

  # Create dict of words and their weights
  def initialize_w2v_properties(self):
    """Cache the word -> vector dict and the vocabulary size of the model."""
    keyed_vectors = self.word2vec_model.wv
    self.w2v_dict = dict(zip(keyed_vectors.index2word, keyed_vectors.syn0))
    # Number of unique words in model
    self.num_of_unique_words = len(keyed_vectors.vocab)

  # Create Embedding matrix for words
  def create_embedding_matrix(self, tokenizer):
    """Build a ``(num_of_unique_words, vector_size)`` embedding matrix.

    Row ``i`` holds the vector of the word whose ``tokenizer.word_index``
    value is ``i``. Rows for words the model does not know (and any row no
    word maps to) stay all-zero.

    Args:
      tokenizer: object exposing a ``word_index`` mapping of word -> int.

    Returns:
      The matrix (also stored on ``self.gensim_weight_matrix``).
    """
    self.gensim_weight_matrix = np.zeros((self.num_of_unique_words, self.vector_size))
    for word, index in tokenizer.word_index.items():
      # Every index in [0, num_of_unique_words) is a valid row. The previous
      # "- 1" bound left the last row empty even when a word mapped to it.
      if index < self.num_of_unique_words and word in self.w2v_dict:
        # w2v_dict holds the same vectors as the model, so no deprecated
        # direct model indexing is needed. Unknown words keep the zero row
        # the matrix was initialised with (sized by vector_size, not 300).
        self.gensim_weight_matrix[index] = self.w2v_dict[word]
    return self.gensim_weight_matrix

  # Get mean of each sentence to have one vector of N dimension representing the sentence
  def transform(self, data):
    """Average the known word vectors of each sentence.

    Args:
      data: iterable of token sequences.

    Returns:
      ``np.ndarray`` with one row per sentence; a sentence with no known
      word is represented by a zero vector of length ``vector_size``.
    """
    zero_vector = np.zeros(self.vector_size)
    sentence_vectors = []
    for words in data:
      known = [self.w2v_dict[w] for w in words if w in self.w2v_dict]
      sentence_vectors.append(np.mean(known, axis=0) if known else zero_vector)
    return np.array(sentence_vectors)

  def save_model(self, path, model=None):
    """Save ``model`` to ``path``; defaults to the model held by this instance."""
    if model is None:
      model = self.word2vec_model
    model.save(path)

  def load_model(self, path):
    """Load a saved Word2Vec model from ``path`` and refresh cached properties.

    Returns:
      The loaded gensim model (also stored on ``self.word2vec_model``).
    """
    self.word2vec_model = gensim.models.Word2Vec.load(path)
    self.initialize_w2v_properties()
    return self.word2vec_model

In [8]:
class DataPreProcessing:
  """Text-cleaning, tokenizing and padding helpers used by the notebook."""

  # One or more consecutive characters outside the range in the character class
  NON_ARABIC_PATTERN = r'[^؀-ۿ]+'
  # Mentions are stripped before the non-Arabic filter, in case a username is Arabic
  MENTION_PATTERN = r'@\S+'
  HASHTAG_PATTERN = r'#\S+'
  # A character followed by one or more copies of itself
  REPEATING_CHARACTERS = r'(.)\1+'

  def __init__(self):
    # Set by create_padding_tokenizer()
    self.tokenizer = None

  def preprocess_string(self, text):
    """Clean one string and return it stripped of surrounding whitespace.

    Substitutions run in this order: hashtags, mentions and non-Arabic runs
    each become a single space, then repeated characters collapse to one.
    """
    substitutions = (
      (self.HASHTAG_PATTERN, ' '),
      (self.MENTION_PATTERN, ' '),
      (self.NON_ARABIC_PATTERN, ' '),
      (self.REPEATING_CHARACTERS, r'\1'),
    )
    cleaned = text
    for pattern, replacement in substitutions:
      cleaned = re.sub(pattern, replacement, cleaned)
    # (disabled in the original) text = normalize.normalize_searchtext(text)
    return cleaned.strip()

  def tokenize_sentence(self, sentence):
    """Split a sentence into word tokens on whitespace."""
    tokens = sentence.split()
    return tokens

  def create_padding_tokenizer(self, num_of_unique_words):
    """Create a Tokenizer from ``num_of_unique_words``, keep it on self and return it."""
    new_tokenizer = Tokenizer(num_of_unique_words)
    self.tokenizer = new_tokenizer
    return new_tokenizer

  def pad_data(self, data, longest_sequence, tokenizer):
    """Convert texts to integer sequences and post-pad them to ``longest_sequence``."""
    # Texts -> lists of integer ids
    sequences = tokenizer.texts_to_sequences(data)
    # Equalise sequence lengths, padding at the end
    return pad_sequences(sequences, maxlen=longest_sequence, padding='post')

In [10]:
def run_processing(dataset_path=None, w2v_model_path='/gdrive/MyDrive/model4.bin',
                   vector_size=300, iterations=100):
  """Run the preprocessing pipeline and return the artifacts it builds.

  Steps: load the pickled dataframe, print basic diagnostics, clean and
  tokenize the ``text`` column, fit a TF-IDF vectorizer, load the saved
  word2vec model, fit the padding tokenizer and build the embedding matrix.

  Args:
    dataset_path: pickle file holding the dataframe; defaults to DATASET_PATH.
    w2v_model_path: path of the saved word2vec model to load.
    vector_size: word-vector dimensionality given to WORD2VECTOR.
    iterations: training iterations given to WORD2VECTOR (only relevant if
      the model is re-trained instead of loaded).

  Returns:
    dict with keys ``dataframe``, ``vectorizer``, ``word2vec``,
    ``word2vec_model``, ``tokenizer``, ``embedding_matrix`` and
    ``longest_sequence``. Assign the result (or end the cell with ``;``)
    to avoid echoing it in the notebook.
  """
  if dataset_path is None:
    dataset_path = DATASET_PATH

  # Load dataset
  # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
  with open(dataset_path, 'rb') as f:
    dialect_df = pickle.load(f)

  # First let's check for nulls and data types.
  # info() prints its own report and returns None, so it is not wrapped in print().
  dialect_df.info()
  print('---------------------------\n')

  # Check for duplicates
  print('Duplicates =', dialect_df.duplicated().sum())
  print('---------------------------\n')

  # Check for class count
  print(dialect_df['dialect'].value_counts())
  print('---------------------------')

  # Load preprocessing class
  dp = DataPreProcessing()

  # Replace each string with its processed version
  dialect_df['text'] = dialect_df['text'].apply(dp.preprocess_string)

  # Add new column for the tokenized string in dataframe
  dialect_df['tokenized_string'] = dialect_df['text'].apply(dp.tokenize_sentence)

  texts = dialect_df['text']
  token_lists = dialect_df['tokenized_string']

  # Build vectorizer
  vectorizer = TfidfVectorizer(min_df=4, max_df=.25)
  vectorizer.fit(texts.values)

  # Build word2vec model (training is disabled; the saved model is loaded instead)
  wv = WORD2VECTOR(vector_size, iterations)
  #wv_model = wv.create_model(token_lists)
  wv_model = wv.load_model(w2v_model_path)

  # Build tokenizer for padding
  longest_sequence = token_lists.apply(len).max()  # Longest sentence, in tokens
  unique_words_count = wv.num_of_unique_words  # Number of unique words in vocab
  tokenizer = dp.create_padding_tokenizer(unique_words_count)
  tokenizer.fit_on_texts(texts)

  # Create embedding matrix
  embed_matrix = wv.create_embedding_matrix(tokenizer)

  # Saving some stuff for further use (disabled; persist the returned artifacts instead)
  # Save new dataframe
  #with open('/gdrive/MyDrive/processed_dialect_dataset.pkl', 'wb') as f:
  #  pickle.dump(dialect_df, f)

  # Save vectorizer
  #with open('/gdrive/MyDrive/tfidf_vecctorizer.pkl', 'wb') as f:
  #  pickle.dump(vectorizer, f)

  # Save word2vec model
  #wv.save_model('/gdrive/MyDrive/model4.bin', wv_model)

  # Save tokenizer
  #with open('/gdrive/MyDrive/tokenizer.pkl', 'wb') as f:
  #  pickle.dump(tokenizer, f)

  # Save embedding matrix
  #with open('/gdrive/MyDrive/embedding_matrix.pkl', 'wb') as f:
  #  pickle.dump(embed_matrix, f)

  return {
    'dataframe': dialect_df,
    'vectorizer': vectorizer,
    'word2vec': wv,
    'word2vec_model': wv_model,
    'tokenizer': tokenizer,
    'embedding_matrix': embed_matrix,
    'longest_sequence': longest_sequence,
  }

In [None]:
#run_processing()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458197 entries, 0 to 458196
Data columns (total 3 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   id       458197 non-null  object
 1   dialect  458197 non-null  object
 2   text     458197 non-null  object
dtypes: object(3)
memory usage: 10.5+ MB
None
---------------------------

Duplicates = 0
---------------------------

EG    57636
PL    43742
KW    42109
LY    36499
QA    31069
JO    27921
LB    27617
SA    26832
AE    26296
BH    26292
OM    19116
SY    16242
DZ    16183
IQ    15497
SD    14434
MA    11539
YE     9927
TN     9246
Name: dialect, dtype: int64
---------------------------


  app.launch_new_instance()
