In [4]:
import pandas as pd
import os
import string
import re
import emoji
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# COLLECTING DATASETS

# Combine gathered dataframes

In [5]:
data_path = "./data/"
l = os.listdir(data_path)
l_without_QADI = []
for x in l:
    if x[3:7] != "QADI":
        l_without_QADI.append(x)
l_without_QADI_dfs = []
for x in l_without_QADI:
    l_without_QADI_dfs.append(pd.read_csv(data_path+x))
# add dialect label for each dataset
for index, df in enumerate(l_without_QADI_dfs):
    l_without_QADI_dfs[index]['dialect'] = l_without_QADI[index][:2]
# drop unneeded columns
for index, df in enumerate(l_without_QADI_dfs):
    l_without_QADI_dfs[index].drop(columns=['Unnamed: 0'],inplace=True)
all_dfs = pd.concat(l_without_QADI_dfs, ignore_index=True)

In [6]:
all_dfs.tail()

Unnamed: 0,text,dialect
1118821,به محل كتب هانا؟,YE
1118822,شاحضر المدرسة الصيفية في جامعة هاواي.,YE
1118823,منين بنخطى ذاحين؟,YE
1118824,اثنين كبار وجاهل عمرة ثلاث سنين.,YE
1118825,في حمام في المحطة.,YE


# Cleaning

In [7]:
stopwords_list = stopwords.words('arabic')

In [9]:
def cleaning_for_each_line(tweet):
    """"
    Remove stop words
    Remove emotions
    Remove numbers
    Remove any non-arabic characters
    Normalize Arabic words
    Remove mentions
    Remove Links
    """
    tweet = str(tweet)
    tweet = re.sub(r"(?:\@|http?s?://|www)\S+", " ", tweet)
    tweet = re.sub(r'&amp;|&quot;|&gt;', ' ', tweet)
    tweet = re.sub(r'(.)\1+', r'\1', tweet)
    tweet = re.sub(r'[^اأإآء-ي0-9\s]', ' ', tweet) 
    tweet = re.sub(r'[0-9]+', ' ', tweet)
    tweet = emoji.demojize(tweet, language='ar')
    tweet = tweet.replace("#", " ").replace("_", " ")
    tweet = ' '.join([word for word in word_tokenize(tweet) if word not in stopwords_list])
    tweet = re.sub(r'[إأآا]', 'ا', tweet)
    tweet = re.sub(r'ة', 'ه', tweet)
    tweet = re.sub(r'ى', 'ي', tweet)
    return tweet.strip()

def remove_punctuation(tweet):
    """
    Remove arabic and English punctuation marks
    """
    arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
    english_punctuations = string.punctuation
    punctuations = arabic_punctuations + english_punctuations
    translator = str.maketrans('', '', punctuations)
    return tweet.translate(translator)

def cleaning(df):
    """"
    Apply data cleaning
    """
    df['text'] = df['text'].apply(cleaning_for_each_line)
    df['text'] = df['text'].apply(remove_punctuation)
    df['text'] = df['text'].str.replace('’', ' ', regex=False)
    df['text'] = df['text'].str.replace(r'[\s\n\t]+', ' ', regex=True).str.strip()
    df = df.dropna(subset=['text'])
    df = df[df['text'].str.strip().astype(bool)]
    df = df.drop_duplicates()
    df = df.reset_index(drop=True)
    return df

In [10]:
# Do data cleaning
all_dfs = cleaning(all_dfs)

In [11]:
# Combine 2 datasets
other_df = pd.read_csv("arabic_dialects_clean.csv")
other_df = other_df[['text','dialect']]
l = list(other_df.dialect.value_counts().keys())[:18]
other_df = other_df[other_df['dialect'].isin(l)]
other_df = cleaning(other_df)
all_dfs = pd.concat([all_dfs, other_df], ignore_index=True)
all_dfs = cleaning(all_dfs)

  other_df = pd.read_csv("arabic_dialects_clean.csv")


In [12]:
all_dfs.dialect.value_counts()

dialect
DZ    154832
EG    133236
SA    105368
AE    103124
BH     78597
LB     62433
SY     62009
KW     55882
PL     54279
JO     53092
TN     52674
LY     39487
QA     33004
SD     31903
OM     26962
IQ     20966
YE     11868
MA     11533
Name: count, dtype: int64

In [13]:
# Get data
qadi = pd.read_csv('QADI.csv')
qadi = qadi[['text','label']]
qadi = cleaning(qadi)
qadi.rename(columns={'label': 'dialect'}, inplace=True)

In [14]:
map={0:'OM', 1:'SD', 2:'SA', 3:'KW', 4:'QA', 5:'LB', 6:'JO', 7:'SY', 8:'IQ', 9:'MA', 10:'EG', 11:'PL', 12:'YE', 13:'BH', 14:'DZ', 
     15:'AE', 16:'TN', 17:'LY'}
qadi['dialect'] = qadi['dialect'].map(map)

In [15]:
all_Data = pd.concat([all_dfs, qadi], ignore_index=True)
all_Data = cleaning(all_Data)

In [16]:
all_Data.dialect.value_counts()

dialect
DZ    154851
EG    133236
SA    105446
AE    103145
BH     78629
LB     62433
SY     62036
KW     55909
PL     54301
JO     53109
TN     52713
LY     39487
QA     33031
SD     31903
OM     26990
IQ     20976
YE     11882
MA     11533
Name: count, dtype: int64

In [17]:
all_Data.shape

(1091610, 2)

In [18]:
all_Data.to_csv("COLLECTED_DATA_2.csv")