In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the collected corpus and discard the 'Unnamed: 0' column
# (presumably a stale index saved by an earlier export -- TODO confirm).
df = pd.read_csv('COLLECTED_DATA_2.csv').drop(columns=['Unnamed: 0'])

In [3]:
# Number of whitespace-separated tokens per row; values are passed through str() first.
df['word_count'] = df['text'].map(lambda text: len(str(text).split()))

In [4]:
# Rows per dialect code in the raw data.
df['dialect'].value_counts()

dialect
DZ    154851
EG    133236
SA    105446
AE    103145
BH     78629
LB     62433
SY     62036
KW     55909
PL     54301
JO     53109
TN     52713
LY     39487
QA     33031
SD     31903
OM     26990
IQ     20976
YE     11882
MA     11533
Name: count, dtype: int64

In [5]:
# Summary statistics of the per-row word counts.
df['word_count'].describe()

count    1.091610e+06
mean     1.475619e+01
std      1.152860e+02
min      1.000000e+00
25%      5.000000e+00
50%      8.000000e+00
75%      1.400000e+01
max      2.531300e+04
Name: word_count, dtype: float64

In [6]:
# Split long texts into chunks of at most `max_l` words and drop fragments
# shorter than `min_l` words. Each kept piece becomes one record that carries
# the dialect of the row it came from.
max_l = 14  # maximum number of words per chunk
min_l = 5   # pieces with fewer words than this are discarded
l_row = []
# Iterate over the two needed columns directly: iterrows() builds a Series for
# every row, which is very slow on a frame of this size.
for text, dialect in zip(df['text'], df['dialect']):
    words = str(text).split()
    word_count = len(words)
    if word_count >= max_l:
        # Consecutive, non-overlapping windows; a short trailing remainder is dropped.
        for i in range(0, word_count, max_l):
            chunk = words[i:i + max_l]
            if len(chunk) >= min_l:
                l_row.append({'text': ' '.join(chunk), 'dialect': dialect})
    elif word_count >= min_l:
        # Short enough to keep whole; the text is stored exactly as it was read.
        l_row.append({'text': text, 'dialect': dialect})

In [7]:
# Build the chunked dataset. The columns are pinned so the frame still has
# 'text' and 'dialect' when no record was collected.
df_new = pd.DataFrame(l_row, columns=['text', 'dialect'])

In [8]:
# Keep the first occurrence of every (text, dialect) pair.
df_new = df_new.drop_duplicates(subset=['text', 'dialect'])

In [9]:
# Word count of every chunk (whitespace-separated tokens).
df_new['word_count'] = df_new['text'].map(lambda text: len(str(text).split()))

In [10]:
# Rows per dialect code after chunking and de-duplication.
df_new['dialect'].value_counts()

dialect
DZ    309530
EG    208077
SA    145195
AE    126255
LB     69521
KW     66094
SY     64522
JO     60224
BH     56809
PL     53033
LY     38501
SD     38141
TN     36855
QA     35582
OM     23928
IQ     19932
MA     11082
YE      9608
Name: count, dtype: int64

In [11]:
# Sample sizes are based on the Iraqi (IQ) subset: 18,000 rows per dialect group.

In [12]:
# Country codes that make up each dialect group.
Khaleeji = ['SA', 'AE', 'KW', 'BH', 'QA', 'OM']
Levantine = ['LB', 'JO', 'PL', 'SY']
Maghrebi = ['MA', 'DZ', 'TN']

# Groups made of a single country code.
Iraqi = ['IQ']
Lybian = ['LY']
Egyptian = ['EG']
sudani = ['SD']

In [13]:
# All chunks labelled with the IQ code.
Iraqi = df_new.loc[df_new['dialect'] == 'IQ']

In [14]:
# Share of IQ rows at each word count (counts divided by the number of rows).
iraqi_lengths = Iraqi['word_count']
Iraqi_distribution = iraqi_lengths.value_counts().div(iraqi_lengths.count())

In [15]:
# Plain mapping: word count -> share of IQ rows with that word count.
distribution = {length: share for length, share in Iraqi_distribution.items()}

In [17]:
def sample_distribution(df, num, distribution):
    """Sample rows of ``df`` so their word counts follow ``distribution``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``word_count`` column.
    num : int or float
        Target number of rows overall.
    distribution : dict
        Maps a word-count value to the share of ``num`` to draw for it.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their index labels from ``df``. For each
        word count, ``int(share * num)`` rows are drawn, capped at the number
        of rows available. If nothing is drawn, an empty frame with the
        columns of ``df`` is returned.
    """
    parts = []
    for word_count_value, share in distribution.items():
        quota = int(share * num)  # truncates, so the total may fall slightly below num
        subset = df[df['word_count'] == word_count_value]
        take = min(quota, len(subset))
        if take == 0:
            # Nothing to draw; empty frames are also kept out of the concat.
            continue
        # Fixed seed keeps the draw reproducible.
        parts.append(subset.sample(n=take, random_state=42))
    if not parts:
        return df.iloc[0:0].copy()
    # Concatenate once instead of re-copying a growing frame on every iteration.
    return pd.concat(parts)

In [18]:
# Target number of rows per dialect group.
num_samples = 18_000

In [19]:
# Khaleeji group: the group quota is split equally across its six countries.
khaleeji_codes = ('SA', 'AE', 'KW', 'BH', 'QA', 'OM')
sa_df, ae_df, kw_df, bh_df, qa_df, om_df = [
    sample_distribution(
        df_new[df_new['dialect'] == code].reset_index(drop=True),
        num_samples / len(khaleeji_codes),
        distribution,
    )
    for code in khaleeji_codes
]
Khaleeji = pd.concat(
    [sa_df, ae_df, kw_df, bh_df, qa_df, om_df],
    ignore_index=True,
)

In [20]:
# Levantine group: the group quota is split equally across its four countries.
levantine_codes = ('LB', 'JO', 'PL', 'SY')
lb_df, jo_df, pl_df, sy_df = [
    sample_distribution(
        df_new[df_new['dialect'] == code].reset_index(drop=True),
        num_samples / len(levantine_codes),
        distribution,
    )
    for code in levantine_codes
]
Levantine = pd.concat([lb_df, jo_df, pl_df, sy_df], ignore_index=True)

In [21]:
# Maghrebi group: the group quota is split equally across its three countries.
maghrebi_codes = ('MA', 'DZ', 'TN')
MA_df, DZ_df, TN_df = [
    sample_distribution(
        df_new[df_new['dialect'] == code].reset_index(drop=True),
        num_samples / len(maghrebi_codes),
        distribution,
    )
    for code in maghrebi_codes
]
Maghrebi = pd.concat([MA_df, DZ_df, TN_df], ignore_index=True)

In [22]:
# Single-country groups: each one receives the full group quota.
IQ_df, EG_df, LY_df, SD = [
    sample_distribution(
        df_new[df_new['dialect'] == code].reset_index(drop=True),
        num_samples,
        distribution,
    )
    for code in ('IQ', 'EG', 'LY', 'SD')
]

In [23]:
# Overwrite the country codes with the group-level label of each frame.
# NOTE(review): 'Lybia' looks like a misspelling; kept as-is because the label
# is written to the output data -- confirm before renaming.
group_labels = [
    (Khaleeji, 'Khaleeji'),
    (Levantine, 'Levantine'),
    (Maghrebi, 'Maghrebi'),
    (IQ_df, 'Iraqi'),
    (LY_df, 'Lybia'),
    (EG_df, 'Egyptian'),
    (SD, 'Sudani'),
]
for group_frame, group_label in group_labels:
    group_frame['dialect'] = group_label

In [24]:
# Stack the seven group frames into one dataset with a fresh 0..n-1 index.
dialect_frames = [Khaleeji, Levantine, Maghrebi, IQ_df, LY_df, EG_df, SD]
all_dialects = pd.concat(dialect_frames, ignore_index=True)

In [25]:
# Keep the first occurrence of each (text, dialect, word_count) combination.
all_dialects = all_dialects.drop_duplicates(
    subset=['text', 'dialect', 'word_count'],
)

In [29]:
# Write the final dataset to disk without the index column.
all_dialects.to_csv(path_or_buf='FINAL_data2.csv', index=False)