### Word Frequency and N-Grams

#### Import modules

In [92]:
import sys, os
import pandas as pd
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from util import Util

In [93]:
# Relative paths of the cleaned input data and of the final output artifacts.
cleaned_dir = "../data/cleaned"
final_dir = "../data/final"
# Base file name (without extension) used for the input CSV and the CSVs saved later.
file_name = "TIKVAH-ETHIOPIA"
# Helper object from the project's `util` module.
# NOTE(review): not referenced in the cells visible in this section — confirm it is still needed.
util = Util()

In [94]:
# Load the cleaned CSV; its 'id' column becomes the DataFrame index.
df = pd.read_csv(f"{cleaned_dir}/{file_name}.csv", index_col='id')
# Preview the first rows.
df.head()

Unnamed: 0_level_0,text,date,hashtags,emojis,symbols,links
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
83647,ለጥያቄና መልስ ውድድራችን ዝግጁ ነዎት? 3ኛ ዙር የጥያቄና መልስ ውድድር...,2023-12-19T11:08:29,"['#አቢሲንያ_ባንክ', '#telegram', '#giveaway', '#con...",,፤,['https://t.me/BoAEth']
83650,ስትሮክ ፦ ስትሮክ በተለያየ ምከንያት ኦክስጅንን ለአእምሮ ሚያደርሰው የደ...,2023-12-19T11:08:40,['#እንድታውቁት'],☑,-።።።-።-።-።,[]
83653,ዛሬ የሕዝብ ተወካዮች ምክር ቤት ባካሄደው መደበኛ ስብሰባው ወ/ሮ ምእላተ...,2023-12-19T11:09:18,['#NEBE'],,።።-።።-።-።።-።-።-።-።-።።።።።።።-።-።-።።,[]
83655,ኬኖ ለሚባለው የቁጥር ግመታ ጨዋታ ፈቃድ አልሰጠሁም የብሄራዊ ሎተሪ አስተ...,2023-12-19T11:25:42,[],,"""''""-""""፤""""።""""።",[]
83657,ታህሳስ 7 ቀን 2016 ኣ/ም ምሽት አዲስ አበባ ቦሌ ኤድና ሞል አካባቢ ...,2023-12-19T14:11:36,"['#AddisAbaba', '#አባተ_አበበ', '#መሳሪያ_በታጠቀ']",,።።።።።።፤።,[]


#### Word Frequency

In [95]:
# Function to split each text into word tokens and flatten them into a single list
from functools import reduce
import operator


def term_freq(x):
    """Split every text in ``x`` on whitespace and return all words as one flat list.

    Args:
        x: Object exposing ``tolist()`` (e.g. a pandas Series of texts).

    Returns:
        list: The words of all texts, in their original order. An empty
        input yields an empty list.
    """
    # NOTE: each value goes through str(), so a missing value (NaN) contributes
    # the literal token "nan".
    # The nested comprehension flattens in a single linear pass. The previous
    # reduce(operator.add, ...) rebuilt the accumulated list for every row and
    # raised TypeError when there were no rows at all.
    return [word for text in x.tolist() for word in str(text).split()]

In [96]:
# function to count the term/word and create a dataframe for frequencies
from collections import Counter

def counting(x, y=None):
    """Count the items of ``x`` and return the frequencies as a DataFrame.

    Args:
        x: Iterable of hashable items (e.g. the word list from ``term_freq``).
        y: Unused. Kept only so existing two-argument calls keep working;
            it now defaults to ``None`` so callers need not pass a dummy.

    Returns:
        pandas.DataFrame: Columns ``'Word'`` and ``'Count'``, ordered from the
        most to the least frequent item.
    """
    # most_common() with no argument returns every (item, count) pair,
    # sorted by descending count.
    ranked = Counter(x).most_common()
    return pd.DataFrame(ranked, columns=['Word', 'Count'])


In [97]:
# Build the word-frequency table from the 'text' column.
# The second argument (2) is not used by counting().
df_frequency = counting(term_freq(df['text']), 2)
# Preview the most frequent words.
df_frequency.head()

Unnamed: 0,Word,Count
0,ነው,240
1,እና,220
2,ላይ,164
3,ኢትዮጵያ,126
4,ጋር,89


#### Save word frequency

In [98]:
# Save the frequency table; the row index is written as well (pandas default index=True).
df_frequency.to_csv(f"{final_dir}/{file_name}_frequency.csv")

#### n-grams

In [157]:
import re


def list_and_tokenize(data):
    """Return the whitespace-separated tokens of ``data``.

    ``data`` is converted with ``str()`` first, so non-string values are
    tokenized by their string form.
    """
    text = str(data)
    tokens = text.split()
    return tokens

In [158]:
# function to prepare n-grams
import collections

def count_n_grams(lines, min_length=2, max_length=4):
    """Count every word n-gram of length ``min_length``..``max_length`` in ``lines``.

    The lines are tokenized with ``list_and_tokenize`` and treated as one
    continuous stream of words: the sliding window is not reset between
    lines, so an n-gram may span the end of one line and the start of the next.

    Args:
        lines: Iterable of texts.
        min_length: Smallest n-gram length to count.
        max_length: Largest n-gram length to count.

    Returns:
        dict: Maps each length to a ``collections.Counter`` whose keys are
        tuples of words and whose values are occurrence counts.
    """
    lengths = range(min_length, max_length + 1)
    n_grams = {length: collections.Counter() for length in lengths}
    # Sliding window over the word stream; once full, appending drops the
    # oldest word automatically (maxlen).
    queue = collections.deque(maxlen=max_length)

    def add_queue():
        """Count the n-grams of every tracked length that start at the window head."""
        current = tuple(queue)
        for length in lengths:
            if len(current) >= length:
                n_grams[length][current[:length]] += 1

    # Slide the window over all words; each full window counts the n-grams
    # starting at its first word.
    for line in lines:
        for word in list_and_tokenize(line):
            queue.append(word)
            if len(queue) >= max_length:
                add_queue()

    # If there were fewer than max_length words in total, the window never
    # filled and its head was never counted above. Count it once here, otherwise
    # every n-gram starting at the very first word would be lost.
    if 0 < len(queue) < max_length:
        add_queue()

    # Count the shorter n-grams that start at the remaining tail positions.
    while len(queue) > min_length:
        queue.popleft()
        add_queue()

    return n_grams

In [161]:
# Module-level result tables; print_most_freq_ng() rebinds them on every call.
bigram_to_df = pd.DataFrame({'2-grams': [], '2-grams freq': []})
trigram_to_df = pd.DataFrame({'3-grams': [], '3-grams freq': []})
quadgram_to_df = pd.DataFrame({'4-grams': [], '4-grams freq': []})

# Column buffers behind the tables above (parallel lists: n-gram and its count).
bigram = {'2-grams': [], '2-grams freq': []}
trigram = {'3-grams': [], '3-grams freq': []}
quadgram = {'4-grams': [], '4-grams freq': []}


def print_most_freq_ng(n_grams, num=30):
    """Store the ``num`` most frequent 2-, 3- and 4-grams in the module-level tables.

    Despite its name this function prints nothing. It refills the
    module-level buffers ``bigram``, ``trigram`` and ``quadgram`` and rebinds
    ``bigram_to_df``, ``trigram_to_df`` and ``quadgram_to_df`` to fresh
    DataFrames built from them.

    Args:
        n_grams: Mapping from n-gram length to an object providing
            ``most_common(num)`` (e.g. ``collections.Counter``), such as the
            result of ``count_n_grams``.
        num: Maximum number of n-grams kept per length.

    Returns:
        None. Results are exposed through the module-level variables.
    """
    global bigram_to_df, trigram_to_df, quadgram_to_df

    # n-gram length -> (buffer, n-gram column, count column)
    targets = {
        2: (bigram, '2-grams', '2-grams freq'),
        3: (trigram, '3-grams', '3-grams freq'),
        4: (quadgram, '4-grams', '4-grams freq'),
    }

    # Empty the buffers in place so that calling this function again (e.g.
    # re-running the notebook cell) does not append duplicate rows.
    for buffer, gram_col, freq_col in targets.values():
        buffer[gram_col].clear()
        buffer[freq_col].clear()

    for n in sorted(n_grams):
        # Only lengths 2, 3 and 4 have a table; other lengths are skipped
        # rather than being filed under the "4-grams" columns.
        if n not in targets:
            continue
        buffer, gram_col, freq_col = targets[n]
        for gram, count in n_grams[n].most_common(num):
            buffer[gram_col].append(gram)
            buffer[freq_col].append(count)

    bigram_to_df = pd.DataFrame({'2-grams': bigram['2-grams'], '2-grams freq': bigram['2-grams freq']})
    trigram_to_df = pd.DataFrame({'3-grams': trigram['3-grams'], '3-grams freq': trigram['3-grams freq']})
    quadgram_to_df = pd.DataFrame({'4-grams': quadgram['4-grams'], '4-grams freq': quadgram['4-grams freq']})

In [162]:
# Count the n-grams of the 'text' column and fill the module-level *_to_df tables.
print_most_freq_ng(count_n_grams(df['text']))
# Put the 2-, 3- and 4-gram tables side by side (column-wise concatenation).
n_gram_df = pd.concat([bigram_to_df, trigram_to_df, quadgram_to_df], axis=1)

n_gram_df

Unnamed: 0,2-grams,2-grams freq,3-grams,3-grams freq,4-grams,4-grams freq
0,"(የአዲስ, አበባ)",40,"(የአዲስ, አበባ, ከተማ)",21,"(የአዲስ, አበባ, ከተማ, አስተዳደር)",12
1,"(ቲክቫህ, ኢትዮጵያ)",37,"(ኢትዮጵያ, ቤተሰብ, አባል)",16,"(የቲክቫህ, ኢትዮጵያ, ቤተሰብ, አባል)",10
2,"(አበባ, ከተማ)",30,"(አበባ, ከተማ, አስተዳደር)",14,"(ቤተሰብ, አባል, ነው, @tikvahethiopia)",10
3,"(ለቲክቫህ, ኢትዮጵያ)",25,"(ምን, አሉ, ?)",13,"(ኢትዮጵያ, ቤተሰብ, አባል, ነው)",9
4,"(ነው, @tikvahethiopia)",23,"(የቲክቫህ, ኢትዮጵያ, ቤተሰብ)",12,"(የመቐለ, የቲክቫህ, ኢትዮጵያ, ቤተሰብ)",6
5,"(ቤተሰብ, አባል)",20,"(ቤተሰብ, አባል, ነው)",11,"(ቲክቫህ, ኢትዮጵያ, ቤተሰብ, አባል)",6
6,"(ከተማ, አስተዳደር)",19,"(አባል, ነው, @tikvahethiopia)",10,"(ሰርቪስ, እና, የሰው, ሀብት)",6
7,"(ቀን, 2016)",18,"(ቴክኖ, ሞባይል, ኢትዮጵያ)",7,"(እና, የሰው, ሀብት, ልማት)",6
8,"(ኢትዮጵያ, ቤተሰብ)",18,"(የመቐለ, የቲክቫህ, ኢትዮጵያ)",7,"(የአ/አ, ቲክቫህ, ኢትዮጵያ, ቤተሰብ)",5
9,"(በአዲስ, አበባ)",16,"(ሰርቪስ, እና, የሰው)",7,"(አበባ, ከተማ, አስተዳደር, ፐብሊክ)",5


#### Save n-grams

In [163]:
# Save the combined n-gram table; the row index is written as well (pandas default index=True).
n_gram_df.to_csv(f"{final_dir}/{file_name}_n_gram.csv")