## Import necessary libs

In [1]:
import re
import unicodedata as ud
import numpy as np
import math
import collections
from string import punctuation
from nltk import ngrams
from nltk import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
from nltk.stem import PorterStemmer, ISRIStemmer
import json
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from collections import Counter
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Read the input parameters

In [2]:
# Run parameters:
#   typeofclass -> 1 for word-based classification, 2 for character-based
#   lang        -> "en" for the English corpus, "ar" for the Arabic one
#   n           -> size of the n-grams to extract
typeofclass, lang, n = 1, "en", 1

# Absolute path of the corpora root directory.
c_path = os.path.abspath("./corpora")

# Interactive alternative to the hard-coded values above (disabled):
# typeofclass = int(
#     input("to use word based classification use 1, for character based one use 2: "))
# lang = input("enter en for english corpus, ar for the arabic one: ")
# n = int(input("Enter the number of ngrams: "))

### Configure tokenizers, stemmers, and stopwords based on the parameter values

In [3]:

# Pick the language-specific resources. Any value of `lang` other than "en"
# selects the Arabic stopwords and stemmer.
if lang == "en":
    lang_dir = "english"
    stemer = PorterStemmer()
else:
    lang_dir = "arabic"
    stemer = ISRIStemmer()

# Re-running this cell must not append the language folder a second time, so
# drop a language folder left by a previous run before adding the current one.
if os.path.basename(c_path) in ("english", "arabic"):
    c_path = os.path.dirname(c_path)
c_path = os.path.join(c_path, lang_dir)

# Keep the word list under its own name: rebinding `stopwords` would shadow the
# imported nltk corpus reader and make this cell fail when executed again.
stopword_list = stopwords.words(lang_dir)

# Tokens to discard: language stopwords plus ASCII punctuation characters.
stoplist = set(stopword_list + list(punctuation))

# Tokenizer: runs of word characters, dollar amounts, or any other
# non-whitespace run.
retoken = RegexpTokenizer(r'\w+|\$[\d\.]+|\S+')

# Tokens matched by `pattern` are dropped during preprocessing: tokens that
# start with a digit and, for Arabic text, also tokens that start with Latin
# letters or digits.
pattern = re.compile(r'^([0-9]+\w*)')
if lang == "ar":
    pattern = re.compile(r'^([0-9]+\w*)|[a-zA-Z0-9]+')

### preprocessing function

In [4]:
def preprocess_text(text: str):
    """Turn raw text into a list of stemmed tokens (or of their characters).

    Steps: drop every character whose Unicode category starts with 'P'
    (punctuation), re-tokenize with ``retoken`` and ``nltk.word_tokenize``,
    discard stop-list entries, purely numeric tokens and tokens matched by
    ``pattern``, then stem what is left with ``stemer``.

    Returns:
        The list of stems, or, when ``typeofclass == 2``, the list of the
        individual characters of those stems in order.
    """
    without_punct = ''.join(
        ch for ch in text if not ud.category(ch).startswith('P'))
    normalized = ' '.join(retoken.tokenize(without_punct))

    kept = []
    for word in nltk.word_tokenize(normalized):
        # Stop-list and digit checks are case-insensitive.
        if word.lower() in stoplist:
            continue
        if word.lower().isdigit():
            continue
        if pattern.match(word) is not None:
            continue
        kept.append(word)

    stems = [stemer.stem(word) for word in kept]
    if typeofclass == 2:
        # Character-based mode: flatten the stems into single characters.
        return [ch for stem in stems for ch in stem]
    return stems

### Reading the corpus files

In [5]:
# Corpus reader rooted at c_path; the ".*" file-id pattern places no
# restriction on file names.
corpus = PlaintextCorpusReader(c_path, ".*")

### Building an n-gram → frequency dictionary for each corpus file

In [6]:
# corpus_ngrams: one list of n-grams per sentence, across all files.
# dictNgrams:    file id -> ordered {n-gram string: count}, sorted by n-gram.
corpus_ngrams = []
# Use the public accessor rather than the private `_fileids` attribute.
filesids = corpus.fileids()
dictNgrams = {}
for f in filesids:
    file_ngrams = []
    for sent in corpus.sents(f):
        # Sentences arrive pre-tokenized; re-join them so the project's own
        # preprocessing does the tokenizing, filtering and stemming.
        sent_words = preprocess_text(" ".join(sent))
        sent_n_grams = list(ngrams(sent_words, n))

        corpus_ngrams.append(sent_n_grams)
        file_ngrams.extend(sent_n_grams)

    frq_dist_f = nltk.FreqDist(file_ngrams)
    # N-gram tuples become space-joined strings so they can serve as row labels.
    temp_dict = {" ".join(gram): freq for gram, freq in frq_dist_f.items()}
    dictNgrams[f] = collections.OrderedDict(sorted(temp_dict.items()))

### Transform the n-gram dictionary into a pandas DataFrame

In [7]:
# Rows are n-grams, columns are corpus files; an n-gram missing from a file
# is recorded as 0 occurrences there.
tf_idf = (
    pd.DataFrame(dictNgrams)
    .sort_index()
    .fillna(0)
)

### save frequencies

In [8]:
# At this point tf_idf still holds the raw n-gram counts; write them out.
tf_idf.to_csv("ngrams.csv")

### Calculating the IDF and TF-IDF scores for each corpus file

In [10]:
# NOTE: this cell overwrites the raw counts in tf_idf with TF-IDF weights, so
# it must run exactly once after the cell that builds tf_idf from dictNgrams.

# Work only on the per-file count columns so an existing "idf" column can never
# be mistaken for a document.
counts = tf_idf[filesids]

# idf = log10(N / df): N is the number of files, df the number of files in
# which the n-gram occurs at least once.
doc_freq = (counts != 0).sum(axis=1)
tf_idf["idf"] = np.log10(len(filesids) / doc_freq)

# Log-scaled term frequency, 1 + log10(tf). Zero counts are masked to NaN first
# so log10 is never evaluated at 0 (no divide-by-zero RuntimeWarning and no
# +/-inf values to clean up afterwards); they end up with a weight of 0.
log_tf = 1 + np.log10(counts.where(counts > 0))
tf_idf[filesids] = log_tf.mul(tf_idf["idf"], axis=0).fillna(0)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [11]:
# Preview the first 10 rows of the scored frame.
tf_idf.head(10)

Unnamed: 0,corp1.txt,corp10.txt,corp2.txt,corp3.txt,corp4.txt,corp5.txt,corp6.txt,corp7.txt,corp8.txt,corp9.txt,idf
abl find,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.740363,0.0,0.740363
aboard next,0.0,0.740363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.740363
absenc year,0.0,0.740363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.740363
access problem,0.740363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.740363
accord research,0.903997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.564271,0.564271
accord south,0.0,0.0,0.0,0.439333,0.439333,0.439333,0.0,0.0,0.0,0.0,0.439333
accord studi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.740363,0.740363
across tropic,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.740363,0.740363
across world,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.564271,0.564271,0.564271
action need,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.740363,0.740363


### Save the TF-IDF scores to a CSV file

In [None]:
# Write the frame (per-file scores plus the "idf" column) to disk.
tf_idf.to_csv("tf_idf.csv")