## Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import display, HTML

## Read the Dataset

In [None]:
# Load the web-text corpus CSV into a DataFrame (the path is relative to the
# current working directory).
df = pd.read_csv('dataset/web-text-corpus/webtext.csv')

In [None]:
# Display the raw DataFrame.
df

In [None]:
# Summarise the columns (dtypes and non-null counts) to spot missing values.
df.info()

### Drop Rows with Missing Values

In [None]:
# Keep only the rows that have no missing value in any column.
no_null_df = df.dropna()

In [None]:
# Summarise the filtered DataFrame to confirm the missing rows are gone.
no_null_df.info()

In [None]:
# Tally how many rows belong to each value of the 'domain' column.
dictionary_domain = {}
cnt = Counter(no_null_df['domain'])

In [None]:
# Display the per-domain row counts.
cnt

### We take 30 sample sentences: 15 from the Firefox class and 15 from the Overheard class

In [None]:
# Take the first 15 rows of each of the two domains, then stack them
# (firefox rows first, overheard rows second).
df_sample_firefox = no_null_df.loc[no_null_df['domain'] == 'firefox'].head(15)
df_sample_overheard = no_null_df.loc[no_null_df['domain'] == 'overheard'].head(15)
df_sample = pd.concat((df_sample_firefox, df_sample_overheard))

In [None]:
# Display the combined 30-row sample.
df_sample

In [None]:
# Show the sample with a fresh 0..n-1 index. The result is only displayed, not
# assigned, so df_sample itself keeps its original index labels.
df_sample.reset_index(drop=True)

## Preprocessing

### Case Folding

In [None]:
# Work on a copy so that preprocessing does not modify df_sample.
prepro_df = df_sample.copy()

In [None]:
# Case folding: lower-case every document in the 'text' column.
prepro_df['text'] = prepro_df['text'].str.lower()

### Remove Punctuation, HTML, and URLs

In [None]:
def cleansing(document):
    """Strip HTML tags, URLs and punctuation from the ``text`` column.

    The substitutions run in this order: HTML tags -> space, URLs -> space,
    punctuation -> removed, runs of two or more whitespace characters -> one
    space.

    Parameters
    ----------
    document : pandas.DataFrame
        Frame with a string column named ``text``. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the cleaned ``text`` column and a fresh 0..n-1 index;
        all other columns are carried over unchanged.
    """
    # Raw strings are required: in a normal string literal '\b' is a backspace
    # character, which silently disabled the URL pattern's word boundary.
    # One tag at a time; a greedy '.*' would swallow the text between tags.
    regex_html = r'<[^<>]+>'
    # Full URL / bare domain first, then a lone scheme as a fallback.
    regex_url = (
        r'(?:https?://)?(?:www\.)?'
        r'[-a-zA-Z0-9@:%._+~#=]{2,256}\.[a-z]{2,6}\b'
        r'[-a-zA-Z0-9@:%_+.~#?&/=]*'
        r'|https?://'
    )
    # The hyphen is escaped so it is a literal; unescaped between ')' and '['
    # it formed a range that also deleted digits and uppercase letters.
    regex_punc = r'[!()*+,\-./:;<=>?@\[\]{}"\'#$%^&_~]'
    regex_space = r'\s{2,}'

    # reset_index returns a new frame, so the caller's data stays untouched.
    cleaned = document.reset_index(drop=True)

    text = cleaned['text']
    substitutions = (
        (regex_html, ' '),
        (regex_url, ' '),
        (regex_punc, ''),
        (regex_space, ' '),
    )
    for pattern, replacement in substitutions:
        # regex=True is explicit because pandas >= 2.0 defaults to literal
        # string replacement.
        text = text.str.replace(pattern, replacement, regex=True)
    cleaned['text'] = text

    return cleaned

In [None]:
# Apply the regex clean-up to the lower-cased sample.
cleansing_df = cleansing(prepro_df)

In [None]:
# Display the cleaned sample.
cleansing_df

In [None]:
# Display the untouched sample again for comparison with the cleaned text.
df_sample

In [None]:
# Summarise the preprocessing frame (dtypes and non-null counts).
prepro_df.info()

### Remove Stopwords
Some references I found on stopword analysis:
1. https://www.sciencedirect.com/science/article/pii/S1877050914013799
2. https://nlp.stanford.edu/IR-book/html/htmledition/dropping-common-terms-stop-words-1.html
3. https://medium.com/@jasoncrease/zipf-54912d5651cc
4. https://medium.com/@devalshah1619/a-mysterious-law-so-simple-and-yet-so-universal-aa9f1c8903d1

## Using Zipf's Law

In [None]:
# Bag-of-words vectorizer with scikit-learn's default settings.
vectorizer = CountVectorizer()

In [None]:
# Learn the vocabulary from the cleaned text and build the document-term
# count matrix.
X = vectorizer.fit_transform(cleansing_df['text'])

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement and
# fall back to the old accessor on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()
print(feature_names)
print('\n{} terms'.format(len(feature_names)))

In [None]:
import sys

# Print arrays in full instead of summarising them with '...'. NumPy rejects a
# NaN threshold with ValueError; sys.maxsize is the documented way to disable
# summarisation.
np.set_printoptions(threshold=sys.maxsize)

In [None]:
# Show the first cleaned document.
cleansing_df['text'][0]

In [None]:
# Dense view of the document-term matrix: each row is a document and each
# column is a term.
X.toarray()

### Count the total occurrences of each term across the documents

In [None]:
def count_idf(corpus):
    """Count, for every term, the number of documents that contain it.

    Despite the name, the value returned per term is the raw document
    frequency (a count), not a logarithmic inverse document frequency.

    Parameters
    ----------
    corpus : iterable of str
        One string per document.

    Returns
    -------
    list of (str, int)
        ``(term, document_count)`` pairs in vocabulary (column) order.
    """
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on releases that predate get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        term = list(vectorizer.get_feature_names_out())
    else:
        term = vectorizer.get_feature_names()

    # A document contains a term when its count is positive; summing that mask
    # per column on the sparse matrix avoids a dense copy and a Python loop.
    doc_freq = np.asarray((X > 0).sum(axis=0)).ravel().tolist()
    return list(zip(term, doc_freq))

In [None]:
def count_occurences(corpus):
    """Count how many times every term occurs across the whole corpus.

    Parameters
    ----------
    corpus : iterable of str
        One string per document.

    Returns
    -------
    list of (str, int)
        ``(term, total_count)`` pairs in vocabulary (column) order.
    """
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; fall back to it only
    # on releases that predate get_feature_names_out().
    if hasattr(vectorizer, 'get_feature_names_out'):
        term = list(vectorizer.get_feature_names_out())
    else:
        term = vectorizer.get_feature_names()

    # Column sums of the sparse count matrix give the corpus-wide totals
    # without a dense copy or a per-term Python loop.
    totals = np.asarray(X.sum(axis=0)).ravel().tolist()
    return list(zip(term, totals))

In [None]:
# Total number of occurrences of each term over the cleaned sample.
occurences = count_occurences(cleansing_df['text'])

In [None]:
# Per-term value returned by count_idf for the cleaned sample.
idf_each_term = count_idf(cleansing_df['text'])

In [None]:
# Sort in place by the count (second tuple item), largest first.
idf_each_term.sort(reverse=True, key=lambda x: x[1])

In [None]:
# Display the sorted (term, count) pairs.
idf_each_term

In [None]:
# Display the (term, total occurrences) pairs before sorting.
occurences

In [None]:
# Sort in place by total occurrences (second tuple item), largest first.
occurences.sort(reverse=True, key=lambda x: x[1])

In [None]:
# Split the sorted pairs into parallel lists: terms and their frequencies.
key_occurences = [ item[0] for item in occurences]
value_occurences = [ item[1] for item in occurences]

In [None]:
# Display the terms in descending order of frequency.
key_occurences

In [None]:
def plot_zipf(terms: list, freq: list):
    """Draw a line chart of term frequencies in the order given.

    Parameters
    ----------
    terms : list
        Term labels for the x axis; each one becomes a vertically rotated tick.
    freq : list
        Frequency of each term, aligned with ``terms``.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Large canvas so that one tick label per term stays legible.
    plt.figure(figsize=(40, 20))
    plt.plot(terms, freq)
    plt.xticks(terms, rotation='vertical')
    plt.xlabel('Terms')
    plt.ylabel('Frequency')
    plt.show()

In [None]:
# Plot term frequency against the frequency-sorted terms.
plot_zipf(key_occurences, value_occurences)

In [None]:
# Tabulate the (term, total occurrences) pairs.
occurences_df = pd.DataFrame(occurences, columns=['terms', 'freq'])

In [None]:
# Display the term-frequency table.
occurences_df

In [None]:
# Tabulate the (term, count) pairs produced by count_idf.
idf_df = pd.DataFrame(idf_each_term, columns=['terms', 'idf'])

In [None]:
# Display the per-term 'idf' table.
idf_df

### Based on the Paper
Several stopword-removal methods build on Zipf's Law (Z-Methods):
In addition to the classic stoplist, three stopword creation methods are used: removing the most frequent words (TF-High), removing words that occur only once, i.e., singleton words (TF1), and removing words with low inverse document frequency (IDF) (Jashanjot, Buttar, 2018).

In [None]:
def filter_freq_zipf(dataframe_tf, dataframe_idf, threshold_idf):
    """Build a stopword list from term-frequency and 'idf' tables.

    A term is selected when any of the following holds:

    * its ``freq`` equals the highest frequency in ``dataframe_tf`` (TF-High),
    * its ``freq`` is exactly 1 (singleton, TF1),
    * its ``idf`` value in ``dataframe_idf`` is at most ``threshold_idf``.

    Both intermediate selections are printed for inspection.

    NOTE(review): if the ``idf`` column holds raw document counts, the ``<=``
    test selects rare terms rather than low-IDF (widespread) ones; confirm
    this is the intended direction.

    Parameters
    ----------
    dataframe_tf : pandas.DataFrame
        Columns ``terms`` and ``freq``.
    dataframe_idf : pandas.DataFrame
        Columns ``terms`` and ``idf``.
    threshold_idf : number
        Inclusive upper bound applied to the ``idf`` column.

    Returns
    -------
    list of str
        The selected terms without duplicates, sorted alphabetically so the
        result is reproducible between runs.
    """
    singleton_word = 1
    freq = dataframe_tf['freq']
    # Series.max() returns NaN for an empty column, where the built-in max()
    # would raise ValueError; NaN matches no row, so the selection is empty.
    most_freq = freq.max()

    filter_tf = dataframe_tf[(freq == most_freq) | (freq == singleton_word)]
    filter_idf = dataframe_idf[dataframe_idf['idf'] <= threshold_idf]
    print(filter_tf)
    print(filter_idf)

    stopwords = set(filter_tf['terms']) | set(filter_idf['terms'])
    return sorted(stopwords)

In [None]:
# Build the stopword list using an 'idf' cut-off of 3.
filter_freq_zipf(occurences_df, idf_df, 3)