# Data exploration
Techniques and tests over the text corpus for 20NewsGroup Dataset

In [1]:
import nltk
import string
import numpy as np
import pandas as pd
import re
import pickle
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import RSLPStemmer
from IPython.display import display, clear_output

In [2]:
# df_test = pd.read_csv("../datasets/20news-test.csv")
df_train = pd.read_csv("../datasets/20news-train-filtered.csv")

## NLTK Preprocessing

In [3]:
def to_lower(text:str)-> str:
    """ Get input sentence and returns all words as lowercase
    """
    return text.lower()

def remove_symbols(text:str)-> str:
    """ Get the input text and removes all symbol from english language
    TODO: Check if symbol is separing words and add a space
    TODO: REGEX
    """
    # text = "".join([char for char in text if char not in string.punctuation])
    text = re.sub(r'([!"#$%&\'()*+,-./:;<=>?@\[\\\]^_`{|}~])', ' ', text)
    return text

def word_tokenize(text:str)->list:
    words = nltk.word_tokenize(text)
    return words

def remove_stopwords(words:list)->list:
    stop_words = stopwords.words('english')
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words

def remove_numbers(words:list)->list:
    words = [w for w in words if not w.isdecimal()]
    return words

def apply_stemming(words:list)->list:
    stemmer = RSLPStemmer()
    stemmed = [stemmer.stem(word) for word in words]
    return stemmed

def remove_single_letters(words:list)->list:
    words = [w for w in words if len(w) > 1]
    return words

def process_sentence(text:str,
                     process_symbols:bool = True,
                     process_stopwords:bool = True,
                     process_numbers:bool = True,
                     process_single_letters:bool = True)-> list:
    """ Get a raw sentence and applies all preprocessing stages
    """
    text = to_lower(text)
    
    if process_symbols:
        text = remove_symbols(text)
    
    words = word_tokenize(text)
    # print("without symbols", len(words))
    
    if process_stopwords:
        words = remove_stopwords(words)
    # print("without stopwords", len(words))
    
    if process_numbers:
        words = remove_numbers(words)
    # print("without numbers", len(words))
    
    if process_single_letters:
        words = remove_single_letters(words)
    
    words = apply_stemming(words)
    
    return words

def create_word_corpus(dataset:list,
                       min_percentage:float = 0.0,
                       max_percentage:float = 1.0)->tuple:
    """
    Creates a word_corpus selection based on a processed dataset.
    It also returns the term_count for the corpus, allowing for later document encoding
    """
    raw_word_corpus = [w for item in dataset for w in item]
    raw_word_corpus = list(set(raw_word_corpus))

    term_count = {w:0 for w in raw_word_corpus}
    for text in dataset:
        for word in text:
            term_count[word] += 1
    sorted_term_count = {k: v for k, v in sorted(term_count.items(), key=lambda item: item[1], reverse=True)}
    
    term_df = pd.DataFrame(list(sorted_term_count.items()), columns=["word", "count"])
    term_df["average_count"] = term_df["count"]/len(dataset)
    
    word_corpus_term_count = term_df[term_df["average_count"].between(min_percentage, max_percentage)]
    word_corpus = word_corpus_term_count["word"].to_list()

    return word_corpus, word_corpus_term_count

In [4]:
processed = []
total = len(df_train)
for index, row in df_train.iterrows():
    clear_output(wait=True)
    display(f"Processing {index+1}/{total}")
    processed.append(process_sentence(row["text"]))
len(processed)

'Processing 19997/19997'

19997

In [5]:
with open('../datasets/20-news-processed-no-singles.pickle', 'wb') as f:
    pickle.dump(processed, f, pickle.HIGHEST_PROTOCOL)

In [6]:
with open('../datasets/20-news-processed-no-singles.pickle', 'rb') as f:
    dataset = pickle.load(f)

In [7]:
len(dataset)

19997

In [11]:
word_corpus, word_df = create_word_corpus(dataset,
                       min_percentage = 0.01,
                       max_percentage = 999999)
word_df

Unnamed: 0,word,count,average_count
0,ax,62420,3.121468
1,one,16311,0.815672
2,would,15772,0.788718
3,edu,12098,0.604991
4,com,11281,0.564135
...,...,...,...
2422,controlled,200,0.010002
2423,helped,200,0.010002
2424,cart,200,0.010002
2425,orthodox,200,0.010002


In [40]:
word_df.to_csv("../datasets/20news-word-corpus-2k.csv", index=False)

In [39]:
word_df.shape

(2423, 3)

In [3]:
len(dataset)

19997

In [6]:
classes = pd.read_csv("../datasets/20news-categories.csv")

In [9]:
class_ids = classes.id.to_list()
train_classes = df_train.category_id.to_list()

In [6]:
raw_word_corpus = [w for item in dataset for w in item]
raw_word_corpus = list(set(raw_word_corpus))
len(raw_word_corpus)

128791

In [13]:
term_count = {w:[0 for i in class_ids] for w in raw_word_corpus}
len(term_count)
len(term_count[raw_word_corpus[0]])

20

In [14]:
len(term_count)

128791

In [15]:
for index, item in enumerate(dataset):
    for word in set(item):
        term_count[word][train_classes[index]] += 1

In [29]:
word = raw_word_corpus[0]
np.concatenate([[word], term_count[word], [np.sum(term_count[word])]], axis=0)

array(['injan', '0', '0', '0', '0', '1', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '1'], dtype='<U21')

In [31]:
["word"] + classes.name.to_list() + ["sum"]

['word',
 'alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc',
 'sum']

In [32]:
new_term_count = []
for word in term_count.keys():
    new_term_count.append(np.concatenate([[word], term_count[word], [np.sum(term_count[word])]], axis=0))
columns = ["word"] + classes.name.to_list() + ["sum"]
df = pd.DataFrame(new_term_count, columns=["word"] + classes.name.to_list() + ["sum"])
df

Unnamed: 0,word,alt.atheism,comp.graphics,comp.os.ms-windows.misc,comp.sys.ibm.pc.hardware,comp.sys.mac.hardware,comp.windows.x,misc.forsale,rec.autos,rec.motorcycles,...,sci.crypt,sci.electronics,sci.med,sci.space,soc.religion.christian,talk.politics.guns,talk.politics.mideast,talk.politics.misc,talk.religion.misc,sum
0,injan,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,plorpl,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,schak,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,aw6igiyxg,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,magellan,0,5,0,1,0,0,0,0,0,...,0,0,0,22,0,0,0,0,0,28
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
128786,sppech,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,5,0,0,5
128787,vryw,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
128788,tekbsp,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,2
128789,shabazz,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1


In [62]:
target = 100
word_corpus = []
for name in classes.name.to_list():
    sorted_df = df.sort_values(by=name, ascending=False)
    word_corpus.extend(sorted_df.iloc[0:target].word.to_list())
word_corpus = list(set(word_corpus))
filtered_df = df[df.word.isin(word_corpus)]
output_df = filtered_df[["word", "sum"]]
output_df.to_csv(f"../datasets/20-news-word-corpus-class-top-{target}.csv", index=False)

In [52]:
filtered_df = df[df.word.isin(word_corpus)]

In [54]:
filtered_df[["word", "sum"]]

Unnamed: 0,word,sum
92,pm,106
180,actually,1765
351,preventing,76
492,plain,222
643,according,641
...,...,...
126963,lz,17
127385,ave,119
127602,versu,93
127722,becom,942


In [None]:
sorted_term_count = {k: v for k, v in sorted(term_count.items(), key=lambda item: item[1], reverse=True)}
sorted_term_count

In [15]:
df = pd.DataFrame(list(sorted_term_count.items()), columns=["word", "count"])

In [17]:
df

Unnamed: 0,word,count
0,one,7785
1,would,7609
2,edu,6542
3,com,6470
4,lik,6250
...,...,...
128786,pabon,1
128787,tektonik,1
128788,fy7,1
128789,j0m,1


In [18]:
df["average_count"] = df["count"]/len(dataset)
df

Unnamed: 0,word,count,average_count
0,one,7785,0.389308
1,would,7609,0.380507
2,edu,6542,0.327149
3,com,6470,0.323549
4,lik,6250,0.312547
...,...,...,...
128786,pabon,1,0.000050
128787,tektonik,1,0.000050
128788,fy7,1,0.000050
128789,j0m,1,0.000050


In [20]:
df["count"].describe()

count    128791.000000
mean         14.646186
std         113.097351
min           1.000000
25%           1.000000
50%           1.000000
75%           3.000000
max        7785.000000
Name: count, dtype: float64

In [34]:
word_corpus = df[df["average_count"].between(0.02, 0.5)]
word_corpus.to_csv("../datasets/20-news-word-corpus-document-count-02.csv", index=False)
len(word_corpus)

839

In [24]:
with open('./datasets/word-corpus-term-count-numbers-removed.pickle', 'wb') as f:
    pickle.dump(sorted_term_count, f, pickle.HIGHEST_PROTOCOL)