# Data exploration
Techniques and tests over the text corpus for 20NewsGroup Dataset

In [1]:
import nltk
import string
import pandas as pd
import re
import pickle
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import RSLPStemmer
from IPython.display import display, clear_output

In [2]:
df_test = pd.read_csv("datasets/20news-test.csv")
df_train = pd.read_csv("datasets/20news-train-filtered.csv")

In [3]:
df_train_clean = df_train[~df_train.id.isin(df_test.id.to_list())]
print(len(df_train), len(df_test), len(df_train_clean))

19997 2000 17287


In [3]:
text = df_test.iloc[0]["text"]

## NLTK Preprocessing

In [22]:
def to_lower(text:str)-> str:
    """ Get input sentence and returns all words as lowercase
    """
    return text.lower()

def remove_symbols(text:str)-> str:
    """ Get the input text and removes all symbol from english language
    TODO: Check if symbol is separing words and add a space
    TODO: REGEX
    """
    # text = "".join([char for char in text if char not in string.punctuation])
    text = re.sub(r'([!"#$%&\'()*+,-./:;<=>?@\[\\\]^_`{|}~])', ' ', text)
    return text

def word_tokenize(text:str)->list:
    words = nltk.word_tokenize(text)
    return words

def remove_stopwords(words:list)->list:
    stop_words = stopwords.words('english')
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words

def remove_numbers(words:list)->list:
    words = [w for w in words if not w.isdecimal()]
    return words

def apply_stemming(words:list)->list:
    stemmer = RSLPStemmer()
    stemmed = [stemmer.stem(word) for word in words]
    return stemmed

def remove_single_letters(words:list)->list:
    words = [w for w in words if len(w) > 1]
    return words

def process_sentence(text:str,
                     process_symbols:bool = True,
                     process_stopwords:bool = True,
                     process_numbers:bool = True,
                     process_single_letters:bool = True)-> list:
    """ Get a raw sentence and applies all preprocessing stages
    """
    text = to_lower(text)
    
    if process_symbols:
        text = remove_symbols(text)
    
    words = word_tokenize(text)
    # print("without symbols", len(words))
    
    if process_stopwords:
        words = remove_stopwords(words)
    # print("without stopwords", len(words))
    
    if process_numbers:
        words = remove_numbers(words)
    # print("without numbers", len(words))
    
    if process_single_letters:
        words = remove_single_letters(words)
    
    words = apply_stemming(words)
    
    return words


In [23]:
processed = []
total = len(df_train)
for index, row in df_train.iterrows():
    clear_output(wait=True)
    display(f"Processing {index+1}/{total}")
    processed.append(process_sentence(row["text"]))
len(processed)

'Processing 19997/19997'

19997

In [25]:
with open('./datasets/20-news-processed-no-singles.pickle', 'wb') as f:
    pickle.dump(processed, f, pickle.HIGHEST_PROTOCOL)

In [26]:
with open('./datasets/20-news-processed-no-singles.pickle', 'rb') as f:
    dataset = pickle.load(f)

In [11]:
len(dataset)

19997

In [27]:
raw_word_corpus = [w for item in dataset for w in item]
raw_word_corpus = list(set(word_corpus))
len(raw_word_corpus)

128791

In [28]:
term_count = {w:0 for w in raw_word_corpus}
len(term_count)

128791

In [29]:
for text in dataset:
    for word in text:
        term_count[word] += 1

In [30]:
sorted_term_count = {k: v for k, v in sorted(term_count.items(), key=lambda item: item[1], reverse=True)}
sorted_term_count

TypeError: unhashable type: 'slice'

In [34]:
df = pd.DataFrame(list(sorted_term_count.items()), columns=["word", "count"])

In [66]:
type(df)

pandas.core.frame.DataFrame

In [40]:
df["average_count"] = df["count"]/len(dataset)
df

Unnamed: 0,word,count,average_count
0,ax,62420,3.121468
1,one,16311,0.815672
2,would,15772,0.788718
3,edu,12098,0.604991
4,com,11281,0.564135
...,...,...,...
128786,aset,1,0.000050
128787,d6f1q,1,0.000050
128788,lljc1jd0,1,0.000050
128789,mrftzrm,1,0.000050


In [65]:
word_corpus = df[df["average_count"].between(0.02, 0.6)]
word_corpus.to_csv("./datasets/20-news-word-corpus.csv", index=False)

In [24]:
with open('./datasets/word-corpus-term-count-numbers-removed.pickle', 'wb') as f:
    pickle.dump(sorted_term_count, f, pickle.HIGHEST_PROTOCOL)