# Pre-processing - 20 news dataset

## Importing libraries

In [3]:
import os
import pickle
import re
from collections import defaultdict
from string import punctuation

import numpy as np
from gensim import corpora, models
from gensim.matutils import Sparse2Corpus
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## Data fetching

In [2]:
# Names of the 20 newsgroup categories.
# NOTE(review): this list is not referenced by any other visible cell.
categories = [
    'alt.atheism',
    'comp.graphics',
    'comp.os.ms-windows.misc',
    'comp.sys.ibm.pc.hardware',
    'comp.sys.mac.hardware',
    'comp.windows.x',
    'misc.forsale',
    'rec.autos',
    'rec.motorcycles',
    'rec.sport.baseball',
    'rec.sport.hockey',
    'sci.crypt',
    'sci.electronics',
    'sci.med',
    'sci.space',
    'soc.religion.christian',
    'talk.politics.guns',
    'talk.politics.mideast',
    'talk.politics.misc',
    'talk.religion.misc',
]

In [5]:
# Training split fetched without a `remove` filter; the raw text is scanned
# for the "From:" header by extract_author further down.
newsgroups = fetch_20newsgroups(subset='train')

In [4]:
# Training split with headers, footers and quotes removed; this is the text
# that is tokenized and pre-processed later in the notebook.
newsgroup_body = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))

In [5]:
# Test split with the same headers/footers/quotes removal as the training body.
newsgroup_test_body = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'))

## Author Extraction

In [6]:
def extract_author(text):
    """Return the author named in the "From:" header of a raw post.

    Args:
        text: Raw message text, headers included.

    Returns:
        The header value with everything from the first "<" or ">" onwards
        removed (e.g. "John Doe <jd@x.org>" -> "John Doe"). If that leaves
        nothing (e.g. "From: <jd@x.org>"), the bracketed address itself is
        returned so distinct senders are not merged under an empty key.
        "Unknown" when no "From:" line exists or the header is empty.
    """
    # Anchor to the start of a line so that headers such as "Reply-From:" or
    # "X-From:" are not mistaken for the sender line.
    match = re.search(r"^From: (.+?)(\n|$)", text, flags=re.MULTILINE)
    if not match:
        return "Unknown"

    header = match.group(1)
    name = re.sub(r"[<>].*", "", header).strip()
    if name:
        return name

    # Header consisted only of a bracketed address: use the address as the author id.
    address = header.strip().strip("<>").strip()
    return address or "Unknown"

In [7]:
# One author string per training document, in document order
authors = list(map(extract_author, newsgroups.data))

In [8]:
# Map each author to the list of document indices they wrote
author2doc = {}

for index, name in enumerate(authors):
    author2doc.setdefault(name, []).append(index)

In [10]:
# Output location for the author -> document-ids mapping; ".pkl" is appended when the file is written.
author2doc_path = "preprocessed_text/author2doc-train"

In [9]:
# Serialize and save to a file.
# Create the target directory first: open(..., 'wb') raises FileNotFoundError
# when "preprocessed_text/" does not exist yet (e.g. on a fresh checkout).
os.makedirs(os.path.dirname(author2doc_path) or ".", exist_ok=True)
with open(f"{author2doc_path}.pkl", 'wb') as f:
  pickle.dump(author2doc, f)

## Pre-processing

#### STOP WORDS creation
Initial stop words are the union of NLTK's English stop-word list and scikit-learn's `ENGLISH_STOP_WORDS` (from `sklearn.feature_extraction._stop_words`).

In [9]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

from sklearn.feature_extraction._stop_words import ENGLISH_STOP_WORDS # Total 318 words

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# De-duplicated NLTK English stop words
nltk_stop_words = list(set(stopwords.words('english')))
# scikit-learn's English stop words, converted to a list
sk_stop_words = list(ENGLISH_STOP_WORDS)

# Union of both sources, stored as a list; a later cell extends it with `+=`
STOP_WORDS = list(set(nltk_stop_words + sk_stop_words))

In [11]:
# Extra stop words added on top of the NLTK / scikit-learn lists.
# Only words that are not already present are appended, so re-running this
# cell does not keep growing STOP_WORDS with duplicates.
STOP_WORDS += [
    word
    for word in ["use", "think", "thanks", "know", "like", "make", "say", "time", "need", "want", "come"]
    if word not in STOP_WORDS
]

#### Lemmatizer
Lemmatization reduces text ambiguity by mapping the different inflected forms of a word to a common base form (lemma); for example, both *bicycle* and *bicycles* become *bicycle*. This shrinks the vocabulary of the corpus and yields cleaner features for training: the cleaner the data, the more accurate the resulting model tends to be. A smaller vocabulary also saves memory and computational cost.

In [12]:
import nltk
nltk.download('punkt')
from nltk.corpus import wordnet as wn
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
nltk.download('averaged_perceptron_tagger') # need for pos_tag

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Alam\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [13]:
# Shared WordNet lemmatizer instance
lemmatizer = WordNetLemmatizer()

# Lookup from the first character of a POS tag to a WordNet POS constant;
# any character other than J / V / R falls back to noun.
tag_map = defaultdict(lambda: wn.NOUN, J=wn.ADJ, V=wn.VERB, R=wn.ADV)

def convertWordIntoLemmatizeWord(words):
  """Lemmatize each token using the part of speech that pos_tag assigns to it.

  The first character of each tag is looked up in tag_map to pick the
  WordNet POS passed to the lemmatizer. Returns a list of lemmas in the
  same order as the input tokens.
  """
  lemmas = []
  for token, pos in pos_tag(words):
    wordnet_pos = tag_map[pos[0]]
    lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
  return lemmas

#### Creating a pre-process function
* Remove numbers. ✅
* Convert word into lowercase word. ✅
* Remove all stop words. ✅
* Remove all punctuations. ✅
* Some white spaces may be added to the list of words, due to the translate function & nature of our documents. Remove them as well. ✅
* Remove just-numeric strings. ✅
* Lemmatize. ✅
* Remove words with only 2 characters or less. [Low frequency] ✅
* Remove words with more than 20 characters. [High frequency] ✅

In [14]:
def preprocess(words):
  """Clean and lemmatize a list of tokens.

  Steps, in order: strip digits, lowercase, strip punctuation, drop empty and
  digit-only tokens, drop stop words, lemmatize, drop stop words again (a
  lemma may itself be a stop word), and keep only tokens of 3 to 20 characters.

  Args:
    words: Iterable of string tokens.

  Returns:
    List of cleaned, lemmatized tokens.
  """
  # STOP_WORDS is a list, so `in` would be a linear scan for every token;
  # build a set once per call for constant-time lookups.
  stop_words = set(STOP_WORDS)
  table = str.maketrans('', '', punctuation)

  cleaned = []
  for word in words:
    # Remove numbers, normalize case, then remove all punctuation
    word = re.sub(r"\d+", "", word).lower().translate(table)
    # Stripping can leave empty strings behind; drop those, digit-only
    # strings and stop words
    if word and not word.isdigit() and word not in stop_words:
      cleaned.append(word)

  # Lemmatize
  lemmas = convertWordIntoLemmatizeWord(cleaned)

  # Remove stop words produced by lemmatization, and words with less than
  # 3 characters or more than 20 characters
  return [word for word in lemmas if word not in stop_words and 2 < len(word) <= 20]

In [15]:
def preprocess_documents(docs):
  """Tokenize every document and run the tokens through preprocess.

  Returns a list with one token list per input document, in input order.
  """
  return [preprocess(word_tokenize(text)) for text in docs]

In [16]:
# Token lists for the training split (one list per document)
preprocessed_docs = preprocess_documents(newsgroup_body.data)

In [17]:
# Token lists for the test split (one list per document)
preprocessed_test_docs = preprocess_documents(newsgroup_test_body.data)

Store and retrieve train data

In [18]:
# Output path for the training tokens; the document count is embedded in the file name
# and ".pkl" is appended when the file is written.
preprocessed_text_path = f"preprocessed_text/train_data{len(preprocessed_docs)}"

In [19]:
# Serialize and save to a file.
# Create the target directory first: open(..., 'wb') raises FileNotFoundError
# when "preprocessed_text/" does not exist yet.
os.makedirs(os.path.dirname(preprocessed_text_path) or ".", exist_ok=True)
with open(f"{preprocessed_text_path}.pkl", 'wb') as f:
  pickle.dump(preprocessed_docs, f)

In [20]:
# Load the array back from the pickle file written for the training tokens.
# NOTE(review): the loaded list is not used by any later visible cell.
with open(f"{preprocessed_text_path}.pkl", 'rb') as f:
  loaded_preprocessed_paper_text_list = pickle.load(f)

Store and retrieve test data

In [21]:
# Output path for the test tokens; the document count is embedded in the file name
# and ".pkl" is appended when the file is written.
preprocessed_test_text_path = f"preprocessed_text/test_data{len(preprocessed_test_docs)}"

In [22]:
# Serialize and save to a file.
# Create the target directory first: open(..., 'wb') raises FileNotFoundError
# when "preprocessed_text/" does not exist yet.
os.makedirs(os.path.dirname(preprocessed_test_text_path) or ".", exist_ok=True)
with open(f"{preprocessed_test_text_path}.pkl", 'wb') as f:
  pickle.dump(preprocessed_test_docs, f)

In [23]:
# Build the token <-> id dictionary from the training token lists only
dictionary = corpora.Dictionary(preprocessed_docs)

In [24]:
# Prune the vocabulary in place: no_below=15 drops tokens found in fewer than 15 documents,
# no_above=0.5 drops tokens found in more than 50% of documents.
dictionary.filter_extremes(no_below=15, no_above=0.5)

In [25]:
# Bag-of-words representation of every training document
bow_corpus = list(map(dictionary.doc2bow, preprocessed_docs))