# 0.2.0 Simple topic identification

In [2]:
# Enable IPython's autoreload extension so edits to imported modules are picked up live.
%load_ext autoreload
%autoreload 2

In [18]:
import pandas as pd
from matplotlib import pyplot as plt
import re

from nltk.tokenize import sent_tokenize, word_tokenize, regexp_tokenize, TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter

# Lift pandas' display limits so frames are shown without column/row truncation.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation notices from the libraries used here — confirm that is intended.
warnings.filterwarnings("ignore")

In [7]:
import sys

# Add the parent directory to the module search path so the local `utils`
# package imported right after this can be found (the path is relative to the
# notebook's working directory).
# Guarded so that re-running this cell does not append a duplicate entry.
if "../" not in sys.path:
    sys.path.append("../")

from utils.info import article
import utils.paths as path
from utils.paths2 import direcciones, direcciones_cursos

## 0.2.2 Bag-of-words picker

In [5]:
# Toy bag-of-words: tokenize a short string, then count each distinct token.
# Counting is case-sensitive, so 'The' and 'the' are tallied separately.
sample_tokens = word_tokenize("""The cat is in the box. The cat box.""")
Counter(sample_tokens)

Counter({'The': 2, 'cat': 2, 'is': 1, 'in': 1, 'the': 1, 'box': 2, '.': 2})

## 0.2.3 Building a Counter with bag-of-words

In [9]:
# Tokenize the article: tokens
# `article` is the raw text imported from utils.info; word_tokenize splits it
# into word and punctuation tokens.
# The text is not cleaned beforehand, so wiki-markup fragments such as "''" and
# 'reading|Further' come through as tokens.
tokens = word_tokenize(article)
tokens

["''",
 "'Debugging",
 "''",
 "'",
 'is',
 'the',
 'process',
 'of',
 'finding',
 'and',
 'resolving',
 'of',
 'defects',
 'that',
 'prevent',
 'correct',
 'operation',
 'of',
 'computer',
 'software',
 'or',
 'a',
 'system',
 '.',
 'Numerous',
 'books',
 'have',
 'been',
 'written',
 'about',
 'debugging',
 '(',
 'see',
 'below',
 ':',
 '#',
 'Further',
 'reading|Further',
 'reading',
 ')',
 ',',
 'as',
 'it',
 'involves',
 'numerous',
 'aspects',
 ',',
 'including',
 'interactive',
 'debugging',
 ',',
 'control',
 'flow',
 ',',
 'integration',
 'testing',
 ',',
 'Logfile|log',
 'files',
 ',',
 'monitoring',
 '(',
 'Application',
 'monitoring|application',
 ',',
 'System',
 'Monitoring|system',
 ')',
 ',',
 'memory',
 'dumps',
 ',',
 'Profiling',
 '(',
 'computer',
 'programming',
 ')',
 '|profiling',
 ',',
 'Statistical',
 'Process',
 'Control',
 ',',
 'and',
 'special',
 'design',
 'tactics',
 'to',
 'improve',
 'detection',
 'while',
 'simplifying',
 'changes',
 '.',
 'Origin',
 'A

In [13]:
# Normalize case so that e.g. 'Debugging' and 'debugging' share one entry: lower_tokens
lower_tokens = list(map(str.lower, tokens))
lower_tokens

["''",
 "'debugging",
 "''",
 "'",
 'is',
 'the',
 'process',
 'of',
 'finding',
 'and',
 'resolving',
 'of',
 'defects',
 'that',
 'prevent',
 'correct',
 'operation',
 'of',
 'computer',
 'software',
 'or',
 'a',
 'system',
 '.',
 'numerous',
 'books',
 'have',
 'been',
 'written',
 'about',
 'debugging',
 '(',
 'see',
 'below',
 ':',
 '#',
 'further',
 'reading|further',
 'reading',
 ')',
 ',',
 'as',
 'it',
 'involves',
 'numerous',
 'aspects',
 ',',
 'including',
 'interactive',
 'debugging',
 ',',
 'control',
 'flow',
 ',',
 'integration',
 'testing',
 ',',
 'logfile|log',
 'files',
 ',',
 'monitoring',
 '(',
 'application',
 'monitoring|application',
 ',',
 'system',
 'monitoring|system',
 ')',
 ',',
 'memory',
 'dumps',
 ',',
 'profiling',
 '(',
 'computer',
 'programming',
 ')',
 '|profiling',
 ',',
 'statistical',
 'process',
 'control',
 ',',
 'and',
 'special',
 'design',
 'tactics',
 'to',
 'improve',
 'detection',
 'while',
 'simplifying',
 'changes',
 '.',
 'origin',
 'a

In [14]:
# Tally how often each lowercase token occurs: bow_simple
bow_simple = Counter()
bow_simple.update(lower_tokens)
bow_simple

Counter({"''": 69,
         "'debugging": 1,
         "'": 2,
         'is': 25,
         'the': 150,
         'process': 12,
         'of': 81,
         'finding': 1,
         'and': 41,
         'resolving': 1,
         'defects': 3,
         'that': 14,
         'prevent': 1,
         'correct': 1,
         'operation': 2,
         'computer': 12,
         'software': 16,
         'or': 25,
         'a': 60,
         'system': 19,
         '.': 89,
         'numerous': 2,
         'books': 1,
         'have': 7,
         'been': 3,
         'written': 1,
         'about': 3,
         'debugging': 39,
         '(': 40,
         'see': 5,
         'below': 1,
         ':': 31,
         '#': 1,
         'further': 1,
         'reading|further': 1,
         'reading': 1,
         ')': 40,
         ',': 151,
         'as': 21,
         'it': 18,
         'involves': 1,
         'aspects': 1,
         'including': 1,
         'interactive': 3,
         'control': 4,
         'flow': 2,
  

In [15]:
# Print the 10 most common tokens
# Nothing has been filtered yet, so punctuation and function words
# (',', 'the', '.', 'of', ...) lead the ranking.
print(bow_simple.most_common(10))

[(',', 151), ('the', 150), ('.', 89), ('of', 81), ("''", 69), ('to', 63), ('a', 60), ('``', 47), ('in', 44), ('and', 41)]


## 0.2.4 Text preprocessing steps

Which of the following are useful text preprocessing steps?

**Answer:** Lemmatization, lowercasing, and removing unwanted tokens.

## 0.2.5 Text preprocessing practice

In [17]:
# Keep only tokens made up entirely of letters; punctuation and
# markup fragments such as 'reading|further' are dropped: alpha_only
alpha_only = list(filter(str.isalpha, lower_tokens))
alpha_only

['is',
 'the',
 'process',
 'of',
 'finding',
 'and',
 'resolving',
 'of',
 'defects',
 'that',
 'prevent',
 'correct',
 'operation',
 'of',
 'computer',
 'software',
 'or',
 'a',
 'system',
 'numerous',
 'books',
 'have',
 'been',
 'written',
 'about',
 'debugging',
 'see',
 'below',
 'further',
 'reading',
 'as',
 'it',
 'involves',
 'numerous',
 'aspects',
 'including',
 'interactive',
 'debugging',
 'control',
 'flow',
 'integration',
 'testing',
 'files',
 'monitoring',
 'application',
 'system',
 'memory',
 'dumps',
 'profiling',
 'computer',
 'programming',
 'statistical',
 'process',
 'control',
 'and',
 'special',
 'design',
 'tactics',
 'to',
 'improve',
 'detection',
 'while',
 'simplifying',
 'changes',
 'origin',
 'a',
 'computer',
 'log',
 'entry',
 'from',
 'the',
 'mark',
 'nbsp',
 'ii',
 'with',
 'a',
 'moth',
 'taped',
 'to',
 'the',
 'page',
 'the',
 'terms',
 'bug',
 'and',
 'debugging',
 'are',
 'popularly',
 'attributed',
 'to',
 'admiral',
 'grace',
 'hopper',
 '

In [19]:
# Remove all stop words: no_stops
# Build the English stop-word collection once and keep it as a set. Previously
# stopwords.words('english') was evaluated inside the comprehension condition,
# i.e. once per token, and each membership test scanned a list.
english_stops = set(stopwords.words('english'))
no_stops = [t for t in alpha_only if t not in english_stops]
no_stops

['process',
 'finding',
 'resolving',
 'defects',
 'prevent',
 'correct',
 'operation',
 'computer',
 'software',
 'system',
 'numerous',
 'books',
 'written',
 'debugging',
 'see',
 'reading',
 'involves',
 'numerous',
 'aspects',
 'including',
 'interactive',
 'debugging',
 'control',
 'flow',
 'integration',
 'testing',
 'files',
 'monitoring',
 'application',
 'system',
 'memory',
 'dumps',
 'profiling',
 'computer',
 'programming',
 'statistical',
 'process',
 'control',
 'special',
 'design',
 'tactics',
 'improve',
 'detection',
 'simplifying',
 'changes',
 'origin',
 'computer',
 'log',
 'entry',
 'mark',
 'nbsp',
 'ii',
 'moth',
 'taped',
 'page',
 'terms',
 'bug',
 'debugging',
 'popularly',
 'attributed',
 'admiral',
 'grace',
 'hopper',
 'http',
 'grace',
 'hopper',
 'foldoc',
 'working',
 'harvard',
 'mark',
 'ii',
 'computer',
 'harvard',
 'university',
 'associates',
 'discovered',
 'moth',
 'stuck',
 'relay',
 'thereby',
 'impeding',
 'operation',
 'whereupon',
 'remark

In [24]:
# Create the WordNet-based lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Map every remaining token to its lemma: lemmatized
# Plural nouns are reduced ('defects' -> 'defect'), while forms such as
# 'written' or 'resolving' come back unchanged.
lemmatized = list(map(wordnet_lemmatizer.lemmatize, no_stops))
lemmatized

['process',
 'finding',
 'resolving',
 'defect',
 'prevent',
 'correct',
 'operation',
 'computer',
 'software',
 'system',
 'numerous',
 'book',
 'written',
 'debugging',
 'see',
 'reading',
 'involves',
 'numerous',
 'aspect',
 'including',
 'interactive',
 'debugging',
 'control',
 'flow',
 'integration',
 'testing',
 'file',
 'monitoring',
 'application',
 'system',
 'memory',
 'dump',
 'profiling',
 'computer',
 'programming',
 'statistical',
 'process',
 'control',
 'special',
 'design',
 'tactic',
 'improve',
 'detection',
 'simplifying',
 'change',
 'origin',
 'computer',
 'log',
 'entry',
 'mark',
 'nbsp',
 'ii',
 'moth',
 'taped',
 'page',
 'term',
 'bug',
 'debugging',
 'popularly',
 'attributed',
 'admiral',
 'grace',
 'hopper',
 'http',
 'grace',
 'hopper',
 'foldoc',
 'working',
 'harvard',
 'mark',
 'ii',
 'computer',
 'harvard',
 'university',
 'associate',
 'discovered',
 'moth',
 'stuck',
 'relay',
 'thereby',
 'impeding',
 'operation',
 'whereupon',
 'remarked',
 'de

In [27]:
# Count occurrences of each lemma to form the final bag-of-words: bow
bow = Counter()
bow.update(lemmatized)
bow

Counter({'process': 13,
         'finding': 1,
         'resolving': 1,
         'defect': 4,
         'prevent': 1,
         'correct': 1,
         'operation': 2,
         'computer': 14,
         'software': 16,
         'system': 25,
         'numerous': 2,
         'book': 2,
         'written': 1,
         'debugging': 39,
         'see': 5,
         'reading': 1,
         'involves': 1,
         'aspect': 1,
         'including': 1,
         'interactive': 3,
         'control': 4,
         'flow': 2,
         'integration': 1,
         'testing': 4,
         'file': 3,
         'monitoring': 1,
         'application': 2,
         'memory': 5,
         'dump': 4,
         'profiling': 1,
         'programming': 9,
         'statistical': 1,
         'special': 2,
         'design': 4,
         'tactic': 1,
         'improve': 1,
         'detection': 2,
         'simplifying': 1,
         'change': 5,
         'origin': 2,
         'log': 1,
         'entry': 2,
         'mark':

In [33]:
# Prints a fixed marker string; presumably a sanity check that the notebook
# ran through to the last cell — TODO confirm.
print('ok_')

ok_
