In [1]:
import string
from itertools import chain

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.probability import FreqDist

In [2]:
from sklearn.datasets import fetch_20newsgroups

In [3]:
# Load the 20 Newsgroups corpus through scikit-learn.
# NOTE(review): called with default arguments, and the first post displayed
# further down still carries its e-mail headers (From / Subject /
# Organization / Lines), so header words flow into every later stage.
text_data = fetch_20newsgroups()
raw_data = text_data.data  # the post texts; each entry is one string
len(raw_data)

11314

In [4]:
# Peek at the first post to see the untouched text, headers included.
raw_data[0]

"From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"

Lowercasing

In [5]:
# Lowercasing
# Fold every post to lower case so that later token comparisons
# (stopword lookup, frequency counts) are case-insensitive.
raw_data_lowercase = [post.lower() for post in raw_data]
raw_data_lowercase[0]

"from: lerxst@wam.umd.edu (where's my thing)\nsubject: what car is this!?\nnntp-posting-host: rac3.wam.umd.edu\norganization: university of maryland, college park\nlines: 15\n\n i was wondering if anyone out there could enlighten me on this car i saw\nthe other day. it was a 2-door sports car, looked to be from the late 60s/\nearly 70s. it was called a bricklin. the doors were really small. in addition,\nthe front bumper was separate from the rest of the body. this is \nall i know. if anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nthanks,\n- il\n   ---- brought to you by your neighborhood lerxst ----\n\n\n\n\n"

Tokenization

In [6]:
# Tokenization (sentence level)
# Fetch the Punkt tokenizer data before calling sent_tokenize.
nltk.download('punkt')

# One list of sentences per post, in the same order as raw_data_lowercase.
raw_data_sent = [sent_tokenize(post) for post in raw_data_lowercase]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [7]:
# Sentences found in the first post; header lines get merged into the
# neighbouring sentences because they do not end with sentence punctuation.
raw_data_sent[0]

["from: lerxst@wam.umd.edu (where's my thing)\nsubject: what car is this!?",
 'nntp-posting-host: rac3.wam.umd.edu\norganization: university of maryland, college park\nlines: 15\n\n i was wondering if anyone out there could enlighten me on this car i saw\nthe other day.',
 'it was a 2-door sports car, looked to be from the late 60s/\nearly 70s.',
 'it was called a bricklin.',
 'the doors were really small.',
 'in addition,\nthe front bumper was separate from the rest of the body.',
 'this is \nall i know.',
 'if anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.',
 'thanks,\n- il\n   ---- brought to you by your neighborhood lerxst ----']

In [8]:
# Tokenization (word level)
# Repeating the download is harmless: NLTK reports the package as already
# up-to-date when it is present, which keeps this cell runnable on its own.
nltk.download('punkt')

# One list of word/punctuation tokens per post.
raw_data_token = [word_tokenize(post) for post in raw_data_lowercase]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
# First 20 tokens of the first post; punctuation marks are separate tokens.
raw_data_token[0][:20]

['from',
 ':',
 'lerxst',
 '@',
 'wam.umd.edu',
 '(',
 'where',
 "'s",
 'my',
 'thing',
 ')',
 'subject',
 ':',
 'what',
 'car',
 'is',
 'this',
 '!',
 '?',
 'nntp-posting-host']

Removing Punctuation and Numbers

In [10]:
# Removing Punctuation and Numbers
# Keep only tokens made up entirely of letters. A token that contains any
# digit or punctuation character (e.g. 'wam.umd.edu', 'nntp-posting-host')
# is discarded as a whole rather than having those characters stripped.
raw_data_Punc = [
    [token for token in post_tokens if token.isalpha()]
    for post_tokens in raw_data_token
]

raw_data_Punc[0][:20]

['from',
 'lerxst',
 'where',
 'my',
 'thing',
 'subject',
 'what',
 'car',
 'is',
 'this',
 'organization',
 'university',
 'of',
 'maryland',
 'college',
 'park',
 'lines',
 'i',
 'was',
 'wondering']

Stopword Removal

In [11]:
# Stopword Removal
nltk.download('stopwords')
# A set gives constant-time membership checks inside the filter below.
stop_words = set(stopwords.words('english'))

# Drop every token that appears in NLTK's English stopword list.
raw_data_stopword = [
    [token for token in post_tokens if token not in stop_words]
    for post_tokens in raw_data_Punc
]

raw_data_stopword[0][:20]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


['lerxst',
 'thing',
 'subject',
 'car',
 'organization',
 'university',
 'maryland',
 'college',
 'park',
 'lines',
 'wondering',
 'anyone',
 'could',
 'enlighten',
 'car',
 'saw',
 'day',
 'sports',
 'car',
 'looked']

Lemmatization and Stemming

In [12]:
# Lemmatization
# Download the WordNet corpus, then create the lemmatizer instance that the
# later cells reuse.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...


In [13]:
# Stemming
# Single Porter stemmer instance shared by the demo cells and the corpus pass.
stemmer = PorterStemmer()

In [14]:
# Irregular plural: the stemmer leaves "feet" unchanged, while the
# lemmatizer returns the dictionary form "foot".
print("Stem", stemmer.stem("feet"))
print("lemmat", lemmatizer.lemmatize("feet"))

Stem feet
lemmat foot


In [15]:
# Irregular verb: neither the stemmer nor the lemmatizer without a POS tag
# changes "went"; only lemmatize(..., "v") maps it to "go".
print("Stem", stemmer.stem("went"))
print("lemmat", lemmatizer.lemmatize("went"))
print("lemmat", lemmatizer.lemmatize("went", "v"))

Stem went
lemmat went
lemmat go


In [16]:
# Lemmatization
# Lemmatize every remaining token. No part-of-speech tag is passed, so (as
# the "went" example shows) verb forms are left as they are at this stage.
raw_data_lemmat = [
    [lemmatizer.lemmatize(token) for token in post_tokens]
    for post_tokens in raw_data_stopword
]

In [17]:
# Stemming
# Apply the Porter stemmer to the lemmatized tokens, i.e. both
# normalisation steps are stacked: lemmatize first, then stem.
raw_data_stemm = [
    [stemmer.stem(token) for token in post_tokens]
    for post_tokens in raw_data_lemmat
]

In [18]:
# Fully preprocessed first post: lowercased, alphabetic-only, stopwords
# removed, lemmatized and stemmed.
raw_data_stemm[0]

['lerxst',
 'thing',
 'subject',
 'car',
 'organ',
 'univers',
 'maryland',
 'colleg',
 'park',
 'line',
 'wonder',
 'anyon',
 'could',
 'enlighten',
 'car',
 'saw',
 'day',
 'sport',
 'car',
 'look',
 'late',
 'earli',
 'call',
 'bricklin',
 'door',
 'realli',
 'small',
 'addit',
 'front',
 'bumper',
 'separ',
 'rest',
 'bodi',
 'know',
 'anyon',
 'tellm',
 'model',
 'name',
 'engin',
 'spec',
 'year',
 'product',
 'car',
 'made',
 'histori',
 'whatev',
 'info',
 'funki',
 'look',
 'car',
 'pleas',
 'thank',
 'il',
 'brought',
 'neighborhood',
 'lerxst']

Frequency Distribution

In [19]:
# Flatten the per-post stem lists into one corpus-wide list of stems so the
# frequency distribution can be computed over all posts at once.
# `chain` comes from the notebook's import cell at the top, keeping every
# dependency visible in one place.
# NOTE: the variable name's spelling is kept because the next cell reads it.
raw_data_stemm_flattern = list(chain.from_iterable(raw_data_stemm))

In [20]:

# Count how often each stem occurs across the whole corpus.
# NOTE(review): the most frequent stems ('line', 'subject', 'organ') match the
# header field names present in every post (Lines / Subject / Organization),
# so the top of this distribution likely reflects headers, not post content.
fdist = FreqDist(raw_data_stemm_flattern)
fdist

FreqDist({'line': 12961, 'subject': 12432, 'organ': 11691, 'would': 9492, 'one': 9268, 'use': 8828, 'write': 8768, 'articl': 7017, 'like': 6499, 'get': 6304, ...})