# TP2: Text Pre-processing
__Solution by Ayoub Nainia__

Downloads

In [None]:
import nltk
# Download the NLTK data packages referenced by the rest of the notebook.
# The last call is the cell's final expression, so its return value is displayed.
nltk.download('webtext')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package webtext to /root/nltk_data...
[nltk_data]   Package webtext is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.corpus import webtext
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import punkt

In [None]:
# Bare expression: displays the repr of the imported punkt module.
punkt

<module 'nltk.tokenize.punkt' from '/usr/local/lib/python3.7/dist-packages/nltk/tokenize/punkt.py'>

## 1. Store in a four column dataframe, the name of the document, the number of stop words, tokens and uppercase tokens for each document. Show First 5 and last 5 lines of the dataframe.

In [None]:
# Initializations
# Identifiers of the documents in the webtext corpus; used as document names below.
textFiles = webtext.fileids()
# English stop-word list provided by NLTK.
engStopWords = stopwords.words('english')

In [None]:
def getCorpusVals(corpus):
  """Compute per-document token statistics for webtext documents.

  Args:
    corpus: iterable of webtext file ids (e.g. the list returned by
      ``webtext.fileids()``).

  Returns:
    A list with one row per document, in input order. Each row is
    ``[file id, number of stop-word tokens, number of tokens,
    number of uppercase tokens]``.

  Stop words are matched case-insensitively (the token is lowercased before
  the lookup), so capitalised occurrences are counted too; this is the same
  rule ``eliminate`` and ``noStopWords`` use when removing stop words.
  A token counts as uppercase when ``str.isupper()`` is true for it.
  """
  # Build the lookup set once: membership tests are then O(1) per token
  # instead of a scan over the whole stop-word list.
  stopWordSet = set(engStopWords)

  outList = []
  for docName in corpus:
    tokenizedText = word_tokenize(webtext.raw(docName))

    nbStopWords = sum(1 for token in tokenizedText if token.lower() in stopWordSet)
    nbTokens = len(tokenizedText)
    nbUpperTokens = sum(1 for token in tokenizedText if token.isupper())

    outList.append([docName, nbStopWords, nbTokens, nbUpperTokens])

  return outList


In [None]:
import pandas as pd

# Display dataframe
def displayDataFrame(corpus):
  """Build the per-document statistics table for the given documents.

  Args:
    corpus: iterable of webtext file ids, forwarded to ``getCorpusVals``.

  Returns:
    A DataFrame with one row per document and the columns
    'Document', 'Stop words', 'Tokens' and 'Uppercased Tokens'.
  """
  # Use the argument rather than the global textFiles, so the function
  # reports on whatever corpus the caller passes in. Passing the labels via
  # `columns=` also yields a correctly labelled empty frame for an empty corpus.
  return pd.DataFrame(
      getCorpusVals(corpus),
      columns=['Document', 'Stop words', 'Tokens', 'Uppercased Tokens'],
  )

__Displaying the Dataframe__

In [None]:
# Build and display the statistics table for the webtext documents.
displayDataFrame(textFiles)

Unnamed: 0,Document,Stop words,Tokens,Uppercased Tokens
0,firefox.txt,25050,96056,2079
1,grail.txt,3308,16450,1760
2,overheard.txt,55592,210057,7456
3,pirates.txt,5790,21767,1771
4,singles.txt,638,4465,410
5,wine.txt,7901,31140,676


## 2. Eliminate stop words and transform all upper case letters into lower case letters.

The `eliminate` function below applies both requested operations (removing stop words and converting uppercase letters to lowercase) to all texts.

In [None]:
def eliminate(files, engStopWords):
  """Remove stop words from webtext documents and lowercase what remains.

  Args:
    files: iterable of webtext file ids.
    engStopWords: iterable of stop words; tokens are lowercased before being
      compared against it.

  Returns:
    A list with one string per document, in input order: the surviving
    tokens, lowercased and joined with single spaces.
  """
  # One set build up front replaces a list scan for every token.
  stopWordSet = set(engStopWords)

  finalText = []
  for fileId in files:
    # Lowercase each token once and reuse it for both the filter and the output.
    loweredTokens = (token.lower() for token in word_tokenize(webtext.raw(fileId)))
    finalText.append(' '.join(token for token in loweredTokens if token not in stopWordSet))

  return finalText

In [None]:
# executing the eliminate function
# Result: one lowercased, stop-word-free string per document, in textFiles order.
eliminatedText = eliminate(textFiles, engStopWords) 

In [None]:
# Sanity check: number of cleaned documents.
# (The bare `eliminatedText` expression that used to precede this line was not
# the cell's last statement, so it displayed nothing and has been dropped.)
print(len(eliminatedText))

6


In [None]:
# Display the cleaned documents (the full string of every document is echoed).
eliminatedText

 "pirates carribean : dead man 's chest , ted elliott & terry rossio [ view looking straight rolling swells , sound wind thunder , low heartbeat ] scene : port royal [ teacups table rain ] [ sheet music music stands rain ] [ bouquet white orchids , elizabeth sitting rain holding bouquet ] [ men rowing , men horseback , sound thunder ] [ eitc logo flag blowing wind ] [ many rowboats entering harbor ] [ elizabeth sitting alone , distance ] [ marines running , kick door ] [ mule seen left barn marines enter ] [ liz looking shoulder ] [ elizabeth drops bouquet ] [ manacles , escorted red coats ] elizabeth swann : ... ! [ elizabeth runs ] elizabeth swann : happening ? turner : n't know . look beautiful . elizabeth swann : think 's bad luck groom see bride wedding . [ marines cross long axes bar governor entering ] [ beckett , white hair curls , standing mercer ] lord cutler beckett : governor weatherby swann , 's long . lord cutler beckett : lord ... actually . lord cutler beckett : fact , 

Below, we split the previous step into two separate operations: removing stop words first, then converting the remaining tokens to lowercase.

In [None]:
def noStopWords(files):
  """Tokenize webtext documents and drop their stop words.

  Args:
    files: iterable of webtext file ids.

  Returns:
    A list with one token list per document, in input order. Tokens keep
    their original case; a token is dropped when its lowercase form is in
    the module-level ``engStopWords``.
  """
  # Build the lookup set once instead of scanning the list for every token.
  stopWordSet = set(engStopWords)

  finalText = []
  for fileId in files:
    tokenizedText = word_tokenize(webtext.raw(fileId))
    finalText.append([token for token in tokenizedText if token.lower() not in stopWordSet])

  return finalText

def lowercasing(files):
  """Lowercase every token of every document.

  Args:
    files: iterable of documents, each an iterable of string tokens
      (for example the output of ``noStopWords``).

  Returns:
    A list of lists holding the lowercased tokens, in the same order.
  """
  return [[token.lower() for token in docTokens] for docTokens in files]



## 3. Find the 10 most frequent tokens using the most_common method of the Counter

In [None]:
from collections import Counter

def mostFrequent(files, names=None):
  """Tabulate the 10 most frequent tokens of each document.

  Args:
    files: iterable of document strings; each one is tokenized with
      ``word_tokenize``.
    names: optional row labels, one per document. When omitted, the six
      webtext file names are used if exactly six documents are given
      (the original behaviour); otherwise rows are numbered from 0.

  Returns:
    A DataFrame with one row per document and one column per rank
    (0 = most frequent). Each cell is a ``(token, count)`` tuple.

  Raises:
    ValueError: raised by pandas when ``names`` is given but its length does
      not match the number of documents.
  """
  defaultNames = ['firefox.txt', 'grail.txt', 'overheard.txt', 'pirates.txt', 'singles.txt', 'wine.txt']

  mostCommon = [Counter(word_tokenize(doc)).most_common(10) for doc in files]

  if names is None:
    # Only fall back to the webtext names when they can actually fit.
    names = defaultNames if len(mostCommon) == len(defaultNames) else range(len(mostCommon))

  # Passing the labels as the index gives the same table as the former
  # transpose / set columns / transpose sequence.
  return pd.DataFrame(mostCommon, index=list(names))

In [None]:
# Top-10 token counts per document, computed on the stop-word-free text.
mostFrequent(eliminatedText)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
firefox.txt,"('', 1330)","(., 1021)","(n't, 944)","(``, 919)","((, 878)","(page, 866)","(firefox, 850)","(), 799)","(:, 721)","(window, 611)"
grail.txt,"(:, 1197)","(., 838)","(!, 830)","(,, 746)","([, 320)","(], 320)","(arthur, 261)","(?, 223)","(--, 151)","('s, 144)"
overheard.txt,"(:, 11501)","(., 10727)","(,, 8722)","(?, 4177)","(#, 3580)","(!, 3042)","(girl, 2950)","(guy, 2726)","('s, 2586)","(n't, 2304)"
pirates.txt,"(:, 916)","(,, 855)","(., 802)","([, 643)","(], 642)","(jack, 469)","(!, 430)","(?, 232)","(sparrow, 227)","('s, 226)"
singles.txt,"(,, 554)","(., 312)","(lady, 88)","(seeks, 72)","(male, 42)","(looking, 34)","(fun, 31)","(&, 30)","(attractive, 29)","(slim, 27)"
wine.txt,"(., 2826)","(,, 1550)","(***, 608)","(-, 518)","(good, 363)","((, 328)","(), 328)","(quite, 303)","(fruit, 295)","(**, 250)"


In [None]:
# eliminatedText holds one string per document, so this is the length of the
# first cleaned document in characters, not in tokens.
len(eliminatedText[0])

463420

## 4. Find the 10 rarest tokens and eliminate them.

__Finding 10 rarest tokens__

In [None]:
def leastFrequent(files):
  """Return the 10 least frequent tokens of each document.

  Args:
    files: iterable of document strings; each one is tokenized with
      ``word_tokenize``.

  Returns:
    A list with one entry per document. Each entry is the last 10 items of
    ``Counter.most_common()`` for that document, i.e. a list of
    ``(token, count)`` tuples (fewer than 10 when the document has fewer
    distinct tokens).
  """
  return [Counter(word_tokenize(doc)).most_common()[-10:] for doc in files]

In [None]:
# One row per document, one column per position among its 10 rarest tokens.
# Row labels come from textFiles (the ids eliminatedText was built from)
# rather than a hard-coded list, so the table stays correctly labelled if the
# corpus changes; setting them as the index replaces the double transpose.
dfLeastCommon = pd.DataFrame(leastFrequent(eliminatedText), index=textFiles)
dfLeastCommon

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
firefox.txt,"(prefix=, 1)","(ctrl-shift-w, 1)","(pads, 1)","(user_pref, 1)","(browser.downloadmanager.behavior, 1)","(tinybmp, 1)","(distracting, 1)","(whell, 1)","(heights, 1)","(workspace, 1)"
grail.txt,"(anybody, 1)","(armed, 1)","(blanket, 1)","(hospital, 1)","(riiight, 1)","(offensive, 1)","(weapon, 1)","(sonny, 1)","(pack, 1)","(cameraman, 1)"
overheard.txt,"(october, 1)","(insisted, 1)","('can, 1)","(booga-wooga-wooga, 1)","(shish-ka-bobba-bobba, 1)","(cockadoodledoo, 1)","(trick-or-treating, 1)","(crapper, 1)","(pseudo-lesbian, 1)","(eccentric, 1)"
pirates.txt,"(shores, 1)","(knows, 1)","(dose, 1)","(boots, 1)","(descends, 1)","(apple, 1)","(juice, 1)","(dripples, 1)","(chin, 1)","(credits, 1)"
singles.txt,"(n/d, 1)","(ala, 1)","(letters, 1)","(answered, 1)","(dont, 1)","(forget.., 1)","(free, 1)","(advertise, 1)","(perfect, 1)","(match, 1)"
wine.txt,"(rhone-like, 1)","(overawed, 1)","(rayas, 1)","(ideally, 1)","(dies, 1)","(average, 1)","(set, 1)","(<, 1)","(/ul, 1)","(>, 1)"


In [None]:
# notice the format of the returned array
# Each document's entry is a list of (token, count) tuples.
leastFrequent(eliminatedText)[0]

[('prefix=', 1),
 ('ctrl-shift-w', 1),
 ('pads', 1),
 ('user_pref', 1),
 ('browser.downloadmanager.behavior', 1),
 ('tinybmp', 1),
 ('distracting', 1),
 ('whell', 1),
 ('heights', 1),
 ('workspace', 1)]

__Removing the 10 rarest tokens from each document__

In [None]:
def removeLeastFrequent(text):
  """Remove each document's 10 rarest tokens from that document.

  Args:
    text: sequence of document strings.

  Returns:
    A list with one token list per document, in input order, containing the
    document's tokens minus every occurrence of the tokens that
    ``leastFrequent`` reports for it. A document with 10 or fewer distinct
    tokens therefore ends up empty.
  """
  leastFreq = leastFrequent(text)

  finalText = []
  # Pair every document with its own rare-token list. Sizing the lookup from
  # the first document's list (as before) breaks when a later document has a
  # different number of rare tokens.
  for docText, docLeastFreq in zip(text, leastFreq):
    rareTokens = {token for token, _count in docLeastFreq}
    finalText.append([token for token in word_tokenize(docText) if token not in rareTokens])

  return finalText


In [None]:
# Token lists per document with each document's rarest tokens removed.
q4 = removeLeastFrequent(eliminatedText)
# Tokens of the first document before removal, kept for the before/after count below.
test = word_tokenize(eliminatedText[0])

In [None]:
# First line: token count of the first document before removal.
# Second line: token count of the same document after removal.
print("number of tokens for firefox.txt BEFORE removing 10 rarest tokens:", len(test))
print("number of tokens for firefox.txt removing 10 rarest tokens:", len(q4[0]))

number of tokens for firefox.txt BEFORE removing 10 rarest tokens: 69590
number of tokens for firefox.txt removing 10 rarest tokens: 69580


## 5. Write a function that eliminates numbers and punctuation in a text document.

In [None]:
import re

# output text from question 4
# (q4 holds one token list per document)
text = q4
def q5(text):
  """Strip digits and punctuation from tokenized documents.

  Each token is lowercased and every character outside ``a-z`` is removed;
  tokens that end up empty are dropped.

  Args:
    text: iterable of documents, each an iterable of string tokens.

  Returns:
    A list with one list of letters-only tokens per document.
  """
  outputText = []
  for docTokens in text:
    # Clean every token once, then keep the non-empty results.
    lettersOnly = (re.sub('[^a-z]+', '', token.lower()) for token in docTokens)
    outputText.append([cleaned for cleaned in lettersOnly if cleaned])

  return outputText

In [None]:
# display text
q5_output = q5(text)
# Re-join each document's tokens into one space-separated string.
q5_output_text = [' '.join(docTokens) for docTokens in q5_output]

print("Number of documents: ", len(q5_output_text))
print(q5_output_text)

Number of documents:  6


## 6. Write a function that converts numbers in a text document into words (1 => one). Remember to use the inflect package.

#### Since we need numbers for this task, we will first create a function that removes only punctuation and leaves the numbers so we can convert them to words later. 

In [None]:
def removePunct(text):
  """Remove punctuation from tokenized documents while keeping digits.

  Each token is lowercased, every run of characters outside ``a-z0-9`` is
  treated as a separator, and the token is split on those separators.
  Tokens made only of punctuation disappear; a token such as
  ``"ctrl-shift-w"`` yields ``"ctrl"``, ``"shift"``, ``"w"``.

  Joining a document's result with single spaces gives the same text as the
  previous implementation apart from its stray leading, trailing and doubled
  spaces. No returned token contains whitespace, so digit groups are
  standalone tokens that a later ``str.isnumeric()`` check can recognise.

  Args:
    text: iterable of documents, each an iterable of string tokens.

  Returns:
    A list with one list of cleaned tokens per document.
  """
  textOutput = []
  for docTokens in text:
    cleanedTokens = []
    for token in docTokens:
      # split() with no argument discards empty pieces, which also drops
      # tokens that consisted solely of punctuation.
      cleanedTokens.extend(re.sub('[^a-z0-9]+', ' ', token.lower()).split())
    textOutput.append(cleanedTokens)

  return textOutput


In [None]:
# tokens without punctuation
text = q4
# Digits are kept at this stage so they can be converted to words afterwards.
q6_no_punct = removePunct(text)
q6_no_punct

In [None]:
# display text
def dispText(text):
  """Join each document's tokens into a single space-separated string.

  Args:
    text: iterable of documents, each an iterable of string tokens.

  Returns:
    A list with one string per document.
  """
  return [' '.join(docTokens) for docTokens in text]

# One string per document, built from the punctuation-free tokens.
q6_output_text = dispText(q6_no_punct)
print("Number of documents (removed punctiont): ", len(q6_output_text))
print(q6_output_text)

Number of documents (removed punctiont):  6


#### Numbers to words with inflect

In [None]:
import inflect
# Shared inflect engine used by num_to_word.
inflector = inflect.engine()

In [None]:
def num_to_word(text):
  """Spell out numeric tokens in tokenized documents.

  Args:
    text: iterable of documents, each an iterable of string tokens.

  Returns:
    A list with one token list per document in which every token for which
    ``str.isnumeric()`` is true has been replaced by
    ``inflector.number_to_words(token)``; all other tokens are unchanged.
  """
  def _spell(token):
    # Only purely numeric tokens are handed to inflect.
    if token.isnumeric():
      return inflector.number_to_words(token)
    return token

  return [[_spell(token) for token in docTokens] for docTokens in text]


In [None]:
text = q6_no_punct
# num to word
q6_num_to_word = num_to_word(text)
# display as text instead of tokens
# (this rebinds q6_num_to_word from token lists to one string per document)
q6_num_to_word = dispText(q6_num_to_word)

In [None]:
# Print the list of document strings with numbers spelled out.
print(q6_num_to_word)



## 7. Write a function for counting emojis in text.

## 8. Apply a porter stemming (from nltk.stem import PorterStemmer).

In [None]:
from nltk.stem import PorterStemmer
# Single Porter stemmer instance reused by app_porter_stemmer.
stemmer = PorterStemmer()
# stemmer.stem("traditional")

In [None]:
def app_porter_stemmer(text):
  """Stem every token of every document with the module-level ``stemmer``.

  Args:
    text: iterable of document strings; each one is tokenized with
      ``word_tokenize`` before stemming.

  Returns:
    A list with one list of stemmed tokens per document.
  """
  return [[stemmer.stem(token) for token in word_tokenize(doc)] for doc in text]

In [None]:
# Applying the porter stemmer
# Input: q6_num_to_word, which at this point holds one string per document.
text = q6_num_to_word
pStemmed = app_porter_stemmer(text)
dispText(pStemmed)

['cooki manag n t allow site set remov cooki set futur cooki stay check full screen mode press ctrl n open new browser download dialog left open add icon context menu call tab bar made proper toolbar given abil collaps expand xul implement cocoa style toolbar custom ifdef moz phoenix custom dialog s toolbar small icon small icon check nightli build tinderboxen phoenix finish tear pref ui piec make suck mozbrows script n t start correct binari need bookmark group icon drop top palett box hork thing keyboard shortcut increas text size broken default phoenix bookmark cust need toolbar spacer spring spacer custom ca n t launch phoenix mozilla run vice versa separ avail toolbar item toolbar layout histori menu back button n t work attempt open back button histori menu second window gener anoth back button toolbar phoenix mozilla comfort share sourc tree phoenix forget cooki even though enabl cooki current session chosen start custom toolbar make current page go blank check satchel successor

## 9. Apply Lancaster stemming and compare with the previous result (from nltk.stem.lancaster import LancasterStemmer).

In [None]:
from nltk.stem.lancaster import LancasterStemmer
# Single Lancaster stemmer instance reused by app_lancaster_stemmer.
stemmer2 = LancasterStemmer()

In [None]:
def app_lancaster_stemmer(text):
  """Stem every token of every document with the module-level ``stemmer2``.

  Args:
    text: iterable of document strings; each one is tokenized with
      ``word_tokenize`` before stemming.

  Returns:
    A list with one list of stemmed tokens per document.
  """
  return [[stemmer2.stem(token) for token in word_tokenize(doc)] for doc in text]

In [None]:
# Applying lancaster stemmer
# Pass the input explicitly instead of relying on the shared `text` variable:
# other cells rebind `text` to token lists, which app_lancaster_stemmer cannot
# tokenize, so the result would depend on cell execution order.
lStemmed = app_lancaster_stemmer(q6_num_to_word)
dispText(lStemmed)

['cooky man n t allow sit set remov cooky set fut cooky stay check ful screen mod press ctrl n op new brows download dialog left op ad icon context menu cal tab bar mad prop toolb giv abl collaps expand xul impl coco styl toolb custom ifdef moz phoenix custom dialog s toolb smal icon smal icon check night build tinderbox phoenix fin tear pref ui piec mak suck mozbrows script n t start correct bin nee bookmark group icon drop top palet box hork thing keyboard shortcut increas text siz brok default phoenix bookmark cust nee toolb spac spring spac custom ca n t launch phoenix mozill run vic vers sep avail toolb item toolb layout hist menu back button n t work attempt op back button hist menu second window gen anoth back button toolb phoenix mozill comfort shar sourc tre phoenix forget cooky ev though en cooky cur sess chos start custom toolb mak cur pag go blank check satchel success wallet mous back menu second window hork thing person toolb nee min height avoid pop new fold ptoolb conte

__Comparing the Porter and Lancaster stemmers__

In [None]:

# Both stemmers' outputs, Porter first and Lancaster second
listComparison = [pStemmed, lStemmed]

# One row per document, one column per stemmer
dfComparison = pd.DataFrame(listComparison).transpose()
dfComparison.columns = ['Porter stemmer', 'Lancaster stemmer']
dfComparison

Unnamed: 0,Porter stemmer,Lancaster stemmer
0,"[cooki, manag, n, t, allow, site, set, remov, ...","[cooky, man, n, t, allow, sit, set, remov, coo..."
1,"[scene, one, wind, clop, clop, clop, king, art...","[scen, on, wind, clop, clop, clop, king, arth,..."
2,"[white, guy, plan, even, asian, girl, yeah, an...","[whit, guy, plan, ev, as, girl, yeah, angry, w..."
3,"[pirat, carribean, dead, man, s, chest, ted, e...","[pir, carrib, dead, man, s, chest, ted, elliot..."
4,"[twenty-f, sexi, male, seek, attrac, older, si...","[twenty-five, sexy, mal, seek, attrac, old, si..."
5,"[love, delic, fragrant, rhone, wine, polish, l...","[lov, del, fragr, rhon, win, pol, leath, straw..."


## 10. Take the initial text and apply the lemmatization (from nltk.stem import WordNetLemmatizer).

In [None]:
# necessary imports
from nltk.stem import WordNetLemmatizer

# Single lemmatizer instance reused by w_net_lemm.
lemmatizer = WordNetLemmatizer()

# test
# lemmatizer.lemmatize('bats')

In [None]:
# Lemmatizing the whole text
def w_net_lemm(text):
  """Lemmatize every token of every document with the module-level ``lemmatizer``.

  The input must already be tokenized; no tokenization is done here.

  Args:
    text: iterable of documents, each an iterable of string tokens.

  Returns:
    A list with one list of lemmatized tokens per document.
  """
  return [[lemmatizer.lemmatize(token) for token in docTokens] for docTokens in text]

In [None]:
# execution
# NOTE(review): the section heading asks for the initial text, but the input
# here is the punctuation-free token lists from question 6 — confirm this is intended.
text = q6_no_punct
w_net_lemming = w_net_lemm(text)
dispText(w_net_lemming)

 'pirate carribean dead man  s chest ted elliott terry rossio view looking straight rolling swell sound wind thunder low heartbeat scene port royal teacup table rain sheet music music stand rain bouquet white orchid elizabeth sitting rain holding bouquet men rowing men horseback sound thunder eitc logo flag blowing wind many rowboat entering harbor elizabeth sitting alone distance marine running kick door mule seen left barn marine enter liz looking shoulder elizabeth drop bouquet manacle escorted red coat elizabeth swann elizabeth run elizabeth swann happening turner n t know look beautiful elizabeth swann think  s bad luck groom see bride wedding marine cross long ax bar governor entering beckett white hair curl standing mercer lord cutler beckett governor weatherby swann  s long lord cutler beckett lord actually lord cutler beckett fact  do  mister mercer warrant arrest one william turner lord cutler beckett oh  s annoying mistake arrest elizabeth swann charge turner beckett take an

### What is Wordnet ?

WordNet is a large, publicly available lexical database that exists for many languages. It aims to establish structured semantic relationships between words, and it also offers lemmatization capabilities; it is one of the earliest and most commonly used lemmatizers.

### What are its supported languages ?

Wordnet has [many versions](http://globalwordnet.org/resources/wordnets-in-the-world/) for other languages, but the English version is currently the most complete one. 
```

