<a href="https://colab.research.google.com/github/mohanrajmit/ML-training/blob/master/Copy_of_lem.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Supplemental Data Cleaning: Using a Lemmatizer

### Test out the WordNet lemmatizer (read more about [WordNet](https://wordnet.princeton.edu/))

In [None]:
import nltk
# The WordNet corpus must be available locally before the lemmatizer is used.
nltk.download('wordnet')

# Build both normalizers used in the comparisons below:
#   wn -> WordNet lemmatizer, ps -> Porter stemmer.
wn, ps = nltk.WordNetLemmatizer(), nltk.PorterStemmer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# List every attribute of the lemmatizer instance; in the recorded output the
# non-dunder entries are `lemmatize` and `unicode_repr`.
dir(wn)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 'lemmatize',
 'unicode_repr']

In [None]:
# Run the Porter stemmer over related word forms (including a capitalised
# variant) and print one stem per line, in the order listed.
stem_samples = [
    'meanness', 'meaning', 'run', 'running', 'runner',
    "Meanness", "meaning",
]
for sample in stem_samples:
    print(ps.stem(sample))


mean
mean
run
run
runner
mean
mean


In [None]:
# Same words through the WordNet lemmatizer, for comparison with the stemmer.
for sample in ('meanness', 'meaning'):
    print(wn.lemmatize(sample))

meanness
meaning


In [None]:
# Toy stop-word removal: keep only the tokens that are not in the exclusion list.
tokenized = ['test', 'in', 'the', 'rest', 'of', 'for', 'new', 'last']
words_to_drop = ['in', 'on', 'the', 'of', 'for']
result = list(filter(lambda word: word not in words_to_drop, tokenized))
print(result)

['test', 'rest', 'new', 'last']


In [None]:
# Stem a singular form and its irregular plural, one result per line.
for sample in ('goose', 'geese'):
    print(ps.stem(sample))

goos
gees


In [None]:
import re

# Split on runs of non-word characters (spaces, '-', '+', ...).
# The pattern is a raw string: '\W' inside a normal string literal is an
# invalid escape sequence and triggers a warning on recent Python versions.
split_tokens = re.split(r'\W+', "some of the-words are+combined")
print(split_tokens)

['some', 'of', 'the', 'words', 'are', 'combined']


In [None]:
# Lemmatize the same singular/plural pair that was stemmed earlier.
for sample in ('goose', 'geese'):
    print(wn.lemmatize(sample))

goose
goose


### Read in raw text

In [None]:
# Fetch the NLTK stop-word corpus that `nltk.corpus.stopwords` reads in the
# next cell. The call's return value is shown as the cell output.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import pandas as pd
import re
import string
# Allow up to 100 characters per cell when DataFrames are displayed.
pd.set_option('display.max_colwidth', 100)

# English stop-word list from NLTK (needs the 'stopwords' corpus downloaded).
stopwords = nltk.corpus.stopwords.words('english')

# NOTE(review): absolute Colab path; the file must exist at this location
# before the cell is run.
csv_path = "/content/output.csv"
data = pd.read_csv(csv_path)
# Column names are kept exactly as read from the CSV; later cells use the
# 'Text' and 'Label' columns.

data.head()

Unnamed: 0,Text,Label
0,"الشركة العربية للسيارات مم۔\nNISSAN\nArabian Automobiles Co. l.l.c.\nINFINITI,\nPO Box: 2128, Du...",delivery note
1,"}\nNISSAN\nAl Masaood Automobiles Company L.L.C.\nP.O. Box 322, Abu Dhabi - United Arab Emiratos...",delivery note
2,FLEET DELIVERY NOTE\nDelivered To\nALAIN MOTORS RENT A CAR\nAddress\nDelivery Number\nInvoice Nu...,delivery note
3,FLEET DELIVERY NOTE\nDelivered To\nDelivery Number\nInvoice Number\nDelivery Date\nInvoice Accou...,delivery note
4,"NISSAN\nAl Masagod Automobiles Company L.L.C.\nP.O.Box 322, Abu Dhaby - United Arab Emirates\nH....",delivery note


### Clean up text

In [None]:
def clean_text(text, stop_words=None):
    """Strip punctuation from ``text``, tokenize it, and drop stop words.

    Parameters
    ----------
    text : str
        Raw document. Case is NOT changed here; lower-case the text before
        calling if the stop-word list is lower-case.
    stop_words : iterable of str, optional
        Words to discard. When omitted, the notebook-level ``stopwords``
        list is used, so existing single-argument calls behave as before.

    Returns
    -------
    list of str
        Remaining tokens in document order. Empty-string tokens are never
        returned (the previous ``re.split`` version emitted ``''`` whenever
        the text started or ended with a non-word character).
    """
    if stop_words is None:
        stop_words = stopwords
    # Set membership is O(1) per token instead of scanning a list.
    stop_set = set(stop_words)

    # Delete every ASCII punctuation character in one pass.
    no_punct = text.translate(str.maketrans('', '', string.punctuation))

    # r'\w+' matches exactly the runs that splitting on r'\W+' would keep,
    # minus the empty strings at the edges. Raw string avoids the invalid
    # '\W' escape warning.
    return [token for token in re.findall(r'\w+', no_punct) if token not in stop_set]

# Lower-case every document first, then tokenize and filter it with clean_text.
lowercased_text = data['Text'].apply(lambda raw: raw.lower())
data['body_text_nostop'] = lowercased_text.apply(clean_text)

data.head()

Unnamed: 0,Text,Label,body_text_nostop
0,"الشركة العربية للسيارات مم۔\nNISSAN\nArabian Automobiles Co. l.l.c.\nINFINITI,\nPO Box: 2128, Du...",delivery note,"[الشركة, العربية, للسيارات, مم, nissan, arabian, automobiles, co, llc, infiniti, po, box, 2128, ..."
1,"}\nNISSAN\nAl Masaood Automobiles Company L.L.C.\nP.O. Box 322, Abu Dhabi - United Arab Emiratos...",delivery note,"[, nissan, al, masaood, automobiles, company, llc, po, box, 322, abu, dhabi, united, arab, emira..."
2,FLEET DELIVERY NOTE\nDelivered To\nALAIN MOTORS RENT A CAR\nAddress\nDelivery Number\nInvoice Nu...,delivery note,"[fleet, delivery, note, delivered, alain, motors, rent, car, address, delivery, number, invoice,..."
3,FLEET DELIVERY NOTE\nDelivered To\nDelivery Number\nInvoice Number\nDelivery Date\nInvoice Accou...,delivery note,"[fleet, delivery, note, delivered, delivery, number, invoice, number, delivery, date, invoice, a..."
4,"NISSAN\nAl Masagod Automobiles Company L.L.C.\nP.O.Box 322, Abu Dhaby - United Arab Emirates\nH....",delivery note,"[nissan, al, masagod, automobiles, company, llc, pobox, 322, abu, dhaby, united, arab, emirates,..."


### Lemmatize text

In [None]:
def lemmatizing(tokenized_text):
    """Return a new list with every token passed through ``wn.lemmatize``.

    ``tokenized_text`` is an iterable of word strings; order and length are
    preserved. Uses the notebook-level WordNet lemmatizer ``wn``.
    """
    return list(map(wn.lemmatize, tokenized_text))

# Lemmatize each token list; the function is passed directly, no wrapper needed.
data['body_text_lemmatized'] = data['body_text_nostop'].apply(lemmatizing)

data.head(10)

Unnamed: 0,Text,Label,body_text_nostop,body_text_lemmatized
0,"الشركة العربية للسيارات مم۔\nNISSAN\nArabian Automobiles Co. l.l.c.\nINFINITI,\nPO Box: 2128, Du...",delivery note,"[الشركة, العربية, للسيارات, مم, nissan, arabian, automobiles, co, llc, infiniti, po, box, 2128, ...","[الشركة, العربية, للسيارات, مم, nissan, arabian, automobile, co, llc, infiniti, po, box, 2128, d..."
1,"}\nNISSAN\nAl Masaood Automobiles Company L.L.C.\nP.O. Box 322, Abu Dhabi - United Arab Emiratos...",delivery note,"[, nissan, al, masaood, automobiles, company, llc, po, box, 322, abu, dhabi, united, arab, emira...","[, nissan, al, masaood, automobile, company, llc, po, box, 322, abu, dhabi, united, arab, emirat..."
2,FLEET DELIVERY NOTE\nDelivered To\nALAIN MOTORS RENT A CAR\nAddress\nDelivery Number\nInvoice Nu...,delivery note,"[fleet, delivery, note, delivered, alain, motors, rent, car, address, delivery, number, invoice,...","[fleet, delivery, note, delivered, alain, motor, rent, car, address, delivery, number, invoice, ..."
3,FLEET DELIVERY NOTE\nDelivered To\nDelivery Number\nInvoice Number\nDelivery Date\nInvoice Accou...,delivery note,"[fleet, delivery, note, delivered, delivery, number, invoice, number, delivery, date, invoice, a...","[fleet, delivery, note, delivered, delivery, number, invoice, number, delivery, date, invoice, a..."
4,"NISSAN\nAl Masagod Automobiles Company L.L.C.\nP.O.Box 322, Abu Dhaby - United Arab Emirates\nH....",delivery note,"[nissan, al, masagod, automobiles, company, llc, pobox, 322, abu, dhaby, united, arab, emirates,...","[nissan, al, masagod, automobile, company, llc, pobox, 322, abu, dhaby, united, arab, emirate, h..."
5,BROKER\nSINCLG\nPARG DELIVERY NOTE\nAmer\n055550 1986\nAwir Automarket\nالشركة العربية للسيارات ...,delivery note,"[broker, sinclg, parg, delivery, note, amer, 055550, 1986, awir, automarket, الشركة, العربية, لل...","[broker, sinclg, parg, delivery, note, amer, 055550, 1986, awir, automarket, الشركة, العربية, لل..."
6,"BROKER MULIP\nMuciP PAGE\nPAGE DELIVERY NOTE\n,\nBuraya\nالشركة العربية للسيارات د...\nArablan A...",delivery note,"[broker, mulip, mucip, page, page, delivery, note, buraya, الشركة, العربية, للسيارات, د, arablan...","[broker, mulip, mucip, page, page, delivery, note, buraya, الشركة, العربية, للسيارات, د, arablan..."
7,"Collection\nIbrahin\nالشركة العربية للسيارات حمم\nNISSAN\nArablan Automobiles Co, LLC,\nINFINITI...",delivery note,"[collection, ibrahin, الشركة, العربية, للسيارات, حمم, nissan, arablan, automobiles, co, llc, inf...","[collection, ibrahin, الشركة, العربية, للسيارات, حمم, nissan, arablan, automobile, co, llc, infi..."
8,"Al Masaood Autornabiles Company LLC,\nPO, Box 322. Abu Dhabi - United Arab Emiraios\nFiect Najda...",delivery note,"[al, masaood, autornabiles, company, llc, po, box, 322, abu, dhabi, united, arab, emiraios, fiec...","[al, masaood, autornabiles, company, llc, po, box, 322, abu, dhabi, united, arab, emiraios, fiec..."
9,NISSAN\nAl Masaoad Automobiles Company L.L.C.\nP.O.Bax S22. Abu Dhabi - United Arab Emirates\nFl...,delivery note,"[nissan, al, masaoad, automobiles, company, llc, pobax, s22, abu, dhabi, united, arab, emirates,...","[nissan, al, masaoad, automobile, company, llc, pobax, s22, abu, dhabi, united, arab, emirate, f..."


In [None]:
def clean_text(text, stop_words=None, stemmer=None):
    """Normalize a raw document into a list of stemmed tokens.

    Steps: remove ASCII punctuation, lower-case, tokenize on word
    characters, drop stop words, then stem what is left. This redefines the
    earlier ``clean_text`` in this notebook and is the variant passed to
    ``CountVectorizer(analyzer=...)``, which calls it with one argument.

    Parameters
    ----------
    text : str
        Raw document.
    stop_words : iterable of str, optional
        Words to discard (compared after lower-casing, before stemming).
        Defaults to the notebook-level ``stopwords`` list.
    stemmer : object with a ``stem(word)`` method, optional
        Defaults to the notebook-level Porter stemmer ``ps``.

    Returns
    -------
    list of str
        Stemmed tokens in document order. Empty-string tokens are never
        returned (the previous ``re.split`` version emitted ``''`` whenever
        the text started or ended with a non-word character, which then
        became a vocabulary entry).
    """
    if stop_words is None:
        stop_words = stopwords
    if stemmer is None:
        stemmer = ps
    # Set membership is O(1) per token instead of scanning a list.
    stop_set = set(stop_words)

    # Delete punctuation in one pass, then lower-case the whole string.
    lowered = text.translate(str.maketrans('', '', string.punctuation)).lower()

    # Raw-string pattern avoids the invalid '\W' escape warning; findall on
    # r'\w+' yields the same tokens as splitting on r'\W+' minus the empties.
    return [stemmer.stem(token) for token in re.findall(r'\w+', lowered) if token not in stop_set]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:

from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts: clean_text is used as the analyzer, so each document is
# tokenized by that function instead of the vectorizer's built-in tokenizer.
count_vect = CountVectorizer(analyzer=clean_text)
X_counts = count_vect.fit_transform(data['Text'])
print(X_counts.shape)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer get_feature_names_out(); fall back for older installations.
# .tolist() converts the returned array to plain Python strings for printing.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = count_vect.get_feature_names_out().tolist()
else:
    feature_names = count_vect.get_feature_names()
print(feature_names)

(32, 2673)
['', '0', '00', '000', '0000000000', '0000016315', '0000017047', '0000018367', '0000211019', '00003', '0000411965', '0001216', '0001231', '0002', '0003947', '001', '0085010549', '0085011777', '009073', '00971504884177', '00971523578989', '00vf', '01', '010', '0119760', '01950335', '02', '0202631', '02150092', '02217', '02488353', '0267223', '026766323', '028848322', '02nov2019', '03', '0302664', '0302680', '0302681', '0302698', '0302701', '0302702', '0302707', '03112019', '035000', '037219999', '037222216', '04', '0405579', '042671287', '042950333', '042952222', '042952229', '042952702', '042952722', '042952828', '043080036', '043331006', '043390999', '043900036', '043980036', '045', '047950333', '04dec19', '05010895', '0506928039', '05082019', '0522238183', '055550', '0558694459', '0564139973', '05dec2019', '060', '0607666158', '0609821', '062776745', '062776746', '065396111', '065398111', '065398114', '06539811ajman', '06539b111', '065542333', '065985300', '065986300', '06

In [None]:
# Target vector: the 'Label' column of the loaded DataFrame.
y=data["Label"]

In [None]:
# Number of labels; the first dimension should match X_counts.shape printed earlier.
y.shape

(32,)

In [None]:
# Peek at one label. NOTE(review): this looks up index label 0, which is the
# first row only if `data` kept the default integer index from read_csv.
y[0]

'delivery note'