In [1]:
import pandas as pd
from collections import Counter
import re
import nltk

###### Reading data into a pandas dataframe

In [2]:
# Load the food-inspection records into a DataFrame.
# `pd.DataFrame.from_csv` is deprecated (and gone in pandas >= 1.0); `read_csv` with
# `index_col=0, parse_dates=True` reproduces the defaults `from_csv` used.
df = pd.read_csv(
    'C:/Users/Adroit/GoogleDrive/MScA/nlp/assignment_2/Food_Inspections.csv',
    index_col=0,
    parse_dates=True,
)

  """Entry point for launching an IPython kernel.


###### Dropping rows where we don't have any violations

In [3]:
# Keep only the inspections that actually recorded a violation text
df = df.dropna(subset=['Violations'])

###### Dropping rows where Results is not "Fail", as we only care about failed food inspections

In [4]:
# Keep only the failed inspections.
# The explicit .copy() makes the result an independent frame, so assigning a new
# column to it afterwards does not trigger pandas' SettingWithCopyWarning.
df = df.loc[df['Results'] == 'Fail'].copy()

###### Some restaurants have more than one violation. These violations are delimited by "|". We apply a regex to split each inspection's violations into a list

In [5]:
# One list of violation strings per inspection; a run of "|" counts as a single delimiter
df['Violations_list'] = df['Violations'].map(re.compile("[|]+").split)

In [6]:
# Flatten the per-inspection lists into one flat list of individual violations
violations = [
    single_violation
    for violationlist in df['Violations_list'].tolist()
    for single_violation in violationlist
]

###### Removing the violation description and keeping only the text that follows "- Comments:"

In [7]:
# Split every violation on the "- Comments:" marker; violations without the marker
# yield a single piece and are skipped. The piece right after the first marker is
# kept, stripped of surrounding whitespace and upper-cased.
split_violations = (re.split("- Comments:", text) for text in violations)
violations_comments = [
    pieces[1].strip().upper()
    for pieces in split_violations
    if len(pieces) > 1
]

In [8]:
# Merge every comment into one comma-separated string for tokenisation.
# NOTE(review): the "," separator presumably adds extra "," tokens to the raw
# frequency count computed from this string; confirm whether that is intended.
all_violations = str.join(",", violations_comments)

###### Finding the most frequently used words

In [9]:
# Tokenise the combined comment text and count every token as-is
words = nltk.word_tokenize(all_violations)
fdist = nltk.probability.FreqDist(words)

# Summary line: number of distinct tokens (samples) and total tokens (outcomes)
print(str(fdist))

# fdist.items() would list every token with its count; show only the 10 most common here
fdist.most_common(10)

<FreqDist with 76286 samples and 7633457 outcomes>


[(',', 470378),
 ('AND', 330387),
 ('.', 291105),
 ('THE', 228205),
 ('IN', 203706),
 ('TO', 156344),
 ('OF', 141939),
 ('ALL', 113085),
 ('MUST', 108290),
 ('ON', 88879)]

###### Cleaning the text to remove stop words and then counting again

In [10]:

stopwords = set(nltk.corpus.stopwords.words('english'))

# Single pass over the freshly tokenised text:
#   - keep purely alphabetic tokens only (drops punctuation and numbers),
#   - lowercase them so they can be compared with the stopword list,
#   - drop the stopwords.
words = [
    token.lower()
    for token in nltk.word_tokenize(all_violations)
    if token.isalpha() and token.lower() not in stopwords
]

# Recount on the cleaned tokens
fdist = nltk.probability.FreqDist(words)
print(str(fdist))

fdist.most_common(50)

<FreqDist with 19516 samples and 4183643 outcomes>


[('must', 108290),
 ('food', 85730),
 ('clean', 84224),
 ('instructed', 68999),
 ('area', 63342),
 ('prep', 56990),
 ('observed', 54384),
 ('sink', 52765),
 ('floor', 45613),
 ('storage', 43406),
 ('repair', 41818),
 ('violation', 39562),
 ('shall', 39047),
 ('provide', 37679),
 ('rear', 34545),
 ('maintain', 32925),
 ('remove', 32680),
 ('water', 30256),
 ('cooler', 29058),
 ('serious', 28495),
 ('door', 27402),
 ('equipment', 27295),
 ('walls', 26586),
 ('areas', 24935),
 ('inside', 23860),
 ('ice', 22128),
 ('citation', 22031),
 ('replace', 21853),
 ('pest', 21707),
 ('front', 21681),
 ('hand', 21052),
 ('floors', 21005),
 ('compartment', 20646),
 ('room', 19479),
 ('wall', 19227),
 ('found', 18704),
 ('control', 18359),
 ('premises', 17853),
 ('kitchen', 17783),
 ('manager', 17552),
 ('detail', 17456),
 ('machine', 17285),
 ('basement', 17227),
 ('hot', 17192),
 ('stored', 16654),
 ('service', 16652),
 ('ceiling', 16379),
 ('droppings', 16165),
 ('behind', 16105),
 ('grease', 15816)]

###### Stemming

In [11]:
# Instantiate the two stemmers that are compared in the following cells
porter, lancaster = nltk.PorterStemmer(), nltk.LancasterStemmer()

###### Porter

In [12]:
# Stem each distinct word only once and map the full token list through the lookup:
# the cleaned token list is far longer than its vocabulary, so this avoids
# re-stemming the same word over and over. The resulting list is identical.
porter_stem_lookup = {word: porter.stem(word) for word in set(words)}
porter_stemmed_words = [porter_stem_lookup[word] for word in words]

In [13]:
# Frequency distribution over the Porter stems
fdist = nltk.probability.FreqDist(porter_stemmed_words)

# Summary line first, then the 50 most frequent stems as the cell result
print(str(fdist))
fdist.most_common(50)

<FreqDist with 14488 samples and 4183643 outcomes>


[('must', 108293),
 ('clean', 106435),
 ('food', 100134),
 ('area', 88278),
 ('instruct', 69109),
 ('floor', 66850),
 ('sink', 62278),
 ('prep', 57258),
 ('observ', 54691),
 ('provid', 46054),
 ('wall', 45813),
 ('violat', 45081),
 ('repair', 43673),
 ('storag', 43466),
 ('cooler', 40940),
 ('maintain', 40534),
 ('shall', 39048),
 ('remov', 35785),
 ('door', 35563),
 ('rear', 34549),
 ('water', 30272),
 ('equip', 28803),
 ('seriou', 28503),
 ('store', 27393),
 ('sanit', 26549),
 ('insid', 23870),
 ('hand', 23796),
 ('room', 23625),
 ('manag', 23106),
 ('replac', 23019),
 ('ceil', 22570),
 ('ice', 22190),
 ('citat', 22136),
 ('use', 22078),
 ('pest', 22038),
 ('front', 21682),
 ('compart', 21673),
 ('shelv', 21671),
 ('machin', 21155),
 ('detail', 19197),
 ('found', 18707),
 ('cook', 18663),
 ('control', 18438),
 ('wash', 18282),
 ('premis', 18168),
 ('kitchen', 17866),
 ('light', 17829),
 ('servic', 17804),
 ('rodent', 17426),
 ('basement', 17274)]

###### Lancaster

In [14]:
# Stem each distinct word only once and map the full token list through the lookup:
# the cleaned token list is far longer than its vocabulary, so this avoids
# re-stemming the same word over and over. The resulting list is identical.
lancaster_stem_lookup = {word: lancaster.stem(word) for word in set(words)}
lancaster_stemmed_words = [lancaster_stem_lookup[word] for word in words]

In [15]:
# Frequency distribution over the Lancaster stems
fdist = nltk.probability.FreqDist(lancaster_stemmed_words)

# Summary line first, then the 50 most frequent stems as the cell result
print(str(fdist))
fdist.most_common(50)

<FreqDist with 12447 samples and 4183643 outcomes>


[('cle', 114439),
 ('must', 108295),
 ('food', 100134),
 ('stor', 70887),
 ('prep', 69154),
 ('instruct', 69137),
 ('flo', 67358),
 ('are', 63342),
 ('sink', 62280),
 ('observ', 54690),
 ('provid', 46079),
 ('wal', 45825),
 ('viol', 45122),
 ('repair', 43673),
 ('cool', 41471),
 ('maintain', 40545),
 ('shal', 39059),
 ('remov', 35786),
 ('door', 35565),
 ('rear', 34549),
 ('wat', 30309),
 ('equip', 28804),
 ('sery', 28500),
 ('sanit', 27655),
 ('found', 25889),
 ('area', 24937),
 ('bas', 24262),
 ('serv', 24146),
 ('insid', 23871),
 ('hand', 23802),
 ('room', 23627),
 ('man', 23593),
 ('replac', 23022),
 ('us', 22854),
 ('ceil', 22577),
 ('ic', 22192),
 ('cit', 22157),
 ('pest', 22041),
 ('front', 21682),
 ('compart', 21681),
 ('shelv', 21671),
 ('prop', 21528),
 ('machin', 21159),
 ('op', 19245),
 ('detail', 19197),
 ('cook', 18814),
 ('wash', 18735),
 ('control', 18438),
 ('prem', 18169),
 ('kitch', 17883)]

###### Lancaster stemming does not seem like a good idea here, as some of the resulting stems are difficult to interpret (e.g. "cle", "flo")

###### Lemmatization

In [16]:
# WordNet-based lemmatizer used in the next cell
wnl = nltk.stem.WordNetLemmatizer()

In [17]:
# Lemmatize each distinct word only once and map the full token list through the
# lookup: the cleaned token list is far longer than its vocabulary, so this avoids
# repeating the same lemmatizer call. The resulting list is identical.
lemma_lookup = {word: wnl.lemmatize(word) for word in set(words)}
lemmatized_words = [lemma_lookup[word] for word in words]

In [18]:
# Frequency distribution over the lemmatized words
fdist = nltk.probability.FreqDist(lemmatized_words)

# Summary line first, then the 50 most frequent lemmas as the cell result
print(str(fdist))
fdist.most_common(50)

<FreqDist with 17864 samples and 4183643 outcomes>


[('must', 108293),
 ('food', 100133),
 ('area', 88279),
 ('clean', 84229),
 ('instructed', 68999),
 ('floor', 66618),
 ('sink', 62278),
 ('prep', 56998),
 ('observed', 54384),
 ('wall', 45813),
 ('violation', 45077),
 ('storage', 43459),
 ('repair', 42047),
 ('cooler', 40940),
 ('shall', 39047),
 ('provide', 37679),
 ('door', 35562),
 ('rear', 34545),
 ('maintain', 32925),
 ('remove', 32694),
 ('water', 30269),
 ('equipment', 28592),
 ('serious', 28495),
 ('inside', 23868),
 ('hand', 23776),
 ('room', 23624),
 ('ceiling', 22569),
 ('citation', 22134),
 ('ice', 22128),
 ('pest', 22038),
 ('replace', 21853),
 ('front', 21682),
 ('compartment', 21660),
 ('machine', 21154),
 ('found', 18707),
 ('manager', 18384),
 ('control', 18368),
 ('premise', 18164),
 ('kitchen', 17866),
 ('detail', 17473),
 ('rodent', 17425),
 ('basement', 17274),
 ('hot', 17192),
 ('service', 17064),
 ('stored', 16654),
 ('droppings', 16165),
 ('behind', 16105),
 ('grease', 15823),
 ('interior', 15784),
 ('issued', 1

###### Lemmatization gives us a good overview of the common keywords that lead to failed inspections. All keywords are valid words from the English dictionary