In [32]:
import re
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

In [43]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/lynx/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/lynx/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /home/lynx/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /home/lynx/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package stopwords to /home/lynx/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
df_path = "../data/raw/complaints.csv"
df = pd.read_csv(df_path)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5075 entries, 0 to 5074
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5075 non-null   int64 
 1   complaint   5075 non-null   object
 2   category    5075 non-null   object
 3   emotion     5075 non-null   object
 4   department  5075 non-null   object
dtypes: int64(1), object(4)
memory usage: 198.4+ KB


Remove Duplicates

In [4]:
df = df.drop(index=df[df[["complaint", "emotion", "category"]].duplicated()].index).reset_index().drop(columns=["index"])

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,complaint,category,emotion,department
0,0,A dog chased and attacked a child in our lane.,safety,worry,Municipality
1,1,A stray cattle blocked the road and caused an ...,safety,worry,Municipality
2,2,A stray dog attacked a child near beside the p...,safety,anger,Police
3,3,A stray dog attacked a child near beside the p...,safety,disgust,Police
4,4,A stray dog attacked a child near beside the p...,safety,disgust,Police


## Remove Unwanted columns

In [6]:
df.columns

Index(['Unnamed: 0', 'complaint', 'category', 'emotion', 'department'], dtype='object')

In [7]:
df = df.drop(columns=["Unnamed: 0"])
df.head()

Unnamed: 0,complaint,category,emotion,department
0,A dog chased and attacked a child in our lane.,safety,worry,Municipality
1,A stray cattle blocked the road and caused an ...,safety,worry,Municipality
2,A stray dog attacked a child near beside the p...,safety,anger,Police
3,A stray dog attacked a child near beside the p...,safety,disgust,Police
4,A stray dog attacked a child near beside the p...,safety,disgust,Police


#### Departments

In [8]:
df["department"].value_counts()

department
Police               993
Food Safety Dept.    961
Municipality         931
Health Dept.         931
Sanitation Dept.     904
Name: count, dtype: int64

#### Emotions

In [9]:
df["emotion"].value_counts()

emotion
disgust        607
concern        605
anger          603
worry          597
neutral        589
sadness        589
urgency        568
frustration    562
Name: count, dtype: int64

#### Category

In [10]:
df["category"].value_counts()

category
safety     998
food       961
illness    931
water      926
waste      904
Name: count, dtype: int64

#### Text Preprocessing

Lowercasing & Remove extra spaces 

In [11]:
df["complaint"].sample(10)

542     Child developed rashes after drinking municipa...
1728    I suspect adulteration in the cooking oil used...
3979    Suspicious activities noticed near near the te...
263     Biscuit packet bought from milkman in near the...
2770    Pipeline leakage near in the municipal ward, M...
3180    Restaurant in near the temple, Mysuru served s...
3379    Sewage water overflowing onto the road in near...
1832    Kitchen waste attracting stray animals in behi...
4567    Vegetables in the near the market market of Vi...
2294    No water supply since morning in outside the s...
Name: complaint, dtype: object

In [14]:
clean_fn = lambda x: re.sub(r'\s+', ' ', x).strip().lower()
df["complaint"] = df["complaint"].apply(clean_fn)

In [15]:
df["complaint"].sample(10)

3772    students fell ill after eating from the fruit ...
4535    vegetables in the beside the park market of be...
3454    stagnant water near next to the metro station ...
3103    respiratory issues increasing due to pollution...
3327    sewage leak on the road in on the main road, c...
3555    stray dogs attacking pedestrians in opposite t...
382     borewell dried up; families in opposite the ho...
3404    sewage water overflowing onto the road in oppo...
1480    garbage truck skipped collection for a week in...
4515    uncollected kitchen waste is attracting stray ...
Name: complaint, dtype: object

#### Remove punctuation

In [16]:
remove_punt = lambda x: re.sub(r'[^a-zA-Z\s]', '', x)
df["complaint"] = df["complaint"].apply(remove_punt)

In [17]:
df["complaint"].sample(5)

3067    respiratory issues increasing due to pollution...
4423    trash burning openly in opposite the hospital ...
892     dumpyard near in the municipal ward releases f...
4003    tap water in at the bus stop of patna has been...
594     construction debris dumped illegally near in o...
Name: complaint, dtype: object

#### Tokenizing

In [19]:
tokenize = lambda x: word_tokenize(x)

tokenized_complaints = df["complaint"].apply(tokenize).to_list()
tokenized_complaints

[['a', 'dog', 'chased', 'and', 'attacked', 'a', 'child', 'in', 'our', 'lane'],
 ['a',
  'stray',
  'cattle',
  'blocked',
  'the',
  'road',
  'and',
  'caused',
  'an',
  'accident'],
 ['a',
  'stray',
  'dog',
  'attacked',
  'a',
  'child',
  'near',
  'beside',
  'the',
  'park',
  'nagpur'],
 ['a',
  'stray',
  'dog',
  'attacked',
  'a',
  'child',
  'near',
  'beside',
  'the',
  'park',
  'nagpur'],
 ['a',
  'stray',
  'dog',
  'attacked',
  'a',
  'child',
  'near',
  'beside',
  'the',
  'park',
  'thiruvananthapuram'],
 ['a',
  'stray',
  'dog',
  'attacked',
  'a',
  'child',
  'near',
  'by',
  'the',
  'lake',
  'bhopal'],
 ['a',
  'stray',
  'dog',
  'attacked',
  'a',
  'child',
  'near',
  'in',
  'our',
  'colony',
  'bhopal'],
 ['a',
  'stray',
  'dog',
  'attacked',
  'a',
  'child',
  'near',
  'in',
  'our',
  'colony',
  'kolkata'],
 ['a',
  'stray',
  'dog',
  'attacked',
  'a',
  'child',
  'near',
  'in',
  'our',
  'colony',
  'mumbai'],
 ['a',
  'stray',
  '

#### Lemmatization 

In [47]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [44]:
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN 

In [48]:
def preprocess(tokens):
    # POS tagging
    pos_tags = nltk.pos_tag(tokens)

    # lemmatize with POS
    lemmas = [
        lemmatizer.lemmatize(word, get_wordnet_pos(pos))
        for word, pos in pos_tags
    ]

    return " ".join(lemmas)

In [49]:
preprocess(tokenized_complaints[0])

'a dog chase and attack a child in our lane'

In [51]:
tokenized_complaints[0]  # seems cool

['a', 'dog', 'chased', 'and', 'attacked', 'a', 'child', 'in', 'our', 'lane']

In [55]:
tokenized_complaints

[['a', 'dog', 'chased', 'and', 'attacked', 'a', 'child', 'in', 'our', 'lane'],
 ['a',
  'stray',
  'cattle',
  'blocked',
  'the',
  'road',
  'and',
  'caused',
  'an',
  'accident'],
 ['a',
  'stray',
  'dog',
  'attacked',
  'a',
  'child',
  'near',
  'beside',
  'the',
  'park',
  'nagpur'],
 ['a',
  'stray',
  'dog',
  'attacked',
  'a',
  'child',
  'near',
  'beside',
  'the',
  'park',
  'nagpur'],
 ['a',
  'stray',
  'dog',
  'attacked',
  'a',
  'child',
  'near',
  'beside',
  'the',
  'park',
  'thiruvananthapuram'],
 ['a',
  'stray',
  'dog',
  'attacked',
  'a',
  'child',
  'near',
  'by',
  'the',
  'lake',
  'bhopal'],
 ['a',
  'stray',
  'dog',
  'attacked',
  'a',
  'child',
  'near',
  'in',
  'our',
  'colony',
  'bhopal'],
 ['a',
  'stray',
  'dog',
  'attacked',
  'a',
  'child',
  'near',
  'in',
  'our',
  'colony',
  'kolkata'],
 ['a',
  'stray',
  'dog',
  'attacked',
  'a',
  'child',
  'near',
  'in',
  'our',
  'colony',
  'mumbai'],
 ['a',
  'stray',
  '

In [54]:
list(map(preprocess, tokenized_complaints))

['a dog chase and attack a child in our lane',
 'a stray cattle block the road and cause an accident',
 'a stray dog attack a child near beside the park nagpur',
 'a stray dog attack a child near beside the park nagpur',
 'a stray dog attack a child near beside the park thiruvananthapuram',
 'a stray dog attack a child near by the lake bhopal',
 'a stray dog attack a child near in our colony bhopal',
 'a stray dog attack a child near in our colony kolkata',
 'a stray dog attack a child near in our colony mumbai',
 'a stray dog attack a child near in our colony visakhapatnam',
 'a stray dog attack a child near in the municipal ward bhopal',
 'a stray dog attack a child near in the municipal ward mumbai',
 'a stray dog attack a child near in the municipal ward patna',
 'a stray dog attack a child near in the municipal ward trichy',
 'a stray dog attack a child near in the municipal ward trichy',
 'a stray dog attack a child near inside the hostel kochi',
 'a stray dog attack a child near

Vectorization / Embeddings (TF-IDF, Word2Vec, BERT)

vectorixation and stop word removing and negation handling are in each model training

Save dataset as emotion base & category base

In [None]:
(
    df[["complaint", "emotion"]]
        .sort_values(ascending=True, by="emotion", ignore_index=True)
        .to_csv("../data/cleaned/complaints_by_emotion.csv", index=False)
)

In [None]:
(
    df[["complaint", "category"]]
        .sort_values(ascending=True, by="category", ignore_index=True)
        .to_csv("../data/cleaned/complaints_by_category.csv", index=False)
)