In [2]:
import pandas as pd
import numpy as np

# natural language tool kit
import nltk
nltk.download('wordnet')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

import regex as regex

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mriva\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mriva\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Import data from csv
df = pd.read_csv('../datasets/cleaned/combined_text.csv')

# drop duplicates
df.drop_duplicates(subset=['combined_text'], keep='first', ignore_index=True, inplace=True)

# encode category
category_list = list(df.category.unique())
df['label'] = df['category'].map(lambda x:category_list.index(x))

df.drop(columns=['asin'], inplace=True)
df.rename(columns={'combined_text': 'text'}, inplace=True)

In [4]:
df.head()

Unnamed: 0,text,category,label
0,I have a 9 year old Badger 1 that needs replac...,appliances,0
1,model number This may help InSinkErator Model ...,appliances,0
2,can I replace Badger 1 1/3 with a Badger 5 1/2...,appliances,0
3,Does this come with power cord and dishwasher ...,appliances,0
4,loud noise inside when turned on. sounds like ...,appliances,0


In [6]:
def get_stopwords(file='../datasets/stopwords/stopwords.txt'):
    '''
    Description: Retreives stopwords from text file
    ---
    params:

    file:str       - file path
    ---
    output:
    
    set {str} of stopwords:
    {'a', 'set', 'of', 'words', ...}
    ---
    usage:
    >> list(get_stopwords())
    ['a', 'set', 'of', 'words', ...]
    '''
    with open(file,'r') as words:
        stopwords = [word.strip() for word in words]
        return set(stopwords)

In [7]:
def preprocess_text(txt, analyzer = 'stem'):
    '''
    Description: Tokenizes alphabets only. then Choose either to stem or lemmatize.
    ---
    params:

    txt:str       - text to process

    analyzer:str  - 'stem' = PorterStemmer (default)
                  - 'lemm' = WordNetLemmatizer
    ---
    output:
    
    processed text: str 
    ---
    usage:
    >> text = 'I have a 9 year old Badger 1 that needs'
    >> preprocess_text(text, analyzer='stem'))
    'year old badger need'
    # or however the the stemmed output should look like
    '''
    tokenizer = RegexpTokenizer(r'[A-Za-z]+', gaps=False)
    tokens = tokenizer.tokenize(str(txt).lower())
    
    #remove stop words
    all_stopwords = stopwords.words('english')
    all_stopwords.extend(list(get_stopwords()))
    tokens = [w for w in tokens if w not in all_stopwords]
    
    # analyze using lemmatizer
    if analyzer == 'lemm':
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(s) for s in tokens]
        
    # analyze using stem
    elif analyzer == 'stem':
        p_stemmer = PorterStemmer()
        tokens = [p_stemmer.stem(s) for s in tokens]
    
    
    text = " ".join(tokens)
    return(text)

In [8]:
df['stem_text'] = df['text'].apply(lambda text: preprocess_text(text, analyzer='stem'))

In [10]:
%%time
df['lemm_text'] = df['text'].apply(lambda text: preprocess_text(text, analyzer='lemm'))

Wall time: 7min 12s


In [11]:
df.head()

Unnamed: 0,text,category,label,stem_text,lemm_text
0,I have a 9 year old Badger 1 that needs replac...,appliances,0,year old badger need replac badger instal like...,year old badger need replacing badger install ...
1,model number This may help InSinkErator Model ...,appliances,0,model number may help insinker model badger ba...,model number may help insinkerator model badge...
2,can I replace Badger 1 1/3 with a Badger 5 1/2...,appliances,0,replac badger badger connect plumb connect var...,replace badger badger connection plumbing conn...
3,Does this come with power cord and dishwasher ...,appliances,0,come power cord dishwash hook come power cord ...,come power cord dishwasher hook come power cor...
4,loud noise inside when turned on. sounds like ...,appliances,0,loud nois insid turn sound like blade loos che...,loud noise inside turned sound like blade loos...


In [13]:
filename = f"../datasets/cleaned/combined_stemmed_text_with_category_label.csv"
df.to_csv(filename, index=False)