In [96]:
#Computing Analytics Research--Topic Modeling
#Author: Nikki Kudamik
#Date: 2/22/24

import numpy as np
import pandas as pd
import re
import nltk #make sure to run: nltk.download('stopwords') in another cell or before running program
from nltk.corpus import stopwords
", ".join(stopwords.words('english'))
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer #make sure to run: nltk.download('wordnet'), nltk.download('averaged_perceptron_tagger')
from collections import Counter
import string

# Tally word frequencies over the stopword-free text. This has to happen
# before FREQWORDS is built from the tally further down.
cnt = Counter()
try:
    # `df` is not created anywhere above this point in the file, so it only
    # exists when an earlier notebook cell or run left it in scope.
    prior_texts = df["text_wo_stop"].values
except (NameError, KeyError):
    # No earlier pass is available: start from an empty tally (and therefore
    # an empty FREQWORDS) instead of aborting before any function is defined.
    prior_texts = []
for text in prior_texts:
    cnt.update(text.split())

lemmatizer = WordNetLemmatizer()
# Maps the first character of a POS tag (as used in lemmatize_words) to the
# matching WordNet part-of-speech constant.
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

PUNCT_TO_REMOVE = string.punctuation
STOPWORDS = set(stopwords.words('english'))
# The ten most frequent words in the tally above.
FREQWORDS = {word for word, _ in cnt.most_common(10)}

# Switch off pandas' chained-assignment (SettingWithCopy) warning.
# NOTE(review): presumably needed for the column assignment on a sliced frame
# in main() -- confirm it is still required.
pd.options.mode.chained_assignment = None

#FUNCTION DEFINITIONS
def remove_punctuation(text):
    """Return ``text`` with every character in PUNCT_TO_REMOVE deleted."""
    # A translation table whose third argument lists the characters to drop.
    deletion_table = str.maketrans('', '', PUNCT_TO_REMOVE)
    return text.translate(deletion_table)
def remove_stopwords(text):
    """Drop whitespace-separated tokens that appear in STOPWORDS.

    The input is converted with ``str()`` first, the comparison is an exact
    (case-sensitive) match, and the kept tokens are rejoined with single
    spaces.
    """
    kept = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        kept.append(token)
    return " ".join(kept)
def remove_freqwords(text):
    """Drop whitespace-separated tokens that appear in FREQWORDS.

    The input is converted with ``str()`` first; the surviving tokens are
    rejoined with single spaces.
    """
    tokens = str(text).split()
    return " ".join(filter(lambda token: token not in FREQWORDS, tokens))
def stem_words(text):
    """Reduce every whitespace-separated word in ``text`` to its stem.

    The input is converted with ``str()`` first and the stems are rejoined
    with single spaces.
    """
    # The file never defines a module-level `stemmer`, so build one here;
    # the import is local to keep this function self-contained.
    from nltk.stem import PorterStemmer

    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(word) for word in str(text).split())
def remove_smallwords(text, max_len=3):
    """Blank out short words in ``text``.

    Every standalone run of 1 to ``max_len`` word characters (letters, digits
    or underscore) is replaced by the empty string. With the default of 3
    this removes words of up to and including three characters. The
    whitespace around a removed word is left in place, so the result can
    contain runs of spaces.

    Args:
        text: the string to filter.
        max_len: longest word length that is still removed; values below 1
            remove nothing.

    Returns:
        ``text`` with the short words deleted.
    """
    if max_len < 1:
        # A {1,0} quantifier is not a valid pattern; nothing qualifies anyway.
        return text
    return re.sub(r'\b\w{1,%d}\b' % max_len, '', text)
def lemmatize_words(text):
    """Lemmatize each whitespace-separated word of ``text``.

    Words are POS-tagged with ``nltk.pos_tag``; the first character of each
    tag is looked up in ``wordnet_map`` (falling back to noun) to pick the
    part of speech handed to the lemmatizer. The lemmas are rejoined with
    single spaces.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wn_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wn_pos))
    return " ".join(lemmas)

def clean_reviews(reviews):
    """Run the full text-cleaning pipeline over a Series of review strings.

    Steps, in order: strip punctuation, drop stopwords, drop the ten most
    frequent remaining words, drop short words, lemmatize.

    Args:
        reviews: pandas Series of review strings.

    Returns:
        A pandas Series named ``text_lemmatized`` sharing the index of
        ``reviews``.

    Side effects:
        Rebinds the module-level ``FREQWORDS`` to the ten most frequent words
        of this batch, so ``remove_freqwords`` filters against the data that
        is actually being cleaned.
    """
    global FREQWORDS

    # Keep the intermediate columns in a frame owned by this call; the
    # function previously wrote to a global `df` that this file never
    # defines at module level.
    stages = pd.DataFrame(index=reviews.index)

    stages["text_wo_punct"] = reviews.apply(remove_punctuation)
    stages["text_wo_stop"] = stages["text_wo_punct"].apply(remove_stopwords)

    # Frequent words can only be counted once stopwords are gone, so derive
    # them here rather than from whatever an earlier run left behind.
    word_counts = Counter(
        word for text in stages["text_wo_stop"] for word in text.split()
    )
    FREQWORDS = {word for word, _ in word_counts.most_common(10)}

    stages["text_wo_freq"] = stages["text_wo_stop"].apply(remove_freqwords)
    stages["text_wo_smallwords"] = stages["text_wo_freq"].apply(remove_smallwords)
    stages["text_lemmatized"] = stages["text_wo_smallwords"].apply(lemmatize_words)

    return stages["text_lemmatized"]

#MAIN PROGRAM
def main(csv_path="dataset.csv", column="review"):
    """Load reviews from a CSV file, clean them and print the result.

    Args:
        csv_path: CSV file to read. A spreadsheet kept as .xlsx has to be
            exported to .csv by hand first.
        column: name of the column that holds the review text.
    """
    full_df = pd.read_csv(csv_path)

    # Take an explicit copy so the assignment below modifies an independent
    # frame instead of a slice of full_df.
    df = full_df[[column]].copy()
    # Force every entry to str so the cleaning helpers can call string
    # methods on it.
    df[column] = df[column].astype(str)

    # clean_reviews returns a pandas Series; append .to_numpy() to the call
    # if a NumPy array is needed instead.
    processed_reviews = clean_reviews(df[column])

    print(processed_reviews)
    
# Run the pipeline only when this code is executed directly, not when it is
# imported as a module.
if __name__ == "__main__":
    main()
    
    

0      use quite time also member decide would good w...
1      able currently cache sadly keep charge tell do...
2      Great little buggy moment load full zoom trail...
3      need cash phone please want cash away back cas...
4      Theres literally three entire city big one cou...
                             ...                        
195    okay prefer different find higher difficult ra...
196    Caches free website consider pay original didn...
197    Used enjoy occasionally however almost need ac...
198    really engage still need careful go think need...
199    open every week user interaction cause whateve...
Name: text_lemmatized, Length: 200, dtype: object


In [81]:
# One-time setup: fetch the 'averaged_perceptron_tagger' NLTK data package
# (see the note on the WordNetLemmatizer import in the main cell).
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nikki\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True