In [44]:
import sys
import importlib
import pyLDAvis.gensim
import pyLDAvis

sys.path.append('./codes')
from codes import LDA 
from codes.LDA import classification
from codes.LDA import add_top_words_to_df
from codes.LDA import lda_and_svm_pipeline
from codes.LDA import LDA_training
from codes.LDA import display_visuals_LDA
from codes.LDA import preprocess_text
from codes.LDA import create_review_topic_matrix
import importlib
importlib.reload(LDA)
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [8]:
import pandas as pd
data = pd.read_csv('../amazon_data/Amazon_Fashion.csv')
data = data[~data["text"].isna()]
data = data[data['rating'] <= 2]
data = data[data['text'].apply(lambda x: len(str(x).split()) > 10)]
data = data.sample(n=10000, random_state=42)
data.shape

(10000, 6)

## Making mistakes on purpose

Missing text col

In [16]:
import pandas as pd

# Small DataFrame without 'text' column
df_no_text = pd.DataFrame({
    'title': ['Post 1', 'Post 2'],
    'content': ['This is the first post', 'This is the second post']
})

# Test the function
try:
    preprocess_text(df_no_text)
except Exception as e:
    print(f"Expected error: {e}")


Unexpected exception in preprocessing text: "Column 'text' does not exist in the data."
Expected error: "Column 'text' does not exist in the data."


In [56]:
model, texts_bow, dictionary = LDA_training(data, True, 8, 100, 1, 100)

Tokenization starting ---
Tokenization done
Numbers removed
Two letter words removed
Written-out numbers removed
Verbs removed SMALL
Lematization done
Stopwords removed
Choosed words removed
Bigrams done
Preprocessing done
--- Model starting ---
Średnia koherencja tematów: -2.9712.
Topic 0: material, cheap, product, quality, would, even, money, good, thin, top
Topic 1: day, return, super, nice, strap, month, apart, part, back, well
Topic 2: smaller, gift, design, new, inside, red, wrist, pink, nice, without
Topic 3: small, way, short, even, great, little, really, would, long, poor
Topic 4: bad, foot, together, within, close, someone, else, type, instead, line
Topic 5: disappoint, color, would, cute, item, order, week, extremely, really, star
Topic 6: white, ring, chain, mask, unfortunately, happy, silver, actual, ship, husband
Topic 7: size, fit, time, large, first, small, band, still, tight, medium


In [34]:
df_missing_text = pd.DataFrame({'title': ['Post 1', 'Post 2']})
preprocess_text(df_missing_text)


Unexpected exception in preprocessing text: "Column 'text' does not exist in data."


KeyError: "Column 'text' does not exist in data."

In [35]:
df_non_string_text = pd.DataFrame({'text': [123, 456]})
preprocess_text(df_non_string_text)


Unexpected exception in preprocessing text: Column 'text' has to be string-type.


TypeError: Column 'text' has to be string-type.

In [42]:
df_empty_text = pd.DataFrame({'text': ['','dssd', '']})
preprocess_text(df_empty_text)


Unexpected exception in preprocessing text: Column 'text' has only NULL or '' values


ValueError: Column 'text' has only NULL or '' values

## LDA errors

In [45]:
import pandas as pd

# DataFrame without 'text' column
df_missing_text_column = pd.DataFrame({
    'title': ['Title 1', 'Title 2'],
    'content': ['Content 1', 'Content 2']
})

# This will raise KeyError since there's no 'text' column
try:
    model, texts_bow, dictionary = LDA_training(df_missing_text_column)
except KeyError as e:
    print(e)


"Column 'text' does not exist in data."


In [46]:
# DataFrame with 'text' column
df_valid = pd.DataFrame({
    'text': ['This is a sample document.', 'Another document here.']
})

# Invalid type for 'with_certain_words_removal' (should be a boolean)
try:
    model, texts_bow, dictionary = LDA_training(df_valid, with_certain_words_removal='yes')
except TypeError as e:
    print(e)


Argument 'with_certain_words_removal' must be a boolean.


In [47]:
# Invalid type for 'n_topics' (not a positive integer)
try:
    model, texts_bow, dictionary = LDA_training(df_valid, n_topics=-1)
except ValueError as e:
    print(e)


Argument 'n_topics' must be a positive integer.


In [48]:
# Invalid type for 'chunksize' (not a positive integer)
try:
    model, texts_bow, dictionary = LDA_training(df_valid, chunksize=-10)
except ValueError as e:
    print(e)


Argument 'chunksize' must be a positive integer.


In [49]:
# Invalid type for 'passes' (not a positive integer)
try:
    model, texts_bow, dictionary = LDA_training(df_valid, passes=0)
except ValueError as e:
    print(e)


Argument 'passes' must be a positive integer.


In [50]:
# Invalid type for 'iterations' (not a positive integer)
try:
    model, texts_bow, dictionary = LDA_training(df_valid, iterations=-100)
except ValueError as e:
    print(e)


Argument 'iterations' must be a positive integer.


In [51]:
# Invalid type for 'update_every' (not a positive integer)
try:
    model, texts_bow, dictionary = LDA_training(df_valid, update_every=0)
except ValueError as e:
    print(e)


Argument 'update_every' must be a positive integer.


In [52]:
# Invalid type for 'eval_every' (not an integer or float)
try:
    model, texts_bow, dictionary = LDA_training(df_valid, eval_every='not_a_number')
except TypeError as e:
    print(e)


Argument 'eval_every' must be an integer or a float.


In [57]:
# Invalid format for 'texts_bow' (not a list of lists of tuples)
invalid_texts_bow = [['word1', 'word2']]  # This is not the correct format
try:
    model, texts_bow, dictionary = LDA_training(df_valid, model, invalid_texts_bow, dictionary)
except TypeError as e:
    print(e)


Argument 'with_certain_words_removal' must be a boolean.


In [58]:
# Invalid 'dictionary' (not a gensim Dictionary)
invalid_dictionary = {'word1': 0, 'word2': 1}  # This is a regular dictionary, not gensim Dictionary
try:
    model, texts_bow, dictionary = LDA_training(df_valid, model, texts_bow, dictionary=invalid_dictionary)
except TypeError as e:
    print(e)


Argument 'with_certain_words_removal' must be a boolean.


In [59]:
# Invalid 'id2word' (not a dictionary)
invalid_id2word = ['word1', 'word2']  # This should be a dictionary, not a list
try:
    model, texts_bow, dictionary = LDA_training(df_valid, model, texts_bow, dictionary,id2word=invalid_id2word, )
except TypeError as e:
    print(e)


Argument 'with_certain_words_removal' must be a boolean.
