# Latent Dirichlet Allocation

'Latent Dirichlet Allocation (LDA) is a type of probabilistic topic model that is widely used in natural language processing (NLP) and machine learning to discover abstract topics within a collection of documents. The fundamental idea behind LDA is that documents are represented as random mixtures over latent topics, where each topic is characterized by a distribution over words.'

Critically, this technique will look at the type and quantity of words in each review, although it won't take into account the context of the surrounding words.

Steps
1. Load cleaned data
2. Pre-process and tokenize text
3. Run a grid search to maximize the coherence of the LDA output. Although coherence was maximized, in practice only two topics were dominant in 99.99% of the texts. More work is needed on tuning the hyperparameters of this model.
4. Run model
5. Map dominant topics to each row in the data set. This will now be a feature to predict if the passenger will recommend the flight.

In [None]:
# Import Libraries
import pandas as pd
from tqdm import tqdm 

import gensim, logging, warnings
from gensim.utils import simple_preprocess
from gensim import corpora
from gensim.models import TfidfModel
import gensim.models.ldamodel

from collections import Counter
import re

import nltk
from nltk.util import bigrams, trigrams
from nltk import download
from nltk.corpus import stopwords
nltk.download('stopwords')

import spacy
from spacy.lang.en import English

In [None]:
# Read the cleaned airline reviews from disk into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='/Users/paulhershaw/brainstation_course/airplane_project/data/airline_reviews_cleaned.csv'
)

In [None]:
# Tokenize the review text of every row; by default the review is read from column 2
def convert(sentences, text_index=2):
    """Yield one list of tokens per row of ``sentences``.

    Args:
        sentences: Iterable of rows (for example ``df.values.tolist()``);
            each row must be indexable by ``text_index``.
        text_index: Position of the review text inside a row. Defaults to 2,
            the column position this notebook previously hard-coded.

    Yields:
        The token list returned by ``gensim.utils.simple_preprocess`` for the
        row's text, with accent marks removed (``deacc=True``).
    """
    for sentence in tqdm(sentences):
        # str() keeps non-string cells (e.g. a missing review) from raising
        yield gensim.utils.simple_preprocess(str(sentence[text_index]), deacc=True)

In [None]:
# Collate the values of every DataFrame row into a plain Python list
# (one inner list per row, columns in DataFrame order)
data = df.values.tolist() 

In [None]:
# Run every row through convert() and materialize the generator as a list of token lists
data_words = [*convert(data)]


In [None]:
# Build per-word frequency counts across all reviews
# Fetch the NLTK 'punkt' data
download('punkt')

# data_words holds one token list per review; merge them into one flat list
data_words_flattened = []
for sentence in data_words:
    data_words_flattened.extend(sentence)

# Tally how often each word occurs
element_counts = Counter(data_words_flattened)

# Two-column table (Word, Count) built from the tallies
element_counts_df = pd.DataFrame(list(element_counts.items()), columns=['Word', 'Count'])

# True when at least one word occurs more than once
has_duplicates = max(element_counts.values(), default=0) > 1



In [None]:
# Write the word-count table to CSV, leaving out the DataFrame index
element_counts_df.to_csv(
    path_or_buf='/Users/paulhershaw/brainstation_course/airplane_project/data/element_counts.csv',
    index=False,
)


In [None]:
# Make sure the NLTK stopword corpus is available, then load the English list
nltk.download('stopwords')
# `stopwords` is the corpus reader imported from nltk.corpus in the import cell;
# the previously referenced name `nltk_stopwords` was never defined.
stop_words = set(stopwords.words('english'))  # Use a set for faster lookup

In [None]:
# Define functions for stopwords and lemmatization


def process_words(texts, stop_words=stop_words, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Remove stopwords from each text, then lemmatize what is left with spaCy.

    Args:
        texts: Iterable of documents. Each document is passed through
            ``str()`` and ``simple_preprocess`` before stopword filtering.
        stop_words: Collection of words to drop. Defaults to the module-level
            ``stop_words`` as it exists when this function is defined.
        allowed_postags: spaCy coarse POS tags to keep; tokens with any other
            tag are discarded. Any iterable of tag strings is accepted.

    Returns:
        A list with one entry per input document, each entry being the list
        of lemmas whose POS tag is in ``allowed_postags``.
    """
    # Initialize spaCy 'en' model; parser and NER are not needed for lemmas/POS tags
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    allowed = frozenset(allowed_postags)

    # Remove stopwords
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in tqdm(texts, desc='Removing stopwords')]

    # Lemmatize; nlp.pipe processes the documents in batches and yields them in input order
    texts_out = []
    joined = (" ".join(sent) for sent in texts)
    for doc in tqdm(nlp.pipe(joined), total=len(texts), desc='Lemmatization'):
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed])
    return texts_out

In [None]:
# Strip stopwords and lemmatize every review, giving one token list per review
data_ready_single_words = process_words(texts=data_words)

In [None]:
# Build the gensim dictionary (token <-> integer id mapping) from the processed reviews
lexicon_single_words = corpora.Dictionary(documents=data_ready_single_words)

In [None]:
# Persist the dictionary so it can be reloaded without rebuilding it
lexicon_single_words.save('lexicon_single_words.dict')

# Put the processed token lists into a DataFrame (one row per review)
data_ready_single_words_df = pd.DataFrame(data=data_ready_single_words)

# Write the tokens to CSV, leaving out the DataFrame index
data_ready_single_words_df.to_csv(path_or_buf='data_ready_single_words.csv', index=False)

### I saved these files, as running this step took time, and it was faster to load from a previously saved file. 

In [None]:
# Use Grid Search to find the best LDA model by coherence score
# Candidate values for the number of topics and for the document-topic prior (alpha)
num_topics_range = [6, 12, 24]
alpha_range = [0.001, 0.01, 0.1, 1, 'symmetric', 'asymmetric']

# Placeholder for storing the results
grid_search_results = []

# Function to train LDA model and compute coherence for a given parameter combination
def train_lda_and_compute_coherence(num_topics, alpha):
    """Train one LDA model and score it with the 'c_v' coherence measure.

    Reads the module-level ``corpus``, ``lexicon_single_words`` and
    ``data_ready_single_words``; these must exist before the function is called.

    Args:
        num_topics: Number of topics for the LDA model.
        alpha: Value passed as the model's ``alpha`` argument (a number or one
            of the strings used in ``alpha_range``).

    Returns:
        A dict with the keys ``'num_topics'``, ``'alpha'`` and ``'coherence'``.
    """
    # Imported locally because the notebook's import cell does not bring
    # CoherenceModel into scope.
    from gensim.models import CoherenceModel

    # Fully qualified name, matching how the final model is constructed later in the file
    lda_model = gensim.models.ldamodel.LdaModel(
        corpus=corpus,
        id2word=lexicon_single_words,
        num_topics=num_topics,
        random_state=42,  # fixed seed so runs are comparable
        update_every=1,
        passes=10,
        alpha=alpha,
        iterations=100,
        per_word_topics=True
    )

    coherence_model_lda = CoherenceModel(model=lda_model, texts=data_ready_single_words, dictionary=lexicon_single_words, coherence='c_v')
    coherence_lda = coherence_model_lda.get_coherence()

    return {
        'num_topics': num_topics,
        'alpha': alpha,
        'coherence': coherence_lda
    }

# Run the grid search sequentially, printing a simple progress counter
num_cores = 4  # NOTE: not used below; no parallel backend is wired up
total_iterations = len(num_topics_range) * len(alpha_range)
iteration_count = 0

# train_lda_and_compute_coherence reads the global `corpus`. Build it here with the
# same TF-IDF recipe the final-model cell uses, so this cell also works when the
# notebook is run from top to bottom.
tfidf_single_word = TfidfModel(dictionary=lexicon_single_words, normalize=True)
corpus = [tfidf_single_word[lexicon_single_words.doc2bow(text)] for text in data_ready_single_words]

for num_topics in num_topics_range:
    for alpha in alpha_range:
        iteration_count += 1
        print(f'Progress: {iteration_count}/{total_iterations}', end='\r')  # progress counter

        result = train_lda_and_compute_coherence(num_topics, alpha)
        grid_search_results.append(result)

# Pick the parameter combination with the highest coherence score
best_result = max(grid_search_results, key=lambda result: result['coherence'])

print("\nBest Model's Params:", best_result['num_topics'], 'Topics and Alpha:', best_result['alpha'])
print("Best Model's Coherence Score:", best_result['coherence'])


In [None]:
# Train the final LDA model with the hyperparameters chosen from the grid search
tfidf_single_word = TfidfModel(dictionary=lexicon_single_words, normalize=True)

# Corpus: the TF-IDF weighted bag-of-words vector of every processed review
corpus = []
for tokens in tqdm(data_ready_single_words, desc='Corpus'):
    bag_of_words = lexicon_single_words.doc2bow(tokens)
    corpus.append(tfidf_single_word[bag_of_words])

# Fit the model
lda_model_single_word = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=lexicon_single_words,
    num_topics=24,
    alpha='asymmetric',
    passes=5,
    iterations=100,
    update_every=1,
    per_word_topics=True,
    random_state=42,
)



## LDA Model
Based on the results from the Grid Search, I assigned these hyperparameters.

Unfortunately, the result is still not ideal. There are two dominant topics that loosely represent negative and positive reviews.

The other topics are simply not dominant.

More work is needed. 

In [None]:
# Get dominant topic for each review
dominant_topics = []
keywords = []
# topic id -> comma-separated keywords, so show_topic runs once per topic instead of once per review
topic_keyword_cache = {}
for row_list in lda_model_single_word[corpus]:
    # When per_word_topics is set, the topic distribution is the first element of each result
    row = row_list[0] if lda_model_single_word.per_word_topics else row_list
    if not row:
        # No topic was reported for this review: record a placeholder so both lists
        # keep exactly one entry per review and stay aligned with the DataFrame rows.
        dominant_topics.append(-1)
        keywords.append("")
        continue
    # The dominant topic is the one with the largest contribution (weight);
    # max() returns the first such entry, the same one a stable descending sort puts first
    topic_num, _ = max(row, key=lambda pair: pair[1])
    topic_num = int(topic_num)
    if topic_num not in topic_keyword_cache:
        wp = lda_model_single_word.show_topic(topic_num)
        topic_keyword_cache[topic_num] = ", ".join(word for word, _ in wp)
    dominant_topics.append(topic_num)
    keywords.append(topic_keyword_cache[topic_num])




In [None]:
# Store each review's dominant topic id as a new DataFrame column
df['Dominant_Topic'] = dominant_topics

# Show the first five values of the new column
print(df['Dominant_Topic'].head(n=5))

In [None]:
# Display the Dominant_Topic column as the cell output; nothing is created or copied here
df['Dominant_Topic'] 

In [None]:
# Note: almost all the rows land in topic 1 or 0. This is a flaw with the LDA model and needs further work.
# Count how many reviews have each value of Dominant_Topic
topic_counts = df['Dominant_Topic'].value_counts()

# Print the counts
print(topic_counts)

In [None]:
# Rows whose dominant topic is 8 or 3 are reassigned to topic 0, in place
minor_topic_mask = df['Dominant_Topic'].isin([8, 3])
df.loc[minor_topic_mask, 'Dominant_Topic'] = 0


In [None]:
# Select the Dominant_Topic column on its own (the selection keeps the DataFrame's index)
LDA_Topics = df['Dominant_Topic']

In [None]:
# Write the dominant topics to CSV; the index is not written (index=False)
LDA_Topics.to_csv(
    path_or_buf='/Users/paulhershaw/brainstation_course/airplane_project/data/LDA_Topics.csv',
    index=False,
)