## Preprocessing steps. I thought we did something more complex for the intro, but this is what I can share that is applicable to our project.

In [None]:
# Count the articles whose detected language is not English.

# detect() is compared against 'en', the code that represents English
non_en_count = sum(1 for text in ec_data['Text'] if detect(text) != 'en')
print(f'Number of non-English articles: {non_en_count}')

In [None]:
# Mode handed to tokenize(): 0 - lowercase, 1 - stemming, 2 - lemmatizing
mod = 2


def _preprocess(doc):
    """Tokenize a single document with the currently selected mode."""
    return tokenize(doc, mod)


# Pre-process texts and store the result next to the raw column
text_preproc = ec_data["Text"].astype(str).progress_apply(_preprocess)
ec_data["text_preproc"] = text_preproc
print("Done with text!")

# Pre-process titles in exactly the same way
tit_preproc = ec_data["Title"].astype(str).progress_apply(_preprocess)
ec_data["titles_preproc"] = tit_preproc
print("Done with titles!")

# Summarise the frame, now including the two new columns
ec_data.info()


In [None]:
# Check for null values: count the missing entries of wh_data
# (the result is the last expression, so it is shown as the cell output).
# NOTE(review): the neighbouring cells operate on ec_data / joint_data —
# confirm wh_data is the frame intended here.
wh_data.isna().sum()

## Exploratory data analysis suggestion: this runs count vectorization (CV), TF-IDF and LDA for the parameters we choose and writes every output to a text file. This is important for then selecting the min_df and max_df parameters.

In [None]:

def _format_svd_topics(header, components, feature_names, n_top=10):
    """Render the n_top highest-weighted terms of every SVD component as text."""
    lines = [header]
    for topic_idx, weights in enumerate(components):
        top_words_idx = weights.argsort()[::-1][:n_top]  # indices of the largest weights
        top_words = ', '.join(feature_names[i] for i in top_words_idx)
        lines.append(f"Topic {topic_idx+1}: {top_words}\n")
    return ''.join(lines)


def analyze_text_data(data, min_df, max_df, *, num_topics=10, passes=10, random_state=10):
    """Extract topics with count-SVD, TF-IDF-SVD and LDA and report their top words.

    Three analyses are run for one ``(min_df, max_df)`` pair:

    1. ``CountVectorizer`` (uni- and bigrams, English stop words removed)
       followed by ``TruncatedSVD`` on the title-weighted document-term matrix
       ``2 * titles + texts``.
    2. ``TfidfVectorizer`` (uni- and bigrams) followed by ``TruncatedSVD`` on
       the text matrix.
    3. A gensim ``LdaModel`` trained on the same weighted count matrix as (1).

    Args:
        data: Object exposing ``text_preproc`` and ``titles_preproc``
            (presumably a pandas DataFrame of pre-processed strings — confirm
            against the preprocessing cell).
        min_df: Forwarded to both vectorizers as ``min_df``.
        max_df: Forwarded to both vectorizers as ``max_df``.
        num_topics: Number of SVD components and LDA topics.
        passes: Number of LDA training passes.
        random_state: Seed used for both SVD fits and for LDA, so that the
            report is reproducible between runs.

    Returns:
        A tuple ``(count_vectorization_output, tfidf_vectorization_output,
        lda_output)`` of human-readable strings listing the top 10 words of
        every topic.
    """
    # --- Count vectorization -------------------------------------------------
    cv = CountVectorizer(ngram_range=(1, 2), lowercase=True, min_df=min_df, max_df=max_df, stop_words='english')
    cv.fit(data.text_preproc)

    # Keep the matrices sparse: with bigrams the vocabulary is large and a
    # dense documents x terms array can easily exhaust memory. Titles count
    # twice as much as body text.
    dtm = 2 * cv.transform(data.titles_preproc) + cv.transform(data.text_preproc)

    svd = TruncatedSVD(n_components=num_topics, random_state=random_state)
    svd.fit(dtm)

    # --- TF-IDF --------------------------------------------------------------
    tfidf = TfidfVectorizer(ngram_range=(1, 2), min_df=min_df, max_df=max_df)
    tfidf_matrix_text = tfidf.fit_transform(data.text_preproc)

    svd_tfidf = TruncatedSVD(n_components=num_topics, random_state=random_state)
    svd_tfidf.fit(tfidf_matrix_text)

    # --- LDA -----------------------------------------------------------------
    # Rows of dtm are documents, hence documents_columns=False.
    corpus = Sparse2Corpus(csr_matrix(dtm), documents_columns=False)
    id2word = {term_id: term for term, term_id in cv.vocabulary_.items()}
    dictionary = Dictionary.from_corpus(corpus, id2word=id2word)

    lda_model = models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=passes,
        random_state=random_state,
    )

    # --- Reports -------------------------------------------------------------
    lda_lines = ["Top words associated with each topic (LDA):\n"]
    # num_topics is passed explicitly so every topic is listed, not just the
    # first ten that show_topics() returns by default.
    for topic_num, topic_words in lda_model.show_topics(num_topics=num_topics, num_words=10, formatted=False):
        words = ', '.join(word for word, _ in topic_words)
        lda_lines.append(f"Topic {topic_num + 1}: {words}\n")
    lda_output = ''.join(lda_lines)

    count_vectorization_output = _format_svd_topics(
        "Top words associated with each topic (Count Vectorization):\n",
        svd.components_,
        cv.get_feature_names_out(),
    )
    tfidf_vectorization_output = _format_svd_topics(
        "\nTop words associated with each topic (TF-IDF Vectorization):\n",
        svd_tfidf.components_,
        tfidf.get_feature_names_out(),
    )

    return count_vectorization_output, tfidf_vectorization_output, lda_output

In [None]:
def run_and_log_analyses(min_df_values, max_df_values, datasets, output_file_name):
    """Run analyze_text_data over a parameter grid and log every report to a file.

    For each dataset, every combination of ``min_df`` and ``max_df`` is
    analysed and the three returned reports are appended to the output file.

    Args:
        min_df_values: Iterable of ``min_df`` values to try.
        max_df_values: Iterable of ``max_df`` values to try; it is iterated
            once per ``min_df`` value, so it must be re-iterable (e.g. a list).
        datasets: Mapping of dataset name to the data object handed to
            ``analyze_text_data``.
        output_file_name: Path of the text file to (over)write. Missing parent
            directories are created.
    """
    # Local import keeps this cell self-contained regardless of the notebook's
    # import cell.
    from pathlib import Path

    output_path = Path(output_file_name)
    # Writing into e.g. "topics_analysis_output/" must not fail just because
    # the folder has not been created yet.
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Explicit UTF-8: topic words may contain non-ASCII characters, and the
    # platform default encoding is not guaranteed to handle them.
    with output_path.open("w", encoding="utf-8") as file:
        for dataset_name, data in datasets.items():
            file.write(f"Analyzing dataset: {dataset_name}\n\n")
            for min_df in min_df_values:
                for max_df in max_df_values:
                    count_vectorization_output, tfidf_vectorization_output, lda_output = analyze_text_data(data, min_df, max_df)

                    file.write(f"min_df: {min_df}, max_df: {max_df}\n")
                    file.write(count_vectorization_output + "\n")
                    file.write(tfidf_vectorization_output + "\n")
                    file.write(lda_output + "\n\n")
            file.write("\n")  # Separator between datasets
    print(f"Analysis completed. Output written to {output_file_name}")

In [None]:
# First parameter grid: the one actually analysed by the call below
min_df_values = [0.005, 0.01, 0.05, 0.1]
max_df_values = [0.2, 0.3]

# Second grid (same min_df candidates, higher max_df); not used by any call in this cell
min_df_values_2 = list(min_df_values)
max_df_values_2 = [0.4, 0.5, 0.6]

# Name -> data mapping of everything that should be analysed
datasets = dict(joint_data=joint_data)

run_and_log_analyses(
    min_df_values=min_df_values,
    max_df_values=max_df_values,
    datasets=datasets,
    output_file_name="topics_analysis_output/analysis_output.txt",
)

# Training the LDA model with the parameters we decide on

In [None]:
def analyze_text_with_lda(data, min_df, max_df, *, num_topics=10, passes=10, random_state=10):
    """Train an LDA model on the title-weighted count matrix and print its topics.

    Texts and titles are vectorized with one ``CountVectorizer`` (uni- and
    bigrams) fitted on the texts; the document-term matrix used for training
    is ``2 * titles + texts``.

    Args:
        data: Object exposing ``text_preproc`` and ``titles_preproc``
            (presumably a pandas DataFrame of pre-processed strings — confirm
            against the preprocessing cell).
        min_df: Forwarded to ``CountVectorizer`` as ``min_df``.
        max_df: Forwarded to ``CountVectorizer`` as ``max_df``.
        num_topics: Number of LDA topics.
        passes: Number of LDA training passes.
        random_state: Seed for the LDA model.

    Returns:
        A tuple ``(lda_model, corpus, cv)``: the trained gensim model, the
        gensim corpus it was trained on and the fitted ``CountVectorizer``.
    """
    # NOTE(review): analyze_text_data builds its CountVectorizer with
    # lowercase=True and stop_words='english'; here neither is applied, so the
    # vocabulary can differ from the one the min_df/max_df choice was based
    # on — confirm this is intended.
    cv = CountVectorizer(ngram_range=(1, 2), lowercase=False, min_df=min_df, max_df=max_df)
    cv.fit(data.text_preproc)

    # Keep the matrices sparse: with bigrams the vocabulary is large and a
    # dense documents x terms array can easily exhaust memory. Titles count
    # twice as much as body text.
    dtm = 2 * cv.transform(data.titles_preproc) + cv.transform(data.text_preproc)

    # Rows of dtm are documents, hence documents_columns=False.
    corpus = Sparse2Corpus(csr_matrix(dtm), documents_columns=False)

    # Build a fully initialised gensim Dictionary from the vectorizer's
    # vocabulary instead of patching the private mappings of an empty one.
    id2word = {term_id: term for term, term_id in cv.vocabulary_.items()}
    dictionary = Dictionary.from_corpus(corpus, id2word=id2word)

    lda_model = models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=passes,
        random_state=random_state,
    )

    # Display the topics; num_topics is passed explicitly so every topic is
    # listed, not just the first ten that show_topics() returns by default.
    print("Top words associated with each topic (LDA):")
    for topic_num, topic_words in lda_model.show_topics(num_topics=num_topics, num_words=10, formatted=False):
        words = ', '.join(word for word, _ in topic_words)
        print(f"Topic {topic_num + 1}: {words}")

    return lda_model, corpus, cv


In [None]:
# Train the final LDA model on the joint data with the chosen document-frequency bounds
lda_model_joint, corpus_joint, cv = analyze_text_with_lda(
    joint_data,
    min_df=0.05,
    max_df=0.5,
)