import matplotlib.pyplot as plt

# Count the tokens each tokenizer produced for every title
df["simple_token_count"] = df["title_tokens_simple"].apply(len)
df["hf_token_count"] = df["title_tokens_hf"].apply(len)

# One summary row per tokenizer: mean, minimum and maximum title length
count_columns = [df["simple_token_count"], df["hf_token_count"]]
summary_table = pd.DataFrame({
    "Method": ["Simple Tokenizer", "Hugging Face Tokenizer"],
    "Average Tokens": [counts.mean() for counts in count_columns],
    "Min Tokens": [counts.min() for counts in count_columns],
    "Max Tokens": [counts.max() for counts in count_columns],
})

print(summary_table)


In [None]:
# Plot token count distributions for both tokenizers on one set of axes.
simple_counts = df["simple_token_count"]
hf_counts = df["hf_token_count"]

# With bins=30 alone, each hist() call derives its bin edges from its own
# data, so the two overlaid histograms would not line up bar-for-bar.
# Passing one common range gives both calls identical edges.
if df.empty:
    shared_range = None  # nothing to span; let matplotlib use its default
else:
    shared_range = (
        min(simple_counts.min(), hf_counts.min()),
        max(simple_counts.max(), hf_counts.max()),
    )

plt.figure(figsize=(12, 6))
plt.hist(simple_counts, bins=30, range=shared_range, alpha=0.6, label="Simple Tokenizer", color="blue")
plt.hist(hf_counts, bins=30, range=shared_range, alpha=0.6, label="Hugging Face Tokenizer", color="orange")
plt.xlabel("Token Count")
plt.ylabel("Frequency")
plt.title("Distribution of Token Counts")
plt.legend()
plt.show()


In [None]:
# Select a random sample of up to 10 rows.
# DataFrame.sample raises ValueError when asked for more rows than exist
# (without replacement), so cap the request at the frame's length.
sample_size = min(10, len(df))
sample = df.sample(sample_size, random_state=42)

# Plot token counts for the sample as side-by-side bars:
# the simple tokenizer sits at integer positions, the Hugging Face
# tokenizer is shifted right by one bar width.
x = range(len(sample))
width = 0.35
hf_positions = [i + width for i in x]
tick_positions = [i + width / 2 for i in x]  # centred between each bar pair

plt.figure(figsize=(12, 6))
plt.bar(x, sample["simple_token_count"], width=width, label="Simple Tokenizer", color="blue", alpha=0.6)
plt.bar(hf_positions, sample["hf_token_count"], width=width, label="Hugging Face Tokenizer", color="orange", alpha=0.6)
plt.xlabel("Sample Rows")
plt.ylabel("Token Count")
plt.title("Token Counts for Sample Rows")
# Label each bar pair with the sampled row's index label from df
plt.xticks(tick_positions, sample.index, rotation=90)
plt.legend()
plt.tight_layout()
plt.show()


In [None]:
# Per-title difference in token count (Hugging Face minus simple)
df["token_count_difference"] = df["hf_token_count"] - df["simple_token_count"]
count_diff = df["token_count_difference"]

# Histogram of the differences, with the mean marked by a dashed line
fig, ax = plt.subplots(figsize=(12, 6))
ax.hist(count_diff, bins=30, color="purple", alpha=0.7)
ax.axvline(count_diff.mean(), color="red", linestyle="--", label="Mean Difference")
ax.set_xlabel("Difference in Token Count (HF - Simple)")
ax.set_ylabel("Frequency")
ax.set_title("Difference in Token Counts Between Tokenizers")
ax.legend()
plt.show()


In [None]:
from collections import Counter


def _plot_top_tokens(freq_pairs, color, label, title):
    """Bar-chart a list of (token, count) pairs and return (tokens, counts)."""
    plt.figure(figsize=(12, 6))
    tokens, freqs = zip(*freq_pairs)
    plt.bar(tokens, freqs, color=color, alpha=0.7, label=label)
    plt.xlabel("Tokens")
    plt.ylabel("Frequency")
    plt.title(title)
    plt.xticks(rotation=45, ha="right")
    plt.legend()
    plt.show()
    return tokens, freqs


# Concatenate every title's tokens into one flat list per tokenizer
simple_token_flat = []
for title_tokens in df["title_tokens_simple"]:
    simple_token_flat.extend(title_tokens)

hf_token_flat = []
for title_tokens in df["title_tokens_hf"]:
    hf_token_flat.extend(title_tokens)

# The 20 most frequent tokens for each tokenizer
simple_token_freq = Counter(simple_token_flat).most_common(20)
hf_token_freq = Counter(hf_token_flat).most_common(20)

# One chart per tokenizer
simple_tokens, simple_freqs = _plot_top_tokens(
    simple_token_freq,
    color="blue",
    label="Simple Tokenizer",
    title="Top 20 Tokens by Frequency (Simple Tokenizer)",
)
hf_tokens, hf_freqs = _plot_top_tokens(
    hf_token_freq,
    color="orange",
    label="Hugging Face Tokenizer",
    title="Top 20 Tokens by Frequency (Hugging Face Tokenizer)",
)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


def _fit_tfidf(texts):
    """Fit a 20-feature TF-IDF model on *texts*.

    Returns the fitted vectorizer, the sparse document-term matrix, and a
    dense DataFrame whose columns are the vectorizer's feature names.
    """
    vectorizer = TfidfVectorizer(max_features=20)
    matrix = vectorizer.fit_transform(texts)
    frame = pd.DataFrame(matrix.toarray(), columns=vectorizer.get_feature_names_out())
    return vectorizer, matrix, frame


# TfidfVectorizer takes strings, so join each token list with spaces
df["simple_text"] = df["title_tokens_simple"].apply(" ".join)
df["hf_text"] = df["title_tokens_hf"].apply(" ".join)

# TF-IDF over titles, one model per tokenizer
tfidf_simple, simple_tfidf_matrix, simple_tfidf_df = _fit_tfidf(df["simple_text"])
tfidf_hf, hf_tfidf_matrix, hf_tfidf_df = _fit_tfidf(df["hf_text"])

# Same treatment for the SpaCy entity lists (abstracts and titles)
df["spacy_abstract_text"] = df["disease_abstract_spacy"].apply(" ".join)
df["spacy_title_text"] = df["disease_title_spacy"].apply(" ".join)

tfidf_spacy_abstract, spacy_abstract_matrix, spacy_abstract_tfidf_df = _fit_tfidf(df["spacy_abstract_text"])
tfidf_spacy_title, spacy_title_matrix, spacy_title_tfidf_df = _fit_tfidf(df["spacy_title_text"])

In [None]:
# Preview the TF-IDF frame built from the SpaCy abstract entities.
# The trailing bare expression is what gets rendered when this runs as a
# notebook cell; in a plain script it has no visible effect.
print("TF-IDF Matrix (SpaCy Abstract):")
spacy_abstract_tfidf_df.head()

In [None]:
# Preview the TF-IDF frame built from the SpaCy title entities.
# The trailing bare expression is what gets rendered when this runs as a
# notebook cell; in a plain script it has no visible effect.
print("\nTF-IDF Matrix (SpaCy Title):")
spacy_title_tfidf_df.head()

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt


def _make_cloud(text):
    """Build an 800x400 white-background word cloud from one text string."""
    return WordCloud(width=800, height=400, background_color="white").generate(text)


def _show_cloud(cloud, title):
    """Render a word cloud image in its own figure, without axes."""
    plt.figure(figsize=(12, 6))
    plt.imshow(cloud, interpolation="bilinear")
    plt.axis("off")
    plt.title(title)
    plt.show()


# Clouds from the flattened token lists of each tokenizer
simple_wordcloud = _make_cloud(" ".join(simple_token_flat))
hf_wordcloud = _make_cloud(" ".join(hf_token_flat))

# Clouds from the space-joined SpaCy entity text columns
spacy_abstract_wordcloud = _make_cloud(" ".join(df["spacy_abstract_text"]))
spacy_title_wordcloud = _make_cloud(" ".join(df["spacy_title_text"]))

# Draw each cloud in turn
_show_cloud(simple_wordcloud, "Word Cloud: Simple Tokenizer")
_show_cloud(hf_wordcloud, "Word Cloud: Hugging Face Tokenizer")
_show_cloud(spacy_abstract_wordcloud, "Word Cloud: SpaCy Abstract Entities")
_show_cloud(spacy_title_wordcloud, "Word Cloud: SpaCy Title Entities")


In [None]:
# Ensure 'parsed_date' is datetime before using the .dt accessor.
# pd.to_datetime leaves an already-datetime column unchanged, so this is a
# no-op in that case and a real conversion otherwise.
df["parsed_date"] = pd.to_datetime(df["parsed_date"])
df["year"] = df["parsed_date"].dt.year


def _flatten_entities(entity_lists):
    """Concatenate a group's per-row entity lists into one flat list."""
    return [entity for entities in entity_lists for entity in entities]


# Flatten and group entities by year
abstract_entities_by_year = df.groupby("year")["disease_abstract_spacy"].apply(_flatten_entities)
title_entities_by_year = df.groupby("year")["disease_title_spacy"].apply(_flatten_entities)

# Keep the five most frequent entities for each year as (entity, count) pairs
abstract_top_entities_by_year = abstract_entities_by_year.apply(lambda entities: Counter(entities).most_common(5))
title_top_entities_by_year = title_entities_by_year.apply(lambda entities: Counter(entities).most_common(5))

print("Top Entities by Year (Abstract):")
print(abstract_top_entities_by_year)

print("\nTop Entities by Year (Title):")
print(title_top_entities_by_year)


In [None]:
def _entity_overlap_ratio(row):
    """Return |title ∩ abstract| / |title ∪ abstract| for one row's entities.

    The denominator is floored at 1, so a row with no entities at all
    yields 0 rather than dividing by zero.
    """
    title_entities = set(row["disease_title_spacy"])
    abstract_entities = set(row["disease_abstract_spacy"])
    shared = title_entities & abstract_entities
    combined = title_entities | abstract_entities
    return len(shared) / max(1, len(combined))


# Row-wise overlap between title and abstract entity sets
df["overlap_ratio"] = df.apply(_entity_overlap_ratio, axis=1)

# Mean overlap for each publication year
publication_year = df["parsed_date"].dt.year
overlap_by_year = df.groupby(publication_year)["overlap_ratio"].mean()

# Line chart of the yearly average
fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(overlap_by_year.index, overlap_by_year, marker="o", color="purple", label="Title-Abstract Overlap")
ax.set_xlabel("Year")
ax.set_ylabel("Average Overlap Ratio")
ax.set_title("Title-Abstract Entity Overlap Over the Years")
ax.grid(alpha=0.3)
ax.legend()
plt.show()


In [None]:
from collections import Counter

# Pool all abstract entities per publication year, then tally them
abstract_entities_by_year = (
    df.groupby(df["parsed_date"].dt.year)["disease_abstract_spacy"]
    .apply(lambda yearly: [entity for entities in yearly for entity in entities])
)
abstract_counts_by_year = abstract_entities_by_year.apply(lambda entities: Counter(entities))

# Topics whose yearly frequency is tracked
topics = ["tumor", "cancer", "infection", "diabetes", "pain"]

# One column per topic, one row per year; topics absent in a year count as 0
abstract_trend_columns = {}
for topic in topics:
    abstract_trend_columns[topic] = abstract_counts_by_year.apply(
        lambda counts, wanted=topic: counts.get(wanted, 0)
    )
abstract_topic_trends = pd.DataFrame(abstract_trend_columns)
abstract_topic_trends.index.name = "Year"

# One line per topic
fig, ax = plt.subplots(figsize=(12, 6))
for topic in topics:
    ax.plot(abstract_topic_trends.index, abstract_topic_trends[topic], label=topic)

ax.set_xlabel("Year")
ax.set_ylabel("Frequency")
ax.set_title("Abstract Topic Trends Over the Years")
ax.legend(title="Topics")
ax.grid(alpha=0.3)
plt.show()


In [None]:
def _shared_entity_count(row):
    """Number of distinct entities present in both the title and the abstract."""
    in_title = set(row["disease_title_spacy"])
    in_abstract = set(row["disease_abstract_spacy"])
    return len(in_title & in_abstract)


# Per-row count of entities common to title and abstract
df["entity_overlap"] = df.apply(_shared_entity_count, axis=1)

# How many rows have each overlap size
overlap_counts = df["entity_overlap"].value_counts()
print("Entity Overlap Between Title and Abstract:")
print(overlap_counts)

# Bar chart of the overlap-size frequencies
fig, ax = plt.subplots(figsize=(10, 6))
overlap_counts.plot(kind="bar", color="skyblue", ax=ax)
ax.set_xlabel("Number of Overlapping Entities")
ax.set_ylabel("Frequency")
ax.set_title("Overlap Between Title and Abstract Entities")
plt.xticks(rotation=0)
plt.show()


In [None]:
# Tally every entity occurrence across all titles and across all abstracts
title_entity_counts = Counter(
    entity for entities in df["disease_title_spacy"] for entity in entities
)
abstract_entity_counts = Counter(
    entity for entities in df["disease_abstract_spacy"] for entity in entities
)

# Turn each tally into a one-column frame, most frequent entity first
title_counts_df = pd.DataFrame.from_dict(title_entity_counts, orient="index", columns=["count"])
title_counts_df = title_counts_df.sort_values(by="count", ascending=False)

abstract_counts_df = pd.DataFrame.from_dict(abstract_entity_counts, orient="index", columns=["count"])
abstract_counts_df = abstract_counts_df.sort_values(by="count", ascending=False)

print("Value Counts Table for Title Entities:")
print(title_counts_df.head(100))

print("\nValue Counts Table for Abstract Entities:")
print(abstract_counts_df.head(100))


In [None]:
from collections import Counter


def _entity_count_frame(counts):
    """Convert an entity tally into a 'count' frame sorted high to low."""
    frame = pd.DataFrame.from_dict(counts, orient="index", columns=["count"])
    return frame.sort_values(by="count", ascending=False)


# Tally entity occurrences over all titles and over all abstracts
title_entity_counts = Counter(
    entity for entities in df["disease_title_spacy"] for entity in entities
)
abstract_entity_counts = Counter(
    entity for entities in df["disease_abstract_spacy"] for entity in entities
)

# Frames indexed by entity, most frequent first
title_counts_df = _entity_count_frame(title_entity_counts)
abstract_counts_df = _entity_count_frame(abstract_entity_counts)

print("Top 10 Title Entities by Count:")
print(title_counts_df.head(10))

print("\nTop 10 Abstract Entities by Count:")
print(abstract_counts_df.head(10))


In [None]:
# Convert set to list for indexing
top_entities_list = list(top_entities)

# Combine counts for the top entities.
# reindex() is used instead of .loc[list]: .loc raises KeyError as soon as
# one entity is missing from a table (e.g. it occurs only in abstracts),
# whereas reindex inserts the missing label and fill_value=0 records it as
# "never seen" — which is what the original trailing fillna(0) intended.
combined_counts = pd.DataFrame({
    "title": title_counts_df["count"].reindex(top_entities_list, fill_value=0),
    "abstract": abstract_counts_df["count"].reindex(top_entities_list, fill_value=0),
}).sort_values(by=["title", "abstract"], ascending=False)

# Stacked bar chart: title counts at the bottom, abstract counts on top
combined_counts.plot(kind="bar", stacked=True, figsize=(12, 6), color=["blue", "orange"])
plt.title("Entity Frequencies (Titles vs. Abstracts)")
plt.xlabel("Entities")
plt.ylabel("Frequency")
plt.legend(title="Source")
plt.grid(alpha=0.3)
plt.show()


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Space-join each row's entity list so it can be fed to the vectorizer
df["title_text"] = df["disease_title_spacy"].apply(" ".join)
df["abstract_text"] = df["disease_abstract_spacy"].apply(" ".join)

# A single 20-feature vectorizer is refitted for each corpus in turn, so
# the feature names must be read right after the corresponding fit.
tfidf_vectorizer = TfidfVectorizer(max_features=20)

# Titles
tfidf_title_matrix = tfidf_vectorizer.fit_transform(df["title_text"])
title_features = tfidf_vectorizer.get_feature_names_out()
tfidf_title_df = pd.DataFrame(tfidf_title_matrix.toarray(), columns=title_features)

# Abstracts (this refit replaces the title vocabulary held by the vectorizer)
tfidf_abstract_matrix = tfidf_vectorizer.fit_transform(df["abstract_text"])
abstract_features = tfidf_vectorizer.get_feature_names_out()
tfidf_abstract_df = pd.DataFrame(tfidf_abstract_matrix.toarray(), columns=abstract_features)

print("TF-IDF (Titles):")
print(tfidf_title_df.head())

print("\nTF-IDF (Abstracts):")
print(tfidf_abstract_df.head())


In [None]:
# Pool all title entities per publication year, then tally them
title_entities_by_year = (
    df.groupby(df["parsed_date"].dt.year)["disease_title_spacy"]
    .apply(lambda yearly: [entity for entities in yearly for entity in entities])
)
title_counts_by_year = title_entities_by_year.apply(lambda entities: Counter(entities))

# One column per tracked topic, one row per year; absent topics count as 0
title_trend_columns = {}
for topic in topics:
    title_trend_columns[topic] = title_counts_by_year.apply(
        lambda counts, wanted=topic: counts.get(wanted, 0)
    )
title_topic_trends = pd.DataFrame(title_trend_columns)
title_topic_trends.index.name = "Year"

# One line per topic
fig, ax = plt.subplots(figsize=(12, 6))
for topic in topics:
    ax.plot(title_topic_trends.index, title_topic_trends[topic], label=topic)

ax.set_xlabel("Year")
ax.set_ylabel("Frequency")
ax.set_title("Title Topic Trends Over the Years")
ax.legend(title="Topics")
ax.grid(alpha=0.3)
plt.show()


In [None]:
# Bucket rows by calendar month and pool each month's abstract entities
df["month_year"] = df["parsed_date"].dt.to_period("M")
monthly_entities = (
    df.groupby("month_year")["disease_abstract_spacy"]
    .apply(lambda monthly: [entity for entities in monthly for entity in entities])
)
monthly_counts = monthly_entities.apply(lambda entities: Counter(entities))

# Per-month frequency of each tracked entity (0 when it does not occur)
monthly_top_entities = ["tumor", "cancer", "infection", "pain"]
monthly_trend_columns = {}
for term in monthly_top_entities:
    monthly_trend_columns[term] = monthly_counts.apply(
        lambda counts, wanted=term: counts.get(wanted, 0)
    )
monthly_trends = pd.DataFrame(monthly_trend_columns)

# Line chart with a marker at every month
ax = monthly_trends.plot(figsize=(12, 6), marker="o")
ax.set_title("Monthly Trends for Top Abstract Entities")
ax.set_xlabel("Month-Year")
ax.set_ylabel("Frequency")
ax.legend(title="Entities")
ax.grid(alpha=0.3)
plt.show()


In [None]:
# Convert set to list for indexing
top_entities_list = list(top_entities)

# Combine counts for the top entities.
# reindex() is used instead of .loc[list]: .loc raises KeyError as soon as
# one entity is missing from a table (e.g. it occurs only in abstracts),
# whereas reindex inserts the missing label and fill_value=0 records it as
# "never seen" — which is what the original trailing fillna(0) intended.
combined_counts = pd.DataFrame({
    "title": title_counts_df["count"].reindex(top_entities_list, fill_value=0),
    "abstract": abstract_counts_df["count"].reindex(top_entities_list, fill_value=0),
})

# Normalize to 100% by dividing each value by the row sum.
# An entity with zero occurrences in both sources divides 0 by 0 and ends
# up as NaN, so no bar is drawn for it.
combined_counts_normalized = combined_counts.div(combined_counts.sum(axis=1), axis=0) * 100

# Sort the normalized counts (optional)
combined_counts_normalized = combined_counts_normalized.sort_values(by=["title", "abstract"], ascending=False)

# Stacked bar chart with 100% scaling
combined_counts_normalized.plot(kind="bar", stacked=True, figsize=(12, 6), color=["blue", "orange"])
plt.title("Entity Frequencies (Titles vs. Abstracts) - 100% Stacked")
plt.xlabel("Entities")
plt.ylabel("Percentage (%)")
plt.legend(title="Source")
plt.grid(alpha=0.3)
plt.show()
