In [1]:
import re
import matplotlib.pyplot as plt
from collections import Counter

# Mount Google Drive at /content/drive (Colab-only) so the file paths defined
# below, which all live under that mount point, can be read and written.
from google.colab import drive
drive.mount('/content/drive')



# File paths on Google Drive:
#   input_file    - article text that is read and scanned for citations
#   output_chart  - image file the citation bar chart is saved to
#   output_report - text file the citation lists are written to
input_file = '/content/drive/My Drive/test1/article_preprint.txt'
output_chart = '/content/drive/My Drive/test1/sample_comm_cites.jpg'
output_report = '/content/drive/My Drive/test1/sample_refs_report.txt'



# Load the whole article into memory as a single UTF-8 decoded string
with open(input_file, mode="r", encoding="utf-8") as article_handle:
    text = article_handle.read()

# Regular expression patterns

# Author part shared by both citation styles: a capitalised word, optionally
# followed by "et al." or "& Surname".
AUTHOR_FRAGMENT = r"[A-Z][a-z]+(?:\s(?:et\sal\.|&\s[A-Z][a-z]+)?)?"

# Parenthetical citations such as "(Smith, 2020)" or "(e.g., Smith et al., 2020)".
# The single capture group yields "Author, YEAR".
# NOTE: the match is anchored on the opening "(", so in a multi-citation
# parenthetical like "(Smith, 2020; Jones, 2021)" only the first entry matches.
name_date_pattern = re.compile(r"\((?:e\.g\.,\s*)?(" + AUTHOR_FRAGMENT + r",\s\d{4})")

# Narrative citations such as "Smith (2020)".
# The capture group spans the author AND the parenthesised year, so that
# different years by the same author stay distinct (capturing the author alone
# would collapse "Smith (2020)" and "Smith (2021)" into one "Smith" entry).
date_only_pattern = re.compile(r"(" + AUTHOR_FRAGMENT + r"\s\(\d{4}\))")

# Collect the captured group of every match of each pattern in the article text
name_date_citations = [hit.group(1) for hit in name_date_pattern.finditer(text)]
date_only_matches = [hit.group(1) for hit in date_only_pattern.finditer(text)]

# Tally how often each parenthetical (name, date) citation occurs
citation_counts = Counter()
citation_counts.update(name_date_citations)

# Keep the 20 most frequent citations, most frequent first
most_common_citations = citation_counts.most_common(20)

# Plot a horizontal bar chart of the most frequent citations and save it to
# output_chart. The figure is closed afterwards rather than left open.
fig, ax = plt.subplots(figsize=(10, 6))
if most_common_citations:
    labels, values = zip(*most_common_citations)
    # barh places the first entry at the bottom; reverse so the most frequent
    # citation ends up at the top of the chart.
    ax.barh(labels[::-1], values[::-1])
else:
    # With no citations, zip(*[]) has nothing to unpack into labels/values.
    # Draw a placeholder message so the chart file is still produced.
    ax.text(0.5, 0.5, "No citations found",
            ha="center", va="center", transform=ax.transAxes)
ax.set_xlabel("Frequency")
ax.set_title("Most Common Citations")
fig.tight_layout()
fig.savefig(output_chart)
plt.close(fig)

# De-duplicate each citation list, then sort it in place
unique_name_date_citations = list(set(name_date_citations))
unique_name_date_citations.sort()

unique_date_only_citations = list(set(date_only_matches))
unique_date_only_citations.sort()

# Assemble the report: each citation list goes under its own heading, one
# citation per line, with a blank line separating the two sections.
report_sections = [
    "Citations with Names and Dates in Parentheses:\n",
    "\n".join(unique_name_date_citations) + "\n\n",
    "Citations with Only Dates in Parentheses:\n",
    "\n".join(unique_date_only_citations) + "\n",
]

# Write the assembled sections to the report file, replacing any existing content
with open(output_report, "w", encoding="utf-8") as out:
    out.writelines(report_sections)

print("Citation analysis complete. Outputs saved.")


Mounted at /content/drive
Citation analysis complete. Outputs saved.
