<a href="https://colab.research.google.com/github/mintycake420/Basic-Exercises-for-courses/blob/main/InformationRetreival_EX01_211718366.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Overview:**

This project uses the Wikipedia library to fetch 10 random Wikipedia articles which contain over 500 characters. It then extracts the article text, saves each article to a designated folder, cleans the text by removing any non-letter characters (numbers, punctuation, emojis), and finally uses Python's built-in collections.Counter to compute the 50 most common words across all articles, along with their occurrence counts.

Submitted by:
Yotam Katz
Date: 06.11.2025

Course: אחזור מידע 26 3700 א01

Lecturer: Dr. Moshe Friedman


ID of submitter: 211718366


Email: Yotamkatz2000@gmail.com

In [None]:
# Libraries and integrations required:
!pip install wikipedia
import wikipedia
import re
from collections import Counter
import os
from google.colab import drive




In [None]:
#Cleans the text from non-letters and converts to lowercase
def clean_text(text):
    """Turn raw text into a list of lowercase ASCII-letter words.

    Every character that is neither an ASCII letter (a-z, A-Z) nor
    whitespace is deleted outright -- it is not replaced by a space, so
    "well-known" yields "wellknown". What remains is lowercased and split
    on runs of whitespace.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_and_spaces.lower().split()

In [None]:
def fetch_articles(num_articles=10):
    """Collect random Wikipedia articles whose content exceeds 500 characters.

    Random titles are tried one at a time until ``num_articles`` articles
    have been gathered or ``num_articles * 3`` attempts have been spent,
    whichever happens first, so the result may contain fewer articles than
    requested. Disambiguation pages, missing pages and any other error are
    reported on stdout and count as a spent attempt.

    Returns:
        A list of dicts, each with a 'title' and a 'content' entry.
    """
    print("Fetching articles from Wikipedia...")
    collected = []

    # Retry budget: bounds the loop when many candidates are rejected.
    attempts_left = num_articles * 3

    while attempts_left > 0 and len(collected) < num_articles:
        attempts_left -= 1
        try:
            candidate = wikipedia.random(1)
            print(f"Trying: {candidate}")

            # NOTE(review): auto_suggest=False presumably forces a lookup of
            # the exact title -- confirm against the wikipedia package docs.
            page = wikipedia.page(candidate, auto_suggest=False)
            body = page.content

            # Only articles with more than 500 characters are kept.
            if len(body) <= 500:
                print(f"‚úó Article too short, trying another...")
                continue

            collected.append({'title': page.title, 'content': body})
            print(f"‚úì Added article {len(collected)}/{num_articles}: {page.title}")
        except wikipedia.exceptions.DisambiguationError:
            # Disambiguation pages are lists of articles, not articles.
            print(f"‚úó Disambiguation page, skipping...")
        except wikipedia.exceptions.PageError:
            print(f"‚úó Page not found, skipping...")
        except Exception as err:
            print(f"‚úó Error: {err}")

    return collected


In [None]:
#Converts the fetched wikipedia articles into txt files and saves to local folder
def save_articles_to_files(articles, directory):
    """Write each article to its own UTF-8 text file inside ``directory``.

    Files are named ``article_1.txt``, ``article_2.txt``, ... in the order
    the articles are given. Each file starts with a ``Title: ...`` line and
    a blank line, followed by the article content. An existing file with
    the same name is overwritten.

    Args:
        articles: Iterable of dicts providing 'title' and 'content' entries.
        directory: Destination folder; created (with parents) if missing.

    Raises:
        OSError: If the folder cannot be created or a file cannot be written.
    """
    # exist_ok=True replaces the former exists()-then-makedirs() pair, which
    # could fail if the folder appeared between the check and the creation.
    os.makedirs(directory, exist_ok=True)

    print(f"\nSaving articles to '{directory}' directory...")

    for i, article in enumerate(articles, 1):
        filename = f"{directory}/article_{i}.txt"
        with open(filename, 'w', encoding='utf-8') as f:
            f.write(f"Title: {article['title']}\n\n")
            f.write(article['content'])
        print(f"Saved: {filename}")


In [None]:
def analyze_word_frequency(articles, top_n=50):
    """Count cleaned words across all articles.

    Each article's 'content' is passed through ``clean_text`` and the
    resulting words are tallied in one shared counter.

    Returns:
        A tuple ``(most_common, total_words)``: the ``top_n`` most frequent
        ``(word, count)`` pairs, and the number of words counted overall.
    """
    print("\nAnalyzing word frequencies...")

    tally = Counter()
    for entry in articles:
        tally.update(clean_text(entry['content']))

    # The sum of all counts equals the number of words fed into the counter.
    total_words = sum(tally.values())
    return tally.most_common(top_n), total_words


In [None]:
#Save the most common words to a file, as well as the total number of words
def save_word_frequencies(most_common, total_words, filename):
    """Write a ranked word-frequency table to ``filename`` (UTF-8).

    The report has a title line stating how many words are listed, the
    total number of words analyzed, and one left-aligned row per word with
    its rank (starting at 1), the word and its count.

    Args:
        most_common: Iterable of ``(word, count)`` pairs, most frequent first.
        total_words: Total number of words that were analyzed.
        filename: Path of the report file; overwritten if it exists.
    """
    print(f"\nSaving word frequencies to '{filename}'...")

    # Materialise once so the title can state the real number of rows rather
    # than a hard-coded 50 (the caller chooses top_n, and a small corpus may
    # contain fewer distinct words).
    entries = list(most_common)

    with open(filename, 'w', encoding='utf-8') as f:
        f.write(f"{len(entries)} MOST COMMON WORDS\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Total words analyzed: {total_words}\n\n")
        f.write(f"{'Rank':<6} {'Word':<20} {'Count':<10}\n")
        f.write("-" * 50 + "\n")

        for rank, (word, count) in enumerate(entries, 1):
            f.write(f"{rank:<6} {word:<20} {count:<10}\n")

    print(f"Word frequencies saved to '{filename}'")

In [None]:
def main():
    """Run the whole pipeline and store the results on Google Drive.

    Steps: mount Drive, make sure the output folder exists, fetch 10
    articles, save them under ``<output>/articles``, compute the 50 most
    common words and write them to ``<output>/word_frequencies.txt``.
    Returns early, after printing a message, when no article was fetched.
    """
    banner = "=" * 60

    # All results live under this Drive folder.
    output_path = '/content/drive/MyDrive/Colab Notebooks/Information Retreival'
    articles_dir = f"{output_path}/articles"
    freq_file = f"{output_path}/word_frequencies.txt"

    print(banner)
    print("Wikipedia Article Analyzer")
    print(banner)

    print("\nMounting Google Drive...")
    drive.mount('/content/drive')

    os.makedirs(output_path, exist_ok=True)
    print(f"‚úì Output directory ready: {output_path}")

    articles = fetch_articles(num_articles=10)
    if not articles:
        # Nothing to save or analyze.
        print("\n‚ùå Failed to fetch any articles!")
        return

    save_articles_to_files(articles, directory=articles_dir)

    most_common, total_words = analyze_word_frequency(articles, top_n=50)
    save_word_frequencies(most_common, total_words, filename=freq_file)

    # Final summary for the notebook output.
    print("\n" + banner)
    print("Analysis complete!")
    print(banner)
    print("\nResults:")
    print(f"- {len(articles)} articles saved in '{articles_dir}'")
    print(f"- Word frequency analysis saved to '{freq_file}'")
    print(f"- Total words analyzed: {total_words:,}")
    print(f"\n‚úì All files saved to Google Drive!")
    print(f"üìÅ Location: {output_path}")

# Run the pipeline only when this code is executed directly (as a script or
# notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    main()

Wikipedia Article Analyzer

Mounting Google Drive...
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
‚úì Output directory ready: /content/drive/MyDrive/Colab Notebooks/Information Retreival
Fetching articles from Wikipedia...
Trying: Gimli
‚úó Disambiguation page, skipping...
Trying: Wally A. Forsberg
‚úì Added article 1/10: Wally A. Forsberg
Trying: William Smith (Archdeacon of Armagh)
‚úó Article too short, trying another...
Trying: Dulit frogmouth
‚úì Added article 2/10: Dulit frogmouth
Trying: 2015 Holiday Bowl
‚úì Added article 3/10: 2015 Holiday Bowl
Trying: Kavurmak√ºp√º, Ergani
‚úó Article too short, trying another...
Trying: Charles F. Baird
‚úì Added article 4/10: Charles F. Baird
Trying: Salas Cannonier
‚úì Added article 5/10: Salas Cannonier
Trying: Omer Newsome
‚úì Added article 6/10: Omer Newsome
Trying: 1898 in France
‚úì Added article 7/10: 1898 in France
Trying: Marie Pierre Adrien Francas