In [None]:
# Import the os module to check if files exist before reading
import os  

def count_lines_words(filename):
    """
    Count the lines and whitespace-separated words of a UTF-8 text file.

    Parameters:
    filename (str): Location of the text file to inspect.

    Returns:
    tuple: (line_count, word_count). If the file does not exist, an error
    message is printed and (0, 0) is returned instead.
    """
    line_total = 0
    word_total = 0

    try:
        with open(filename, 'r', encoding='utf-8') as handle:
            # Walk the file one line at a time, keeping running totals
            for text_line in handle:
                line_total += 1
                # split() with no argument breaks on any run of whitespace
                word_total += len(text_line.split())
    except FileNotFoundError:
        # Missing file: report it and fall back to zero counts
        print(f"Error: {filename} not found.")
        return 0, 0

    return line_total, word_total

# Speech transcripts whose line and word counts are reported below
files = [
    './data/obama_speech.txt',
    './data/michelle_obama_speech.txt',
    './data/donald_speech.txt',
    './data/melina_trump_speech.txt'
]

# Report on every transcript; paths that are absent on disk are only announced
for file in files:
    if not os.path.exists(file):
        print(f"File not found: {file}")
        continue

    # The path exists, so count its lines and words and show the totals
    lines, words = count_lines_words(file)
    print(f"{file}: {lines} lines, {words} words")


File not found: ./data/obama_speech.txt
File not found: ./data/michelle_obama_speech.txt
File not found: ./data/donald_speech.txt
File not found: ./data/melina_trump_speech.txt


In [None]:
import json  # Import the JSON module to handle JSON files

def most_populated_countries(filename, top_n):
    """
    Return the `top_n` most populated countries listed in a JSON file.

    The loaded JSON is iterated as a sequence of records, and each record is
    indexed with the keys 'name' and 'population'.

    Parameters:
    filename (str): Path to the JSON file.
    top_n (int): Number of countries to return. Zero or a negative value
        yields an empty list; a value larger than the number of records
        returns every record.

    Returns:
    list: Dictionaries of the form {'country': <name>, 'population': <value>},
        ordered from the largest population to the smallest.
    str: The message "File not found: <filename>" when the file is missing.
    """
    try:
        # Open and parse the JSON file
        with open(filename, 'r', encoding='utf-8') as file:
            countries = json.load(file)
    except FileNotFoundError:
        # Kept as a string result so existing callers that print it still work
        return f"File not found: {filename}"

    # A negative slice bound would drop entries from the END of the ranking
    # (e.g. [:-1] returns all but the last), which is not "top N" of anything.
    if top_n is not None and top_n < 0:
        top_n = 0

    # Largest population first
    sorted_countries = sorted(countries, key=lambda entry: entry['population'], reverse=True)

    # Keep only the name and population of the leading top_n records
    return [
        {'country': country['name'], 'population': country['population']}
        for country in sorted_countries[:top_n]
    ]

# Example usage: show the 10 and then the 3 most populated countries
for top_n in (10, 3):
    print(most_populated_countries('./data/countries_data.json', top_n))


File not found: ./data/countries_data.json
File not found: ./data/countries_data.json


In [8]:
import json  # Import the JSON module to handle JSON files

def most_populated_countries(filename, top_n):
    """
    Return the `top_n` most populated countries listed in a JSON file.

    NOTE: this notebook also defines a function with this name in an earlier
    cell; when the cells run in order, this later definition is the one in
    effect.

    The loaded JSON is iterated as a sequence of records, and each record is
    indexed with the keys 'name' and 'population'.

    Parameters:
    filename (str): Path to the JSON file.
    top_n (int): Number of countries to return. Zero or a negative value
        yields an empty list; a value larger than the number of records
        returns every record.

    Returns:
    list: Dictionaries of the form {'country': <name>, 'population': <value>},
        ordered from the largest population to the smallest.
    str: The message "File not found: <filename>" when the file is missing.
    """
    try:
        # Open and parse the JSON file
        with open(filename, 'r', encoding='utf-8') as file:
            countries = json.load(file)
    except FileNotFoundError:
        # Kept as a string result so existing callers that print it still work
        return f"File not found: {filename}"

    # A negative slice bound would drop entries from the END of the ranking
    # (e.g. [:-1] returns all but the last), which is not "top N" of anything.
    if top_n is not None and top_n < 0:
        top_n = 0

    # Largest population first
    sorted_countries = sorted(countries, key=lambda entry: entry['population'], reverse=True)

    # Keep only the name and population of the leading top_n records
    return [
        {'country': country['name'], 'population': country['population']}
        for country in sorted_countries[:top_n]
    ]

# Example usage: show the 10 and then the 3 most populated countries
for top_n in (10, 3):
    print(most_populated_countries('./data/countries_data.json', top_n))


File not found: ./data/countries_data.json
File not found: ./data/countries_data.json


Extract All Incoming Email Addresses
We will extract email addresses from email_exchange_big.txt using regular expressions.

In [9]:
import re

def extract_emails(filename):
    """
    Extract the unique email addresses that appear in a text file.

    Parameters:
    filename (str): The path to the file.

    Returns:
    list: Unique email addresses, in the order they first appear in the file.
    str: The message "File not found: <filename>" when the file is missing.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as file:
            text = file.read()
    except FileNotFoundError:
        return f"File not found: {filename}"

    # Regular expression for extracting emails
    emails = re.findall(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}', text)

    # dict.fromkeys() drops duplicates while keeping first-seen order.
    # list(set(...)) would also de-duplicate, but the order of a set of strings
    # changes from one interpreter run to the next (hash randomization), which
    # makes the notebook output non-reproducible.
    return list(dict.fromkeys(emails))

# Example usage: collect the addresses from the sample mailbox and show them
email_log = './data/email_exchange_big.txt'
emails = extract_emails(email_log)
print(emails)


File not found: ./data/email_exchange_big.txt


Find the Most Common Words
We will count word frequencies and return the n most common words.

In [10]:
from collections import Counter
import re

def find_most_common_words(filename, n):
    """
    Rank the words of a text file by how often they occur.

    The text is lower-cased and words are matched with the pattern
    \\b[a-zA-Z]+\\b, so only runs of ASCII letters between word boundaries
    are counted.

    Parameters:
    filename (str): The file to analyze.
    n (int): Number of most common words to return.

    Returns:
    list: (word, count) tuples as produced by Counter.most_common(n),
        most frequent first.
    str: The message "File not found: <filename>" when the file is missing.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as source:
            lowered = source.read().lower()
    except FileNotFoundError:
        return f"File not found: {filename}"

    # Tally every matched word, then let Counter pick the n most frequent
    tally = Counter(re.findall(r'\b[a-zA-Z]+\b', lowered))
    return tally.most_common(n)

# Example usage: the 10 and then the 5 most common words of the sample text
for word_limit in (10, 5):
    print(find_most_common_words('./data/sample.txt', word_limit))


File not found: ./data/sample.txt
File not found: ./data/sample.txt


Find the Most Frequent Words in the Speeches
We reuse find_most_common_words() for different speech files.


In [None]:
# Find the most frequent words in speeches: one (label, path) pair per speech
speeches = (
    ("Obama's Speech:", './data/obama_speech.txt'),
    ("Michelle's Speech:", './data/michelle_obama_speech.txt'),
    ("Trump's Speech:", './data/donald_speech.txt'),
    ("Melina's Speech:", './data/melina_trump_speech.txt'),
)

# Print each label followed by that speech's 10 most common words
for label, speech_path in speeches:
    print(label, find_most_common_words(speech_path, 10))


Check Similarity Between Two Texts
We will:

Clean the text (remove special characters and make lowercase)

Remove stop words (common words that don’t add meaning)

Compare text similarity (Jaccard similarity)


In [None]:
def clean_text(text):
    """Return the lower-cased words of `text` after deleting every character
    that is neither an ASCII letter nor whitespace."""
    # Strip first, lower-case second: the order matters for non-ASCII input
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', text)
    return letters_and_spaces.lower().split()

def remove_support_words(words, stop_words_file):
    """Return the entries of `words` that are not listed in the stop-words file.

    The file is read as whitespace-separated words. If it does not exist,
    `words` is returned unchanged.
    """
    try:
        with open(stop_words_file, 'r', encoding='utf-8') as handle:
            blocked = set(handle.read().split())
    except FileNotFoundError:
        # No stop-words file available: nothing to filter out
        return words

    kept = []
    for candidate in words:
        if candidate not in blocked:
            kept.append(candidate)
    return kept

def check_text_similarity(file1, file2, stop_words_file):
    """
    Checks the similarity between two texts using Jaccard similarity.

    Both files are cleaned with clean_text(), filtered with
    remove_support_words(), and reduced to sets of distinct words before
    the comparison.

    Parameters:
    file1 (str): Path to first text file.
    file2 (str): Path to second text file.
    stop_words_file (str): Path to stop words file.

    Returns:
    float: Similarity percentage, rounded to two decimals. 0.0 is returned
        when neither text has any words left to compare.
    str: The message "One of the files not found." when file1 or file2
        is missing.
    """
    try:
        with open(file1, 'r', encoding='utf-8') as f1, open(file2, 'r', encoding='utf-8') as f2:
            words1 = clean_text(f1.read())
            words2 = clean_text(f2.read())

        words1 = set(remove_support_words(words1, stop_words_file))
        words2 = set(remove_support_words(words2, stop_words_file))

        all_words = words1 | words2

        # Two empty word sets (empty files, or nothing but stop words and
        # punctuation) would make the denominator zero. Report 0.0 by
        # convention instead of raising ZeroDivisionError.
        if not all_words:
            return 0.0

        # Jaccard similarity: |A ∩ B| / |A ∪ B|
        similarity = len(words1 & words2) / len(all_words)

        return round(similarity * 100, 2)  # Convert to percentage

    except FileNotFoundError:
        return "One of the files not found."

# Example: Checking similarity between Michelle's and Melina's speeches
similarity_pct = check_text_similarity(
    './data/michelle_obama_speech.txt',
    './data/melina_trump_speech.txt',
    './data/stop_words.txt',
)
print(similarity_pct)


Find the 10 Most Repeated Words in romeo_and_juliet.txt
We can use the find_most_common_words() function.

In [11]:
# The 10 most repeated words in the play, via the word-frequency helper
romeo_top_words = find_most_common_words('./data/romeo_and_juliet.txt', 10)
print(romeo_top_words)


File not found: ./data/romeo_and_juliet.txt


Analyze the Hacker News CSV File
We need to count lines containing specific keywords.

In [12]:
import csv

def count_keyword_occurrences(filename, keyword, exclude=None):
    """
    Counts the CSV records that contain a keyword.

    Each record's fields are joined with single spaces and lower-cased, then
    searched for `keyword` as a case-insensitive substring (so 'java' also
    matches 'javascript' unless it is excluded).

    Parameters:
    filename (str): Path to the CSV file.
    keyword (str): The keyword to search for.
    exclude (str, optional): If given and non-empty, records that contain
        this text are not counted.

    Returns:
    int: The count of matching records.
    str: The message "File not found: <filename>" when the file is missing.
    """
    try:
        count = 0
        # newline='' is what the csv module asks for: it lets the reader see
        # line endings untranslated, so newlines embedded in quoted fields are
        # parsed as part of the field.
        with open(filename, 'r', encoding='utf-8', newline='') as file:
            # Lower-case the search terms once rather than once per record
            needle = keyword.lower()
            banned = exclude.lower() if exclude else None

            for row in csv.reader(file):
                line = ' '.join(row).lower()
                if needle not in line:
                    continue
                if banned and banned in line:
                    continue
                count += 1
        return count
    except FileNotFoundError:
        return f"File not found: {filename}"

# Count occurrences: all three tallies read the same Hacker News export
hn_path = './data/hacker_news.csv'
python_count = count_keyword_occurrences(hn_path, 'python')
javascript_count = count_keyword_occurrences(hn_path, 'javascript')
# 'java' is a substring of 'javascript', so those rows are excluded explicitly
java_count = count_keyword_occurrences(hn_path, 'java', exclude='javascript')

# One report line per tally
print(
    f"Lines containing 'Python': {python_count}",
    f"Lines containing 'JavaScript': {javascript_count}",
    f"Lines containing 'Java' but NOT 'JavaScript': {java_count}",
    sep="\n",
)


Lines containing 'Python': File not found: ./data/hacker_news.csv
Lines containing 'JavaScript': File not found: ./data/hacker_news.csv
Lines containing 'Java' but NOT 'JavaScript': File not found: ./data/hacker_news.csv
