In [1]:
import json
import os


In [2]:
def most_spoken_languages(filename, n):
    """
    Load a JSON list of country records and rank languages by how many
    records list them.

    Each record is expected to carry a 'languages' list. The result is a
    list of (count, language) tuples, highest count first, sliced to the
    first n entries; languages with equal counts appear in descending
    name order because the tuples are sorted in reverse.
    """
    with open(filename, 'r', encoding='utf-8') as handle:
        countries = json.load(handle)

    # Tally one occurrence per (country, language) pairing.
    tally = {}
    for record in countries:
        for language in record['languages']:
            tally[language] = tally.get(language, 0) + 1

    # Count goes first in the tuple so it is the primary sort key.
    ranked = sorted(
        ((count, language) for language, count in tally.items()),
        reverse=True,
    )
    return ranked[:n]

# Show the ten languages listed by the most countries as (count, language) tuples.
print(most_spoken_languages(filename="./data/countries_data.json", n=10))

[(91, 'English'), (45, 'French'), (25, 'Arabic'), (24, 'Spanish'), (9, 'Russian'), (9, 'Portuguese'), (8, 'Dutch'), (7, 'German'), (5, 'Chinese'), (4, 'Swahili')]


In [3]:
def most_populated_countries(filename, n):
    """
    Reads a JSON file, sorts countries by population,
    and returns the n most populated countries.

    Parameters:
        filename: path to a JSON file holding a list of country records,
            each with at least 'name' and 'population' keys.
        n: maximum number of countries to return. Values larger than the
            number of records return every record; zero or negative
            values return an empty list.

    Returns:
        A list of {"country": name, "population": population} dicts in
        descending population order. Countries with equal populations
        keep their relative order from the file (the sort is stable).

    Raises:
        KeyError: if a record lacks 'name' or 'population'.
    """
    with open(filename, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Built-in sort replaces the hand-rolled O(n^2) swap loop and leaves
    # the loaded list itself untouched.
    ranked = sorted(data, key=lambda country: country['population'], reverse=True)

    # Slicing (instead of indexing range(n)) tolerates n > len(data);
    # max() keeps a negative n from slicing from the end of the list.
    return [
        {"country": country['name'], "population": country['population']}
        for country in ranked[:max(n, 0)]
    ]

# Show the ten most populated countries as {"country", "population"} dicts.
print(most_populated_countries(filename="./data/countries_data.json", n=10))


[{'country': 'China', 'population': 1377422166}, {'country': 'India', 'population': 1295210000}, {'country': 'United States of America', 'population': 323947000}, {'country': 'Indonesia', 'population': 258705000}, {'country': 'Brazil', 'population': 206135893}, {'country': 'Pakistan', 'population': 194125062}, {'country': 'Nigeria', 'population': 186988000}, {'country': 'Bangladesh', 'population': 161006790}, {'country': 'Russian Federation', 'population': 146599183}, {'country': 'Japan', 'population': 126960000}]


In [11]:

def check_text_similarity(file1, file2, stop_words_file, data_dir="Data"):
    """
    Computes the Jaccard similarity between two text files after removing stop words.

    Parameters:
        file1: path of the first text file.
        file2: path of the second text file.
        stop_words_file: path of a file with one stop word per line.
        data_dir: directory the three paths are resolved against. Absolute
            paths ignore it; pass "" to use the paths exactly as given.

    Returns:
        len(A & B) / len(A | B) as a float, where A and B are the sets of
        lower-cased, punctuation-stripped, non-stop words of each file.
        Returns 0.0 when both sets are empty, since there is no vocabulary
        to compare.

    Raises:
        FileNotFoundError: if any of the three files cannot be opened.
    """
    def read_text(name):
        # os.path.join drops data_dir when name is already absolute.
        with open(os.path.join(data_dir, name), 'r', encoding='utf-8') as handle:
            return handle.read()

    stop_words = set(read_text(stop_words_file).splitlines())

    # Map every punctuation character to a space in a single pass.
    punctuation = "!@#$%^&*()_+-=<>?/.,;:'\"{}[]|\\"
    to_spaces = str.maketrans(punctuation, " " * len(punctuation))

    def clean_text(text):
        words = text.translate(to_spaces).lower().split()
        return {word for word in words if word not in stop_words}

    words1 = clean_text(read_text(file1))
    words2 = clean_text(read_text(file2))

    union = words1 | words2
    if not union:
        # Both texts were empty or consisted only of stop words.
        return 0.0
    return len(words1 & words2) / len(union)

# Compare the two speeches and report the score rounded to two decimals.
similarity = check_text_similarity(
    file1="michelle_obama_speech.txt",
    file2="melina_trump_speech.txt",
    stop_words_file="stop_words.txt",
)
print(f"Text Similarity Score: {similarity:.2f}")


Text Similarity Score: 0.25
