In [149]:
"""
Simple Program that reads a text file and prints 
some properties from it.

Author: Oscar Lopez
"""

import urllib.request


class TextFunctions:
    def __init__(self):
        """
        Main constructor of the program.
        """
        self.data = []
        
    
    def open_file(self, url):
        """
        Opens a file from a given url and appends raw data as ascii data to data.
        """
        raw_data = urlopen(url)
        for line in raw_data:
            self.data.append(line.decode('ascii').strip().split(" "))
            
            
    def get_num_words(self):
        """
        Returns the number of words of the file.
        """
        num_words = 0
        for sentence in self.data:
            for word in sentence:
                if word:
                    num_words += 1
        return num_words
    
    
    def get_num_lines(self):
        """
        Returns the number of lines in the data file.
        """
        return len(self.data)

    
    def get_num_chars(self):
        """
        Returns the number of characters in the data file.
        """
        num_chars = 0
        for sentence in self.data:
            for word in sentence:
                for char in word:
                    num_chars += 1
        return num_chars
                    
    
    def get_num_sentences(self):
        """
        Returns the number of sentences in the file.
        """
        num_sentences = 0
        punctuation = ['.', '?', '!']
        for sentence in self.data:
            for word in sentence:
                for char in word: 
                    if char in punctuation:
                        num_sentences += 1
        return num_sentences
    

    def get_word_frequency(self):
        """
        Returns the frequency of words in the data file.
        """
        freq = {}
        for sentence in self.data:
            for word in sentence:
                if word not in freq:
                    freq[word] = 1
                else:
                    freq[word] += 1
        return freq

    
    def get_top_50_frequency(self, freq_dict):
        """
        Returns the top_50 word frequency list.
        """
        frequency_list = []
        top_50_freq = []
        for word, freq_value in freq_dict.items():
            frequency_list.append((freq_value, word))
        frequency_list.sort(reverse=True)
        for i in range(50):
            top_50_freq.append((frequency_list[i][0], frequency_list[i][1]))
        return top_50_freq
    
    
""" Main Program"""
    
t = TextFunctions()
t.open_file("https://www.gutenberg.org/files/2701/2701.txt")
print(f'Number of words: %d'%t.get_num_words())
print(f'Number of lines: %d'%t.get_num_lines())
print(f'Number of characters: %d'%t.get_num_chars())
print(f'Number of sentences: %d'%t.get_num_sentences())

print('\nTop 50 Word Frequencies:\n ')
for position, item in enumerate(t.get_top_50_frequency(t.get_word_frequency())):
    print(f'%d : %s, %f'%(position + 1, item[1], item[0]/t.get_num_words() * 100))

Number of words: 214435
Number of lines: 23870
Number of characters: 1008115
Number of sentences: 10269

Top 50 Word Frequencies:
 
1 : the, 6.408935
2 : , 3.150139
3 : of, 3.048010
4 : and, 2.773801
5 : a, 2.110197
6 : to, 2.099937
7 : in, 1.797281
8 : that, 1.261455
9 : his, 1.132744
10 : I, 0.809103
11 : with, 0.771329
12 : as, 0.747080
13 : is, 0.741483
14 : was, 0.729359
15 : it, 0.714902
16 : he, 0.702311
17 : for, 0.645417
18 : all, 0.608576
19 : at, 0.573134
20 : this, 0.546553
21 : by, 0.522769
22 : from, 0.496188
23 : but, 0.485462
24 : not, 0.481731
25 : be, 0.459347
26 : on, 0.429501
27 : so, 0.370275
28 : you, 0.362814
29 : one, 0.356751
30 : have, 0.350223
31 : had, 0.350223
32 : or, 0.335766
33 : were, 0.300324
34 : But, 0.299391
35 : their, 0.286334
36 : an, 0.271411
37 : are, 0.268613
38 : some, 0.267680
39 : they, 0.267214
40 : my, 0.260219
41 : which, 0.259752
42 : him, 0.259752
43 : The, 0.256022
44 : like, 0.254623
45 : upon, 0.248094
46 : into, 0.241099
47 : when,