# Author recognition

## Prepare
 - Imports
 - Cleaning algorithm
 - Reading algorithm

In [1]:
import urllib.request
import re
import string

In [2]:
def cleaning(data):
    """Normalise raw text.

    Steps, in order:
        1. lower the letters
        2. delete ASCII punctuation (every character of string.punctuation,
           backslash included), carriage returns and the curly quotes “ ” ’
        3. replace every run of digits with a single _NUM_ token

    Inputs:
        data - text to clean
    Outputs:
        the cleaned text (whitespace is left untouched)
    """
    # A translation table deletes the characters literally. Building a regex
    # character class from raw string.punctuation is fragile: the "\]" pair
    # inside it is read as an escaped "]", so the backslash was never removed.
    unwanted = str.maketrans('', '', string.punctuation + '\r' + '“”’')
    formated_data = data.lower().translate(unwanted)
    # Digits are handled after punctuation removal so that the underscores of
    # the _NUM_ token are not stripped again.
    return re.sub('[' + string.digits + ']+', '_NUM_', formated_data)

In [3]:
def reading(path, encoding='utf-8'):
    """Reading data from a plain-text URL and cleaning it line by line.

    The payload is split on newlines; each line is decoded (undecodable
    bytes are turned into backslash escapes by the 'backslashreplace'
    handler), passed through cleaning() and stripped. Lines that end up
    empty are dropped.

    Inputs:
        path - url of the plain-text resource
        encoding - codec used to decode each line; an empty string is
            treated as 'utf-8'
    Outputs:
        output - list of cleaned, non-empty lines in document order
    """
    # The original check accepted '' as "default", but bytes.decode('')
    # raises LookupError before that check is ever reached.
    encoding = encoding or 'utf-8'
    # Loop-invariant: escape fix-ups are only applied to utf-8 input.
    fix_escapes = encoding.lower() == 'utf-8'

    # Close the response deterministically instead of leaking the connection.
    with urllib.request.urlopen(path) as response:
        data = response.read()

    data_cleaned = []
    for raw_line in data.split(b'\n'):
        try:
            line = raw_line.decode(encoding, 'backslashreplace')
        except UnicodeError:
            # Best effort: report the offending raw line and carry on.
            print(raw_line)
            continue

        if fix_escapes:
            # Byte 0x92 is the Windows-1252 curly apostrophe; map it back
            # to a plain one.
            line = line.replace('\\x92', '\'')
            # Any other escaped byte becomes '?'. The hex digits are matched
            # exactly (two of [0-9a-f]), so escapes such as \xe9 are consumed
            # whole and the character following an escape is never swallowed.
            # A stray backslash is replaced as well, as before.
            line = re.sub(r'\\(?:x[0-9a-f]{2})?', '?', line)
            # Byte-order mark left over at the start of the document.
            line = line.replace('\ufeff', '?')

        line = cleaning(line).strip()
        if line:
            data_cleaned.append(line)

    return data_cleaned

## Reading Inputs
Note: http://www.gutenberg.org sometimes asks for a CAPTCHA

### Mark Twain
* The Project Gutenberg EBook of The Adventures of Tom Sawyer, Complete by
  Mark Twain (Samuel Clemens)
* Project Gutenberg’s The American Claimant, by Mark Twain (Samuel
  Clemens)
* The Project Gutenberg EBook of Adventures of Huckleberry Finn, Complete
  by Mark Twain (Samuel Clemens)

In [4]:
# The Adventures of Tom Sawyer
mark_twain_1 = reading(
    'http://www.gutenberg.org/files/74/74-0.txt',
    'UTF-8',
)
# The American Claimant
mark_twain_2 = reading(
    'http://www.gutenberg.org/files/3179/3179-0.txt',
)
# Adventures of Huckleberry Finn
mark_twain_3 = reading(
    'http://www.gutenberg.org/files/76/76-0.txt',
)

### Jules Verne
Note: Jules Verne did not write in English, so these texts are translations
* The Project Gutenberg EBook of Around the World in 80 Days, by Jules Verne
* Project Gutenberg's A Journey to the Interior of the Earth, by Jules Verne
* The Project Gutenberg EBook of The Master of the World, by Jules Verne

In [5]:
# Around the World in 80 Days
jules_verne_1 = reading(
    'http://www.gutenberg.org/cache/epub/103/pg103.txt',
)
# A Journey to the Interior of the Earth
jules_verne_2 = reading(
    'http://www.gutenberg.org/cache/epub/3748/pg3748.txt',
)
# The Master of the World
jules_verne_3 = reading(
    'http://www.gutenberg.org/cache/epub/3809/pg3809.txt',
)

### Walter Scott
* The Project Gutenberg EBook of The Black Dwarf, by Sir Walter Scott
* The Project Gutenberg EBook of The Heart of Mid-Lothian, Complete, by Sir Walter Scott
* The Project Gutenberg EBook of The Talisman, by Sir Walter Scott

In [11]:
# The Black Dwarf
walter_scott_1 = reading(
    'http://www.gutenberg.org/files/1460/1460-0.txt',
)
# The Heart of Mid-Lothian, Complete
walter_scott_2 = reading(
    'http://www.gutenberg.org/files/6944/6944-0.txt',
)
# The Talisman
walter_scott_3 = reading(
    'http://www.gutenberg.org/files/1377/1377-0.txt',
)

In [13]:
# Sanity check on the downloads: a count of about 274 lines means the site
# took us for a robot and served its challenge page instead of the book.
print(*(
    [author, *(len(book) for book in books)]
    for author, books in (
        ('Mark Twain', (mark_twain_1, mark_twain_2, mark_twain_3)),
        ('Jules Verne', (jules_verne_1, jules_verne_2, jules_verne_3)),
        ('Walter Scott', (walter_scott_1, walter_scott_2, walter_scott_3)),
    )
))

['Mark Twain', 6926, 6301, 9671] ['Jules Verne', 6441, 7552, 4400] ['Walter Scott', 5458, 21178, 11861]
