In [21]:
import spacy
# Load the spaCy English pipeline once at module level; clean_text() below
# calls this shared `nlp` object to tokenize each document.
nlp = spacy.load('en')

def clean_text(text):
    """Lowercase ``text``, tokenize it with spaCy and keep only the letters.

    Newlines and tabs are replaced by spaces before tokenizing with the
    module-level ``nlp`` pipeline. Each token is then reduced to its
    alphabetic characters; tokens that end up empty (pure digits or
    punctuation) are dropped.

    Args:
        text: The raw document, either already-decoded text or a byte string.
            Byte strings are decoded as UTF-8 (a superset of ASCII), falling
            back to Latin-1 when the bytes are not valid UTF-8.

    Returns:
        list: Letters-only text tokens, in document order.
    """
    # `unicode` only exists on Python 2; on Python 3 `str` is the text type.
    try:
        text_type = unicode
    except NameError:
        text_type = str

    # Decode *before* lowercasing so that non-ASCII capitals are lowercased
    # too. Only a decoding failure triggers the Latin-1 fallback; any other
    # error now propagates instead of being hidden by a bare `except`.
    if isinstance(text, bytes):
        try:
            text = text.decode('utf-8')
        except UnicodeDecodeError:
            text = text.decode('latin-1')

    #lowercase all, remove tabs and newlines
    ocr_cleaner = text.lower().replace(u"\n", u" ").replace(u"\t", u" ")

    #tokenize
    doc = nlp(ocr_cleaner)

    #remove punctuation and numbers
    no_numbers_or_punct = []
    for token in doc:
        # Filtering character by character also covers fully alphabetic
        # tokens, which pass through unchanged.
        letters = u"".join(ch for ch in text_type(token) if ch.isalpha())
        if letters:
            no_numbers_or_punct.append(letters)

    return no_numbers_or_punct

In [56]:
import glob

# Corpus files grouped by the label they receive in the metadata table.
percy = ['corpus/st-irvyne.txt', 'corpus/on-love-and-other-essays.txt', 'corpus/zastrozzi.txt']
mary = ['corpus/valperga.txt', 'corpus/history-six-weeks-tour.txt', 'corpus/last-man.txt']
franken_versions = ['corpus/1823_plain.txt','corpus/1831_plain.txt', 'corpus/1818_plain.txt']

# Path -> label lookup. Groups are inserted in the same order the labels used
# to be tested, so a path listed in several groups keeps the last label.
label_by_path = {}
for group_label, group_paths in (("percy", percy), ("mary", mary), ("franken", franken_versions)):
    for group_path in group_paths:
        label_by_path[group_path] = group_label

#process
metadata = []
texts = []
#load texts
files = glob.glob("corpus/*.txt")
for i in files:
    label = label_by_path.get(i)
    if label is None:
        # A file outside the three groups used to raise NameError (first
        # iteration) or silently reuse the previous file's metadata row.
        # Skip it so `texts` and `metadata` stay aligned index by index.
        print("skipping unlabelled corpus file: %s" % i)
        continue
    with open(i) as f:
        txt = f.read()
    #tokenize
    processed = clean_text(txt)
    #get word counts
    wc = len(processed)
    filename = i.replace('corpus/', '').replace('.txt', '')
    texts.append(processed)
    metadata.append([label, filename, wc])
    

In [57]:
#store rows as pickle files
import pickle
import pandas as pd
# Persist the tokenized texts and their metadata rows, one pickle file each.
for pickle_path, payload in (('texts.pickle', texts), ('metadata.pickle', metadata)):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Tabulate the metadata rows (author label, file name, token count) for display.
metadata_columns = ["author", "filename", "word_count"]
df = pd.DataFrame.from_records(metadata, columns=metadata_columns)
df

Unnamed: 0,author,filename,word_count
0,franken,1823_plain,72594
1,mary,valperga,157121
2,percy,st-irvyne,31817
3,franken,1831_plain,75760
4,franken,1818_plain,72621
5,mary,history-six-weeks-tour,18968
6,mary,last-man,176938
7,percy,zastrozzi,30934
8,percy,on-love-and-other-essays,28600


In [135]:
from random import shuffle
from collections import Counter

def chunker(l, n):
    """Generate consecutive slices of ``l`` that are ``n`` items long.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    """
    offsets = range(0, len(l), n)
    for offset in offsets:
        stop = offset + n
        yield l[offset:stop]

#separate training and test
import random

CHUNK_SIZE = 4000          # tokens per chunk
MIN_CHUNK_TOKENS = 3500    # chunks must be strictly longer than this to be kept
MAX_CHUNKS_PER_TEXT = 9    # cap so that long texts do not dominate
SHUFFLE_SEED = 0           # fixed seed so the sampled chunks are reproducible

# Class label for each training author; "franken" texts are held out as test.
TRAIN_LABEL_BY_AUTHOR = {"mary": 0, "percy": 1}
TEST_AUTHOR = "franken"

rng = random.Random(SHUFFLE_SEED)

labels = []
chunk_list = []
counter_chunk_list = []
test_chunk_list = []
test_chunk_counter_list =[]
for index, text in enumerate(texts):
    # `metadata` is built in the same loop as `texts`, so row `index`
    # describes this text. Using its author label instead of hard-coded
    # positions keeps the split correct whatever order glob returned the files in.
    author = metadata[index][0]
    chunks = [u for u in chunker(text, CHUNK_SIZE) if len(u) > MIN_CHUNK_TOKENS]
    rng.shuffle(chunks)
    chunks = chunks[:MAX_CHUNKS_PER_TEXT]
    counter_chunks = [Counter(g) for g in chunks]
    if author == TEST_AUTHOR:
        test_chunk_list.extend(chunks)
        test_chunk_counter_list.extend(counter_chunks)
    elif author in TRAIN_LABEL_BY_AUTHOR:
        chunk_list.extend(chunks)
        counter_chunk_list.extend(counter_chunks)
        # one label per training chunk
        labels.extend([TRAIN_LABEL_BY_AUTHOR[author]] * len(chunks))
    print(index, len(chunks))
Counter(labels).most_common()

(0, 9)
(1, 9)
(2, 8)
(3, 9)
(4, 9)
(5, 4)
(6, 9)
(7, 7)
(8, 7)


[(0, 22), (1, 22)]

In [136]:
# Best effort: rebuild the combined stop-word list (Glasgow IR list + NLTK
# English list) and cache it as fullstops.pickle. A failure is reported
# instead of being silently swallowed, and the notebook keeps running.
try:
    import requests
    from nltk.corpus import stopwords

    response = requests.get(
        'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words',
        timeout=30,  # do not hang the kernel if the server is unreachable
    )
    # Without this check an HTTP error page would be pickled as "stop words".
    response.raise_for_status()
    stoplist1 = response.text.split("\r\n")

    stoplist2 = set(stopwords.words('english'))

    stoplist1.extend(stoplist2)

    # de-duplicate the merged lists
    fullstops = list(set(stoplist1))
    with open('fullstops.pickle', 'wb') as handle2:
        pickle.dump(fullstops, handle2, protocol=pickle.HIGHEST_PROTOCOL)
except Exception as err:
    # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit through.
    print("could not rebuild fullstops.pickle (%s: %s)" % (type(err).__name__, err))

In [137]:
# Reload the stop-word list from the pickle written by the previous cell.
# That cell is best-effort, so if it failed this either loads an older copy
# of the file or raises because the file does not exist.
with open('fullstops.pickle', 'rb') as handle5:
    fullstops = pickle.load(handle5)
# Sanity check: number of held-out test chunks.
len(test_chunk_list)

27

In [139]:
#train chunks to dictionaries
# dictionaries_of_features is otherwise only imported further down, in the
# model-training cell, so on a fresh kernel this cell raised NameError.
# Importing it here makes the notebook runnable from top to bottom.
from application.selective_features import dictionaries_of_features

# Presumably keeps, for every chunk Counter, only the counts of the words in
# `fullstops` -- TODO confirm against application.selective_features.
stop_features_train_list = dictionaries_of_features(counter_chunk_list, fullstops)
# Spot check: the value stored under 'the' for the first training chunk.
stop_features_train_list[0]['the']

272

In [140]:
# Build the same feature dictionaries for the held-out test chunks, using the
# same `fullstops` list as for the training chunks.
# NOTE(review): relies on dictionaries_of_features already being in scope.
stop_features_test_list = dictionaries_of_features(test_chunk_counter_list, fullstops)

In [141]:
# Training samples first, test samples after them: later cells rely on this
# ordering when they slice the vectorized matrix into train and test rows.
all_samples = stop_features_train_list + stop_features_test_list
# Number of training samples, i.e. the row index where the test samples start.
len(stop_features_train_list)

44

In [142]:
from sklearn.feature_extraction import DictVectorizer
from application.selective_features import dictionaries_without_features, dictionaries_of_features
from sklearn.linear_model import LogisticRegression

#train a model on train chunks
#instantiate vectorizer
v = DictVectorizer()
#transform all (train and test together, so both share the same columns)
X = v.fit_transform(all_samples)
#convert to nonsparse
# NOTE(review): despite the name, no scaling is applied here; the name is
# kept because later cells refer to it.
scaled_vsm = X.toarray()

# all_samples is train + test, so the first len(stop_features_train_list)
# rows are the training rows. Deriving the split point avoids a hard-coded
# row count that goes stale whenever the number of chunks changes.
n_train = len(stop_features_train_list)

#train logistic on the training rows
lr = LogisticRegression()
lr.fit(scaled_vsm[:n_train], labels)
# sanity check: one label per training row
n_train == len(labels)

True

In [143]:
# Test rows follow the training rows in scaled_vsm (all_samples = train + test),
# so slice from the number of training samples instead of a hard-coded 44.
test_vsm = scaled_vsm[len(stop_features_train_list):]
preds = lr.predict(test_vsm)
probs = lr.predict_proba(test_vsm)

In [144]:
# Predicted class per test chunk; in the chunking cell label 0 was assigned
# to the "mary" texts and label 1 to the "percy" texts.
preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0])

In [146]:
# Number of test chunks, then one line per chunk:
# predicted label, P(class 0), P(class 1).
print(len(preds))
for pred, class_probs in zip(preds, probs):
    # %-formatting with a single argument prints the same text as the old
    # `print a, b, c` statement, but is valid on Python 2 and Python 3.
    print("%s %s %s" % (pred, class_probs[0], class_probs[1]))

27
0 0.9999999985912944 1.408705616529943e-09
0 0.9999999556899809 4.431001911505475e-08
0 0.9999999971363603 2.8636396908267516e-09
0 0.9999999143287365 8.567126355854404e-08
0 0.9967294184566055 0.003270581543394548
0 0.9999928107608674 7.189239132566104e-06
0 0.9999999782250559 2.1774944096427544e-08
0 0.9999999956929638 4.30703615015119e-09
0 0.9998000605279981 0.0001999394720019693
0 0.9999999833994171 1.660058296347535e-08
0 0.999999875666924 1.2433307596990115e-07
0 0.9940768118024936 0.005923188197506338
0 0.9999999992555831 7.444168247128601e-10
0 0.9999998529026518 1.470973482154441e-07
0 0.9999994922789138 5.077210861349699e-07
0 0.9999919707355938 8.029264406212783e-06
0 0.999999994676023 5.323977003002001e-09
0 0.9999999955350581 4.464941852639941e-09
0 0.9999999951525894 4.8474105806390094e-09
0 0.9999999678316999 3.216830017470894e-08
0 0.9999988316708783 1.1683291216925451e-06
0 0.9999980820169146 1.9179830853865363e-06
0 0.999999981708437 1.8291563075054573e-08
0 0.999

In [147]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
#train nb on the training rows
# all_samples is train + test, so the training rows are the first
# len(stop_features_train_list) rows; this replaces the hard-coded 44.
gnb_n_train = len(stop_features_train_list)
gnb.fit(scaled_vsm[:gnb_n_train], labels)
# sanity check: one label per training row
gnb_n_train == len(labels)

True

In [157]:
# NOTE(review): this cell follows the GaussianNB training cell but still
# predicts with `lr`, so it repeats the logistic-regression results --
# confirm whether `gnb` was intended.
# Test rows follow the training rows in scaled_vsm, so slice from the number
# of training samples instead of a hard-coded 44.
test_vsm = scaled_vsm[len(stop_features_train_list):]
preds = lr.predict(test_vsm)
probs = lr.predict_proba(test_vsm)
for pred, class_probs in zip(preds, probs):
    # Build the tuple explicitly so the printed form (a tuple with
    # probabilities fixed to 11 decimals) is the same on Python 2 and 3.
    formatted_row = (pred, format(class_probs[0], '.11f'), format(class_probs[1], '.11f'))
    print(formatted_row)

(0, '0.99999999859', '0.00000000141')
(0, '0.99999995569', '0.00000004431')
(0, '0.99999999714', '0.00000000286')
(0, '0.99999991433', '0.00000008567')
(0, '0.99672941846', '0.00327058154')
(0, '0.99999281076', '0.00000718924')
(0, '0.99999997823', '0.00000002177')
(0, '0.99999999569', '0.00000000431')
(0, '0.99980006053', '0.00019993947')
(0, '0.99999998340', '0.00000001660')
(0, '0.99999987567', '0.00000012433')
(0, '0.99407681180', '0.00592318820')
(0, '0.99999999926', '0.00000000074')
(0, '0.99999985290', '0.00000014710')
(0, '0.99999949228', '0.00000050772')
(0, '0.99999197074', '0.00000802926')
(0, '0.99999999468', '0.00000000532')
(0, '0.99999999554', '0.00000000446')
(0, '0.99999999515', '0.00000000485')
(0, '0.99999996783', '0.00000003217')
(0, '0.99999883167', '0.00000116833')
(0, '0.99999808202', '0.00000191798')
(0, '0.99999998171', '0.00000001829')
(0, '0.99981401230', '0.00018598770')
(0, '0.99999288518', '0.00000711482')
(0, '0.99999995642', '0.00000004358')
(0, '0.99999