### Setup & Imports

In [1]:
# Reload the autoreload extension and re-import edited modules before each cell runs.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [52]:
from fastai import *
from fastai.text import *
import datetime

from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

import pandas as pd

### Data

In [74]:
# Works to train on, written as '<author>/<title>'; each entry is appended to
# the literature folder path to locate that work's text.
works = [
    'dostoevsky/the_idiot',
    'dostoevsky/the_brothers_karamazov',
    'dostoevsky/crime_and_punishment',
    'tolstoy/war_and_peace',
    'tolstoy/anna_karenina',
]

In [75]:
# Train one language model per entry in `works`.
# NOTE: train_models / train_model are defined in later cells ("Train Model"),
# so those cells must be executed before this one.
train_models(works)

epoch,train_loss,valid_loss,accuracy,time
0,3.801318,#na#,00:43,


epoch,train_loss,valid_loss,accuracy,time
0,3.459785,#na#,00:56,
1,3.327889,#na#,00:56,
2,3.125623,#na#,00:56,
3,2.970819,#na#,00:56,
4,2.642975,#na#,00:56,
5,2.418244,#na#,00:57,
6,2.14869,#na#,00:57,
7,1.971007,#na#,00:56,
8,1.860406,#na#,00:56,
9,1.798606,#na#,00:56,


FileNotFoundError: [Errno 2] No such file or directory: '/home/ubuntu/nlp/course-nlp/literature/dostoevsky/the_idiot/models/dostoevsky/the_idiot_weights.pth'

### Train Model

In [73]:
def train_models(works):
    """
    Train one language model per entry in `works`, bracketed by log markers.

    works: iterable of work identifiers; each one is passed unchanged to
        `train_model`.

    Appends a start marker to progress.txt before the first model and a finish
    marker after the last one. Returns None.
    """
    progress_log = '/home/ubuntu/nlp/course-nlp/progress.txt'

    # Write out update
    with open(progress_log, 'a+') as f:
        f.write('**STARTING TRAINING**\n')

    # Train len(works) models
    for work in works:
        train_model(work)

    # Write out update. The log is opened in append mode, so the trailing
    # newline keeps the next run's start marker off this line.
    with open(progress_log, 'a+') as f:
        f.write('**FINISHED TRAINING**\n')

In [78]:
def train_model(work):
    """
    Fine-tune an AWD_LSTM language model on one work and save its weights.

    work: string such as 'dostoevsky/the_idiot'. It is appended to the
        literature folder path to locate the text, and its second
        '/'-separated component is used as the name of the saved weights.

    Appends the start time, end time and duration in minutes to progress.txt.
    Returns None.
    """
    progress_log = '/home/ubuntu/nlp/course-nlp/progress.txt'

    # Write out update
    time_start = datetime.datetime.now()
    with open(progress_log, 'a+') as f:
        f.write(f'Training {work} at {time_start} \n')

    # Path to the folder holding the .txt file of the novel on disk
    path = Path('/home/ubuntu/nlp/course-nlp/literature/' + work)

    # Data: hold out 10% at random (fixed seed) and label for language modelling.
    # NOTE(review): the recorded training output shows valid_loss as #na#, which
    # suggests the validation split ends up empty — confirm.
    data = (TextList.from_folder(path)
        .split_by_rand_pct(0.1, seed=42)
        .label_for_lm()
        .databunch(bs=48, num_workers=1))

    # Pretrained Wiki Model
    lm = language_model_learner(data, AWD_LSTM, drop_mult=0.3)

    # Learning Rate via lr finder
    lr = 1e-3

    # For efficiency
    lm.to_fp16()

    # Train last layers
    lm.fit_one_cycle(1, lr*10, moms=(0.8,0.7))

    # Train all layers for 10 epochs
    lm.unfreeze()
    lm.fit_one_cycle(10, lr, moms=(0.8,0.7))

    # Save weights under the title only (e.g. 'the_idiot'), not the full
    # '<author>/<title>' string, so no nested folder is needed under models/.
    lm.save(work.split('/')[1])

    # Write out update. The duration is computed from the two timestamps taken
    # in this function.
    time_end = datetime.datetime.now()
    with open(progress_log, 'a+') as f:
        f.write(f'Trained {work} at {time_end}\n')
        f.write(f'Took {(time_end - time_start).total_seconds() / 60 } minutes\n')
        f.write(f'....................\n')
        
        

The history saving thread hit an unexpected error (OperationalError('database or disk is full',)).History will not be written to the database.


### Predict

#### Load weights 

In [None]:
# Load the weights written by train_model, which saves them under the title
# only (the second '/'-separated part of `work`, e.g. 'the_idiot').
# NOTE(review): `lm` is not created in any visible cell (train_model keeps its
# learner local) — confirm a learner built on this work's data exists first.
work = 'dostoevsky/the_idiot'
lm.load(work.split('/')[1]);

#### Helper Functions

In [None]:
def generate_sentence(model, intro, n_words, temperature):
    """
    Generate text from `model` starting at `intro`, with newlines removed.

    model: object exposing `predict(text, n_words, temperature=...)`.
    intro: starting text handed to `model.predict`.
    n_words: number of words requested from `model.predict`.
    temperature: sampling temperature forwarded to `model.predict`.
        Lowering temperature will make the texts less randomized.

    Returns the predicted string with every newline character removed.
    """
    # Forward the caller's temperature; it was previously pinned to 0.90.
    sentence = model.predict(intro, n_words, temperature=temperature)
    return sentence.replace('\n', '')

In [None]:
def last_period(sentence):
    """
    Find where the last '.' in `sentence` ends.

    Returns the index one past the final '.', usable directly as a slice end,
    or None when `sentence` contains no '.'.
    """
    dot_at = sentence.rfind('.')
    if dot_at == -1:
        return None
    return dot_at + 1

In [None]:
def clean_sentence(sentence):
    """
    Tidy the spacing around punctuation and clip to the last full sentence.

    Removes the spaces left around punctuation and quotes, collapses runs of
    two or three spaces, rejoins " n’t" to the preceding word, and capitalises
    a standalone ' i '. The result is then cut just after its last '.'; text
    without any '.' is returned unclipped.
    """
    # Applied strictly in this order; each rule sees the output of the previous one.
    substitutions = (
        (' ,', ','),
        (' ?', '?'),
        (' .', '.'),
        ('( ', '('),
        (' )', ')'),
        ('“ ', '“'),
        (" ’", "’"),
        (" '", "'"),
        (" :", ":"),
        (' ”', '”'),
        ('   ', ' '),
        ('  ', ' '),
        (" n’t", "n’t"),
        (' i ', ' I '),
    )
    tidied = sentence
    for old, new in substitutions:
        tidied = tidied.replace(old, new)

    # clip to full sentence
    return tidied[:last_period(tidied)]

#### Generate a batch of sentences & compute the sentiment score of each

In [None]:
def get_scored_sentences(n, model, intro, words, temperature):
    """
    Generate `n` sentences and rank them by VADER compound sentiment.

    n: number of sentences to generate.
    model, intro, words, temperature: forwarded to `generate_sentence` as its
        model, intro, n_words and temperature arguments.

    Returns a DataFrame with columns 'sentence' and 'sentiment', sorted from
    highest to lowest sentiment and re-indexed from 0.
    """
    sid = SentimentIntensityAnalyzer()
    rows = []
    for _ in range(n):
        # `generate_sentence` is the helper defined in this notebook; there is
        # no `generate` function.
        sentence = clean_sentence(generate_sentence(model, intro, words, temperature))
        rows.append({'sentence': sentence,
                     'sentiment': sid.polarity_scores(sentence)['compound']})

    # Build the frame once instead of growing it row by row.
    results = pd.DataFrame(rows, columns=['sentence', 'sentiment'])
    return results.sort_values('sentiment', ascending=False).reset_index(drop=True)

### Results

In [None]:
# Score 50 generated continuations of the same prompt for each author's model.
# NOTE(review): lm_tolstoy and lm_dostoevsky are not defined in the visible
# cells — confirm where these learners are created.
results_tolstoy = get_scored_sentences(
    n=50, model=lm_tolstoy, intro='In the afternoon', words=30, temperature=0.5)
results_dostoevsky = get_scored_sentences(
    n=50, model=lm_dostoevsky, intro='In the afternoon', words=30, temperature=0.5)

#### Overall sentiment of Crime and Punishment

In [None]:
# Read the whole text file into one string.
with open('/home/ubuntu/nlp/course-nlp/max_data/literature_testing/crime_and_punishment.txt') as novel_file:
    text = novel_file.read()

In [None]:
# `sid` only exists as a local inside get_scored_sentences, so create an
# analyzer at notebook scope before scoring the full text.
sid = SentimentIntensityAnalyzer()
sid.polarity_scores(text)