### Setup & Imports

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [125]:
from fastai import *
from fastai.text import *
import datetime

from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk.data

import pandas as pd
import numpy as np

### Data

In [3]:
works = ['/home/ubuntu/nlp/nlp_lit/literature/dostoevsky/the_idiot/', 
         '/home/ubuntu/nlp/nlp_lit/literature/dostoevsky/the_brothers_karamazov/', 
         '/home/ubuntu/nlp/nlp_lit/literature/dostoevsky/crime_and_punishment/', 
        '/home/ubuntu/nlp/nlp_lit/literature/tolstoy/war_and_peace/', 
         '/home/ubuntu/nlp/nlp_lit/literature/tolstoy/anna_karenina/']

### Overall sentiment

Results from `overall_sentiment.py`

```
The Idiot
    {'neg': 0.099, 'neu': 0.769, 'pos': 0.132, 'compound': 1.0}

The Brothers Karamazov
    {'neg': 0.116, 'neu': 0.757, 'pos': 0.128, 'compound': 1.0}

Crime and Punishment
    {'neg': 0.115, 'neu': 0.778, 'pos': 0.107, 'compound': -1.0}

War and Peace
    {'neg': 0.092, 'neu': 0.796, 'pos': 0.111, 'compound': 1.0}

Anna Karenina
    {'neg': 0.088, 'neu': 0.789, 'pos': 0.123, 'compound': 1.0}
```

### Train Model

In [21]:
def train_model(path):
    
    # Work name
    name = path.split('/')[-2]
    
    time_start = datetime.datetime.now()
    
    # Data
    data = (TextList.from_folder(Path(path))
        .split_by_rand_pct(0.1, seed=42)
        .label_for_lm()
        .databunch(bs=48))
    
    # Pretrained Wiki Model 
    lm = language_model_learner(data, AWD_LSTM, drop_mult=0.3)
    
    # Learning Rate via lr finder
    lr = 1e-3
    
    # For training efficiency
    lm.to_fp16();
    
    # Train last layers with high learning rate
    lm.fit_one_cycle(1, lr*10, moms=(0.8,0.7))
    
    # Train all layers for 10 epochs
    lm.unfreeze()
    lm.fit_one_cycle(10, lr, moms=(0.8,0.7))
    
    # Save weights
    lm.save('weights')
    
    # Write out update
    time_end = datetime.datetime.now()
    print(f'Took {(time_end - time_start).total_seconds() / 60 } minutes\n')
    print(f'DONE')
        
    return lm

### War & Peace

#### Train

In [35]:
path = '/home/ubuntu/nlp/nlp_lit/literature/tolstoy/war_and_peace/'
lm = train_model(path)

epoch,train_loss,valid_loss,accuracy,time
0,3.725107,#na#,01:44,


epoch,train_loss,valid_loss,accuracy,time
0,3.476871,#na#,02:14,
1,3.368428,#na#,02:15,
2,3.162788,#na#,02:15,
3,2.905936,#na#,02:15,
4,2.659311,#na#,02:15,
5,2.426557,#na#,02:15,
6,2.214016,#na#,02:15,
7,2.102007,#na#,02:16,
8,1.994689,#na#,02:15,
9,1.949026,#na#,02:15,


Trained war_and_peace at 2019-06-21 22:51:30.866527



NameError: name 'time2' is not defined

#### Generate Sentences 

### Predict

#### Load weights 

In [38]:
lm.load('weights');

#### Helper Functions

In [28]:
def generate_sentence(model, intro, n_words, temperature): 
    """
    Lowering temperature will make the texts less randomized.
    """
    sentence = model.predict(intro, n_words, temperature=temperature)
    return sentence.replace('\n', '')

In [12]:
def last_period(sentence):
    for i, letter in enumerate(reversed(sentence)):
        if letter == '.':
            return len(sentence) - i

In [13]:
def clean_sentence(sentence):
    sentence = sentence.replace(' ,', ',')
    sentence = sentence.replace(' ?', '?')
    sentence = sentence.replace(' .', '.')
    sentence = sentence.replace('( ', '(')
    sentence = sentence.replace(' )', ')')
    sentence = sentence.replace('“ ', '“')
    sentence = sentence.replace(" ’", "’")
    sentence = sentence.replace(" '", "'")    
    sentence = sentence.replace(" :", ":")
    sentence = sentence.replace(' ”', '”')
    sentence = sentence.replace('   ', ' ')
    sentence = sentence.replace('  ', ' ')
    sentence = sentence.replace(" n’t", "n’t")
    sentence = sentence.replace(' i ', ' I ')
    
    # clip to full sentence 
    return sentence[:last_period(sentence)]

#### Generate a bunch of sentences & calculate the score of each

In [14]:
def get_scored_sentences(n, model, intro, words, temperature):
    results = pd.DataFrame(columns=['sentence', 'sentiment'])
    sid = SentimentIntensityAnalyzer()
    for i in range(n):
        sentence = clean_sentence(generate_sentence(model, intro, words, temperature))
        sentiment = sid.polarity_scores(sentence)['compound']
        results.loc[i] = [sentence, sentiment]
    results.sort_values('sentiment', ascending=False, inplace=True)
    results = results.reset_index().drop('index', axis=1)
    return results

#### Generated Sentiments 

In [115]:
results = {'anna_karenina':pd.read_csv('anna_karenina.csv'), 
          'war_and_peace':pd.read_csv('war_and_peace.csv'),
          'the_brothers_karamazov':pd.read_csv('the_brothers_karamazov.csv'), 
          'crime_and_punishment':pd.read_csv('crime_and_punishment.csv'), 
          'the_idiot':pd.read_csv('the_idiot.csv')}

#### Original Sentiments

In [116]:
def get_mean_sentiment_score(path):
    
    # Load tokenizer details
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    
    # Create analyzer object
    analyzer = SentimentIntensityAnalyzer()
    
    # Read in data
    with open(path, 'r') as f:
        data = f.read()
        
    # Break up text into sentences
    sentences = tokenizer.tokenize(data)
    
    # Calculate average sentiment score
    total_score = 0.0
    for sentence in sentences: 
        total_score += analyzer.polarity_scores(sentence)['compound']
    
    return total_score / len(sentences)

In [117]:
works = ['/home/ubuntu/nlp/nlp_lit/literature/tolstoy/anna_karenina/anna_karenina.txt', 
         '/home/ubuntu/nlp/nlp_lit/literature/tolstoy/war_and_peace/war_and_peace.txt',
         '/home/ubuntu/nlp/nlp_lit/literature/dostoevsky/crime_and_punishment/crime_and_punishment.txt',
         '/home/ubuntu/nlp/nlp_lit/literature/dostoevsky/the_brothers_karamazov/the_brothers_karamazov.txt',
         '/home/ubuntu/nlp/nlp_lit/literature/dostoevsky/the_idiot/the_idiot.txt']

## Real vs. Fake

In [124]:
for path in works: 
    print(f'Work:          {path.split("/")[-2]}')
    print(f'Gen Score:     {np.round(get_mean_sentiment_score(path), 4)}')
    print(f'Real Score:    {np.round(results[path.split("/")[-2]]["sentiment"].mean(), 4)}')
    print()

Work:          anna_karenina
Gen Score:     0.0832
Real Score:    0.0909

Work:          war_and_peace
Gen Score:     0.0498
Real Score:    0.0029

Work:          crime_and_punishment
Gen Score:     -0.0125
Real Score:    -0.0087

Work:          the_brothers_karamazov
Gen Score:     0.0106
Real Score:    0.0036

Work:          the_idiot
Gen Score:     0.0594
Real Score:    -0.0009



In [396]:
sentences = list(results['the_idiot']['sentence'])

In [397]:
i = -1

In [424]:
i-=1

print(sentences[i])

In the afternoon and evening it was as usual that a fellow for one of those old miserable women of late century had met such a bad business husband as he


In [415]:
print(sentences[i+3])

In the afternoon one of his classes was killed by the indignant traveller Ismailofsky.
