# Metadata

```
Course:   DS 5001
Module:   12 Lab
Topic:    Save Novels with Emotions 
Author:   R.C. Alvarado

Purpose:  Collect all the novel collections we have and combine each novel with the combined sentiment lexicon table we created last time.
```

# Set Up

In [1]:
# Relative path to the directory that holds every input and output data file.
data_home = '../data'

In [2]:
# Per-corpus settings, keyed by the prefix used in each corpus's file names:
#   OHCO   - columns of the token table that are used as its index
#   LIB    - file-name suffix of the library table
#   TOKENS - file-name suffix of the token table
#   path   - sub-directory of `data_home` where the corpus files live
config = {
    'novels': dict(
        OHCO=['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num'],
        LIB='LIB',
        TOKENS='CORPUS',
        path='novels',
    ),
    'austen-melville': dict(
        OHCO=['book_id', 'chap_id', 'para_num', 'sent_num', 'token_num'],
        LIB='LIB2',
        TOKENS='CORPUS2',
        path='output',
    ),
}

In [3]:
# Columns of each token table that are kept when the corpora are loaded.
token_cols = 'pos term_str'.split()

# Location of the combined sentiment lexicon CSV.
salex_csv = f'{data_home}/salex/salex_combo.csv'

In [4]:
import os

import pandas as pd

# Get Lexicons

We created the combined sentiment lexicon (`salex_combo.csv`) in last week's lab.

In [5]:
# Load the combined lexicon indexed by term, then set NRC polarity to
# (positive - negative). `assign` overwrites the column if the CSV already
# has one with that name, otherwise it appends it.
SALEX = (
    pd.read_csv(salex_csv)
    .set_index('term_str')
    .assign(nrc_polarity=lambda lex: lex['nrc_positive'] - lex['nrc_negative'])
)

In [6]:
# List the lexicon's feature columns.
list(SALEX.columns)

['nrc_anger',
 'nrc_anticipation',
 'nrc_disgust',
 'nrc_fear',
 'nrc_joy',
 'nrc_negative',
 'nrc_positive',
 'nrc_sadness',
 'nrc_surprise',
 'nrc_trust',
 'nrc_polarity',
 'bing_negative',
 'bing_positive',
 'bing_sentiment',
 'syu_sentiment',
 'gi_sentiment',
 'labmt_happiness',
 'labmt_z']

# Import Texts

We import two sets of pre-processed novels and combine them.

In [7]:
TOKENS = {}  # corpus prefix -> token DataFrame indexed by that corpus's OHCO
LIB = {}     # corpus prefix -> library DataFrame exactly as read from disk
for prefix, cfg in config.items():
    path = cfg['path']

    # Token table: index by the OHCO columns and keep only `token_cols`.
    token_file = f"{data_home}/{path}/{prefix}-{cfg['TOKENS']}.csv"
    print(token_file)
    TOKENS[prefix] = (
        pd.read_csv(token_file)
        .set_index(cfg['OHCO'])
        .loc[:, token_cols]
    )

    # Library table: loaded unchanged here.
    lib_file = f"{data_home}/{path}/{prefix}-{cfg['LIB']}.csv"
    LIB[prefix] = pd.read_csv(lib_file)

../data/novels/novels-CORPUS.csv
../data/output/austen-melville-CORPUS2.csv


# Standardize the two `LIB` tables

In [8]:
# Reduce the 'novels' library to the shared schema (author_id, title, corpus),
# indexed by book_id. The title is the upper-cased book_id.
LIB['novels'] = (
    LIB['novels']
    .assign(title=lambda lib: lib['book_id'].str.upper())
    .set_index('book_id')
    .loc[:, ['author_id', 'title']]
    .assign(corpus='novels')
)

In [13]:
# LIB['novels']

In [10]:
# Reduce the 'austen-melville' library to the shared schema
# (title, corpus, author_id), indexed by book_id.
#
# author_id is the text before the first comma of `author`, lower-cased.
# It is derived with vectorised `.str` accessors rather than a row-wise
# `apply`, so a missing author becomes NaN instead of raising
# AttributeError. The whole transformation is one chain, so no column is
# ever assigned onto an intermediate slice.
LIB['austen-melville'] = (
    LIB['austen-melville']
    .set_index('book_id')
    .loc[:, ['author', 'title']]
    .assign(
        corpus='austen-melville',
        author_id=lambda lib: lib['author'].str.split(',').str[0].str.lower(),
    )
    .drop(columns='author')  # only the derived author_id is kept
)

In [14]:
# LIB['austen-melville']

## Concat into one LIB

In [15]:
# Stack the per-corpus library tables, in `config` order, into one table.
lib_frames = [LIB[name] for name in config]
LIB_ALL = pd.concat(lib_frames)

In [16]:
# Spot-check ten randomly chosen rows of the combined library.
LIB_ALL.sample(n=10)

Unnamed: 0_level_0,author_id,title,corpus
book_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
141,austen,MANSFIELD PARK,austen-melville
946,austen,LADY SUSAN,austen-melville
15422,melville,ISRAEL POTTER HIS FI,austen-melville
usher,poe,USHER,novels
frankenstein,shelley,FRANKENSTEIN,novels
adventures,doyle,ADVENTURES,novels
scarlet,doyle,SCARLET,novels
pitandpendulum,poe,PITANDPENDULUM,novels
161,austen,SENSE AND SENSIBILIT,austen-melville
158,austen,EMMA,austen-melville


## Combine into one TOKENS

In [17]:
# Stack the per-corpus token tables, in `config` order, into one table.
token_frames = [TOKENS[name] for name in config]
TOKENS_ALL = pd.concat(token_frames)

In [18]:
# Display the combined token table (pandas abbreviates long frames).
TOKENS_ALL

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,pos,term_str
book_id,chap_id,para_num,sent_num,token_num,Unnamed: 5_level_1,Unnamed: 6_level_1
secretadversary,1,0,1,0,DT,the
secretadversary,1,0,1,1,NNP,young
secretadversary,1,0,1,2,NNP,adventurers
secretadversary,1,0,1,3,NNP,ltd
secretadversary,1,1,0,0,JJ,tommy
...,...,...,...,...,...,...
34970,114,24,0,6,DT,the
34970,114,24,0,7,NNP,ambiguities
34970,114,24,0,8,IN,by
34970,114,24,0,9,NNP,herman


## Merge with SALEX

In [19]:
# Left-join the lexicon onto the tokens by term. Every token row is kept,
# and terms absent from the lexicon get NaN in the lexicon columns.
# reset_index() first turns the OHCO index levels into ordinary columns.
TOKENS_SENT = pd.merge(
    TOKENS_ALL.reset_index(),
    SALEX,
    how='left',
    on='term_str',
)

In [20]:
# Spot-check ten randomly chosen rows of the merged token table.
TOKENS_SENT.sample(n=10)

Unnamed: 0,book_id,chap_id,para_num,sent_num,token_num,pos,term_str,nrc_anger,nrc_anticipation,nrc_disgust,...,nrc_surprise,nrc_trust,nrc_polarity,bing_negative,bing_positive,bing_sentiment,syu_sentiment,gi_sentiment,labmt_happiness,labmt_z
2144262,1212,17,0,5,67,NN,intimacy,,,,...,,,,0.0,1.0,1.0,0.8,1.0,,
1664966,141,2,2,0,24,JJS,least,,,,...,,,,,,,,,4.0,-2.267613
3238649,21816,48,32,3,4,JJ,such,,,,...,,,,,,,,,5.16,-1.198395
2395211,2701,4,17,0,14,NNS,gates,,,,...,,,,,,,,,5.28,-1.087786
1793842,141,39,11,0,47,DT,no,,,,...,,,,,,,,,3.48,-2.746918
2757020,8118,88,23,2,9,IN,than,,,,...,,,,,,,,,4.74,-1.585526
754698,monk,5,14,0,6,NN,crimson,,,,...,,,,,,,,,6.0,-0.424133
2213677,1342,31,23,1,9,NN,trouble,,,,...,,,,1.0,0.0,-1.0,-0.5,-1.0,2.78,-3.392136
1599695,121,9,4,0,2,VBD,said,,,,...,,,,,,,,,5.38,-0.995612
1597496,121,8,10,1,11,CC,and,,,,...,,,,,,,,,5.22,-1.143091


## Export to Files

In [23]:
# Create the output directory if it is missing, so the export also works on
# a fresh checkout, then save the combined library table. The book_id index
# is written as the first CSV column.
os.makedirs(f"{data_home}/combo", exist_ok=True)
LIB_ALL.to_csv(f"{data_home}/combo/combo-LIB.csv")

In [24]:
import re  # NOTE(review): not used anywhere in this cell

# Write one CSV per book, named "<corpus>-<book_id>.csv".
# `columns[1:]` skips the first column of TOKENS_SENT, which is `book_id`
# (the first OHCO level after reset_index), so each file omits it.
cols = TOKENS_SENT.columns[1:]
for book_id in LIB_ALL.index:
    corpus = LIB_ALL.loc[book_id, 'corpus']
    filename = corpus + '-' + str(book_id) + '.csv'
    print(filename)
    in_book = TOKENS_SENT.book_id == book_id
    TOKENS_SENT.loc[in_book, cols].to_csv(f"{data_home}/combo/{filename}", index=False)
print("Done.")

novels-secretadversary.csv
novels-styles.csv
novels-moonstone.csv
novels-adventures.csv
novels-baskervilles.csv
novels-scarlet.csv
novels-signoffour.csv
novels-marieroget.csv
novels-ruemorgue.csv
novels-northangerabbey.csv
novels-christmascarole.csv
novels-monk.csv
novels-pitandpendulum.csv
novels-reddeath.csv
novels-usher.csv
novels-udolpho.csv
novels-oldenglishbaron.csv
novels-frankenstein.csv
novels-dracula.csv
novels-castleofotranto.csv
austen-melville-105.csv
austen-melville-121.csv
austen-melville-141.csv
austen-melville-158.csv
austen-melville-161.csv
austen-melville-946.csv
austen-melville-1212.csv
austen-melville-1342.csv
austen-melville-1900.csv
austen-melville-2701.csv
austen-melville-4045.csv
austen-melville-8118.csv
austen-melville-10712.csv
austen-melville-13720.csv
austen-melville-13721.csv
austen-melville-15422.csv
austen-melville-21816.csv
austen-melville-34970.csv
Done.
