In [1]:
from pathlib import Path

import pandas as pd
from utilities import text_functions as tf

In [2]:
df = pd.read_csv('meta.csv', index_col=0)

# Count the rows for each review_type and list the types from most to least
# frequent. The dict-of-lists aggregation yields two-level column labels, so
# the sort key is the ('review_type', 'count') tuple.
type_summary = (
    df.groupby('review_type')
    .agg({'review_type': ['count']})
    .reset_index()
    .sort_values(('review_type', 'count'), ascending=False)
)
type_summary

Unnamed: 0_level_0,review_type,review_type
Unnamed: 0_level_1,Unnamed: 1_level_1,count
5,single_focus,1093
2,multi,487
4,not_review,247
1,cluster,244
0,brief,82
3,no_pdf,2


In [3]:
df_single = pd.read_csv('single_author_meta.csv', index_col=0)

# Count the rows for each genre_parsed value and list the genres from most to
# least frequent. The dict-of-lists aggregation yields two-level column labels,
# so the sort key is the ('genre_parsed', 'count') tuple.
df_single_genre_summary = (
    df_single.groupby('genre_parsed')
    .agg({'genre_parsed': ['count']})
    .reset_index()
    .sort_values(('genre_parsed', 'count'), ascending=False)
)
df_single_genre_summary

Unnamed: 0_level_0,genre_parsed,genre_parsed
Unnamed: 0_level_1,Unnamed: 1_level_1,count
2,nonfiction,801
1,fiction,226
3,poetry,34
0,drama,18


In [4]:
# Read the text of every review listed in df_single from txt/<record_id>.txt,
# keeping the same order as the rows of df_single so that positions in
# reviews_all line up with df_single['record_id'].
reviews_all = []
for record_id in df_single['record_id']:
    # The `with` block closes the file on exit, so no explicit close() is needed.
    # NOTE(review): this relies on the platform default text encoding; pass an
    # explicit encoding= once the encoding of the txt files is confirmed.
    with open(f'txt/{record_id}.txt') as review_file:
        reviews_all.append(review_file.read())
len(reviews_all)

1079

In [5]:
# Run every review through tf.preprocess_text; the result for review k stays
# at position k in each of the three lists built here.
processed_all = list(map(tf.preprocess_text, reviews_all))

# Author candidates for each processed review, using the shared tf.title_list.
candidates_all = [
    tf.make_author_candidates(text, tf.title_list) for text in processed_all
]

# Names associated with each review's own candidates (fuzzy matching disabled).
associated_all = [
    tf.associated_names(text, candidates, tf.title_list, fuzzy=False)
    for text, candidates in zip(processed_all, candidates_all)
]

In [6]:
# Spot-check: display the associated names extracted for the first review.
associated_all[0]

[['Lucien', 'Carr'], ['Ldclen', 'Carr'], ['Carr']]

In [7]:
# Flatten the per-review extraction results into long-form tables with one row
# per (record_id, entity) pair, then save each table as a CSV file.

# The three sequences are matched by position, so they must have equal length.
# A mismatch means the kernel holds stale results from an earlier run.
assert len(candidates_all) == len(associated_all) == len(df_single), (
    'extraction results are out of sync with df_single; re-run the cells above'
)

ac_rows = []
an_rows = []

for record_id, candidates, names in zip(
    df_single['record_id'], candidates_all, associated_all
):
    ac_rows.extend([record_id, candidate] for candidate in candidates)
    # NOTE(review): judging by the displayed sample of associated_all[0], each
    # name is itself a list of tokens, so the 'entity' cell is written to CSV
    # as that list's text representation. Confirm this is the intended format.
    an_rows.extend([record_id, name] for name in names)

df_ac = pd.DataFrame.from_records(ac_rows, columns=['record_id', 'entity'])
df_an = pd.DataFrame.from_records(an_rows, columns=['record_id', 'entity'])

# to_csv does not create missing directories, so make sure the output folder
# exists; otherwise a fresh checkout fails on the first write.
output_dir = Path('extracted_features')
output_dir.mkdir(parents=True, exist_ok=True)

df_ac.to_csv(output_dir / 'author_candidates.csv')
df_an.to_csv(output_dir / 'associated_names.csv')