In [28]:
import math, os
import pandas as pd 
import numpy as np
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from utilities import text_functions as tf
from utilities import scikit_functions as sf

In [2]:
# Load the review metadata, using the first CSV column as the index.
df_meta = pd.read_csv('meta.csv', index_col=0)

# Count the non-null rows for each review_type. Passing a list of aggregations
# yields two-level column labels, so the count column is ('review_type', 'count').
review_type_counts = df_meta.groupby('review_type').agg({'review_type': ['count']})

# Turn the group label back into a column and list the most frequent type first.
type_summary = (
    review_type_counts
    .reset_index(drop=False)
    .sort_values(by=('review_type', 'count'), ascending=False)
)
type_summary

Unnamed: 0_level_0,review_type,review_type
Unnamed: 0_level_1,Unnamed: 1_level_1,count
5,single_focus,1093
2,multi,487
4,not_review,247
1,cluster,244
0,brief,82
3,no_pdf,2


In [3]:
# Load the single-author review metadata, using the first CSV column as the index.
df_single = pd.read_csv('single_author_meta.csv', index_col=0)

# Count the non-null rows for each genre_parsed value. Passing a list of aggregations
# yields two-level column labels, so the count column is ('genre_parsed', 'count').
genre_counts = df_single.groupby('genre_parsed').agg({'genre_parsed': ['count']})

# Turn the group label back into a column and list the most frequent genre first.
df_single_genre_summary = (
    genre_counts
    .reset_index(drop=False)
    .sort_values(by=('genre_parsed', 'count'), ascending=False)
)
df_single_genre_summary

Unnamed: 0_level_0,genre_parsed,genre_parsed
Unnamed: 0_level_1,Unnamed: 1_level_1,count
2,nonfiction,801
1,fiction,226
3,poetry,34
0,drama,18


In [5]:
# Read the text of every review listed in df_single, in row order.
# Each review is read from txt/<record_id>.txt; a missing file raises and stops the cell.
reviews_all = []
for record_id in df_single['record_id']:
    # The with-block closes the file on exit, so no explicit close() is needed.
    # NOTE(review): no encoding is passed, so the platform default is used —
    # confirm the encoding of the txt files and pass it explicitly if it is known.
    with open(f'txt/{record_id}.txt') as review_file:
        reviews_all.append(review_file.read())

# Display how many reviews were loaded.
len(reviews_all)

1079

In [6]:
# Map each record_id in df_single to the preprocessed form of its review text.
# reviews_all is indexed by position, so it must be in the same order as
# df_single['record_id'].
# NOTE(review): tf.preprocess_text presumably returns a sequence of tokens — confirm.
review_store = {
    record_id: tf.preprocess_text(reviews_all[position])
    for position, record_id in enumerate(df_single['record_id'])
}

In [7]:
# Lower-cased count stores, keyed by an integer 0..4.
# Key 0 is built here: for every entry of review_store, a Counter of its
# lower-cased items.
ngram_stores_lower = {
    0: {
        record_id: Counter(token.lower() for token in tokens)
        for record_id, tokens in review_store.items()
    }
}

# Keys 1..4 are delegated to the project helper, called with the key as its
# second argument and lower=True.
# NOTE(review): presumably that argument is the n-gram size — confirm in text_functions.
for size in range(1, 5):
    ngram_stores_lower[size] = tf.make_ngram_store(review_store, size, lower=True)

In [27]:
# Write every count mapping in ngram_stores_lower to its own CSV file:
#     extracted_features/ngrams/<store key>/<document key>.csv
# Each file has the columns 'ngram' and 'count', plus the default integer index
# that to_csv writes as the first column.
base = 'extracted_features/ngrams'

for e in range(0, 5):
    out_dir = f'{base}/{e}'
    # exist_ok=True creates `base` and the per-key subfolder in one call and is a
    # no-op when they already exist, so the cell can be re-run safely.
    os.makedirs(out_dir, exist_ok=True)

    for i, counts in ngram_stores_lower[e].items():
        this_csv = f'{out_dir}/{i}.csv'
        # from_records is a classmethod, so call it on the class rather than on a
        # throwaway empty DataFrame instance.
        df = pd.DataFrame.from_records(
            [[k, v] for k, v in counts.items()],
            columns=['ngram', 'count'],
        )
        df.to_csv(this_csv)
            