In [1]:
import pandas as pd
%matplotlib inline

import matplotlib
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Read the five input CSV files, in the order listed, into one DataFrame each.
stories, chapters, reviews, reviews_received, reviews_sent = (
    pd.read_csv(csv_name)
    for csv_name in (
        'story_10k_authors.csv',
        'chapter_10k_authors.csv',
        'review_10k_authors.csv',
        'review_10k_authors_received.csv',
        'review_10k_authors_sent.csv',
    )
)

In [3]:
# Load the full story table, skipping malformed rows (wrong number of fields).
#
# `error_bad_lines` was deprecated in pandas 1.3 and removed in pandas 2.0, so
# prefer its replacement `on_bad_lines` and only fall back to the legacy keyword
# when the installed pandas does not know the new one. 'warn' keeps the original
# behaviour of reporting every skipped line.
try:
    story_df = pd.read_csv('story.csv', on_bad_lines='warn')
except TypeError:
    # pandas < 1.3: read_csv() rejects the unknown `on_bad_lines` keyword.
    story_df = pd.read_csv('story.csv', error_bad_lines=False)

# NOTE(review): read_csv consumes the first line of the file as a header, and the
# header is then replaced below. If story.csv has no header row, the first record
# is lost -- confirm against the file.
# The primary key is named 'story_id' directly so it matches the other tables.
story_df.columns = ['story_id', 'fandom_id', 'user_id', 'rating_id', 'language_id', 'ff_story_id',
                    'title', 'chapters', 'words', 'reviews', 'favorites', 'followers',
                    'date_published', 'date_updated', 'is_complete']
print(story_df.shape)

b'Skipping line 846594: expected 15 fields, saw 16\n'
b'Skipping line 921885: expected 15 fields, saw 17\nSkipping line 944703: expected 15 fields, saw 16\n'
b'Skipping line 1432931: expected 15 fields, saw 16\n'
b'Skipping line 2681587: expected 15 fields, saw 16\n'
b'Skipping line 2714733: expected 15 fields, saw 16\nSkipping line 2722010: expected 15 fields, saw 17\n'
b'Skipping line 2873331: expected 15 fields, saw 16\nSkipping line 2877823: expected 15 fields, saw 16\n'
b'Skipping line 3163468: expected 15 fields, saw 16\n'
b'Skipping line 4059418: expected 15 fields, saw 16\n'
b'Skipping line 4106282: expected 15 fields, saw 16\nSkipping line 4108099: expected 15 fields, saw 16\n'
b'Skipping line 4368618: expected 15 fields, saw 16\nSkipping line 4369413: expected 15 fields, saw 17\n'
b'Skipping line 4951246: expected 15 fields, saw 16\nSkipping line 4978213: expected 15 fields, saw 16\n'
b'Skipping line 5660786: expected 15 fields, saw 17\n'
b'Skipping line 6070843: expected 15 

(6828922, 15)


In [4]:
# Attach each story's update date and chapter count from the full story table.
# Left join: every row of `stories` is kept even without a match in `story_df`.
story_update_info = story_df[['story_id', 'date_updated', 'chapters']]
stories = pd.merge(stories, story_update_info, how='left', on='story_id')

In [5]:
# Stories joined with the reviews they received; the left join keeps stories
# that have no matching review rows.
srr = pd.merge(stories, reviews_received, how='left', on=['author_id', 'story_id'])

In [6]:
# Keep the existing chapter date under an explicit "_old" name and add an empty
# "_new" column that the cells below fill in.
chapters.rename(
    columns={'date_published': 'chapter_predicted_date_published_old'},
    inplace=True,
)
chapters['chapter_predicted_date_published_new'] = None

In [7]:
# One row per chapter, enriched with story-level fields. This is an inner join
# (the merge default), so chapters whose story_id is absent from `stories` drop out.
story_level_fields = ['story_id', 'fandom_id', 'date_published', 'is_complete',
                      'date_updated', 'chapters']
cbs = pd.merge(chapters, stories[story_level_fields], on='story_id')

In [8]:
# Rules
# if # of chapter == 1 then date = date published 
# if # of chapter > 1
#     then first chapter date = date published 
#     last chapter date = date updated - edge case: the story was never updated (date_updated == -1)
# 

In [9]:
# Apply the two anchor rules to the prediction column.
is_first_chapter = cbs['chapter_id'] == 1
is_later_chapter = cbs['chapter_id'] != 1
is_last_chapter = cbs['chapter_id'] == cbs['chapters']
story_was_updated = cbs['date_updated'] != -1  # -1 is treated as "no update date"

# Last chapter of a multi-chapter, updated story -> the story's update date.
cbs.loc[is_last_chapter & story_was_updated & is_later_chapter,
        'chapter_predicted_date_published_new'] = cbs['date_updated']
# First chapter -> the story's publication date.
cbs.loc[is_first_chapter, 'chapter_predicted_date_published_new'] = cbs['date_published']


In [10]:
# Share of stories (relative to all rows of `stories`) whose last chapter still
# has no predicted date after the anchor rules.
undated_last_chapters = cbs[(cbs['chapters'] == cbs['chapter_id'])
                            & (cbs['chapter_predicted_date_published_new'].isnull())]
undated_story_pct = undated_last_chapters['story_id'].nunique() / len(stories) * 100
print('Number of stories with more than one chapter and no update date =',
      undated_story_pct, '%')


Number of stories with more than one chapter and no update date = 3.0430385436565905 %


In [11]:
# Earliest review date received for every (author, story, chapter) combination,
# flattened back into ordinary columns.
chapter_key = ['author_id', 'story_id', 'chapter_id']
earliest_review = reviews_received.groupby(chapter_key)['review_date'].min()
min_review_date_per_chapter = earliest_review.reset_index()



In [12]:
# Add each chapter's earliest review date; chapters without any review keep a
# missing `review_date` because of the left join.
cbs_v2 = pd.merge(
    cbs,
    min_review_date_per_chapter,
    how='left',
    on=['author_id', 'story_id', 'chapter_id'],
)

In [13]:
# First-review date of the following / preceding chapter of the same story.
#
# shift() works on row position inside each group, so "next" and "previous" only
# mean "chapter n+1" and "chapter n-1" when the rows of a story are in chapter
# order. Nothing upstream guarantees that order, so the shift is computed on a
# view sorted by chapter_id. A stable sort keeps the original relative order of
# any tied rows. Assigning back to cbs_v2 aligns on the index, so the row order
# of cbs_v2 itself is left untouched.
chapters_in_order = cbs_v2.sort_values(['author_id', 'story_id', 'chapter_id'], kind='mergesort')
review_date_by_story = chapters_in_order.groupby(['author_id', 'story_id'])['review_date']
cbs_v2['next_chapter_review_date'] = review_date_by_story.shift(-1)
cbs_v2['prev_chapter_review_date'] = review_date_by_story.shift(1)


In [14]:
# Last chapters whose first review predates the story's update date, i.e. the
# story was updated again after that chapter had already been reviewed.
reviewed_before_update = (
    (cbs_v2['chapter_id'] == cbs_v2['chapters'])
    & (cbs_v2['date_updated'] != -1)
    & (~cbs_v2['review_date'].isnull())
    & (cbs_v2['review_date'] < cbs_v2['date_updated'])
)
len_of_updated = len(cbs_v2[reviewed_before_update])

# Same selection restricted to single-chapter stories (last chapter is chapter 1).
# The chapter filter is built from cbs_v2 itself: the original used `cbs`, which
# only produced the right rows while both frames happened to share an index.
len_of_updated_1 = len(cbs_v2[reviewed_before_update & (cbs_v2['chapter_id'] == 1)])

print('Total number of stories updated after getting first review on the last chapter = ', 
     len_of_updated/len(stories)*100, '% Count = ', len_of_updated,'. Out of which', 
      len_of_updated_1, 'are stories with only one chapter.')

Total number of stories updated after getting first review on the last chapter =  9.612315990560736 % Count =  4277 . Out of which 233 are stories with only one chapter.


In [15]:
# Use a chapter's first review date as its predicted publication date when that
# date is not later than the story's update date AND is consistent with the
# neighbouring chapters' first review dates.
first_review = cbs_v2['review_date']
next_review = cbs_v2['next_chapter_review_date']
prev_review = cbs_v2['prev_chapter_review_date']

has_review = ~first_review.isnull()
not_after_update = first_review <= cbs_v2['date_updated']
not_after_next = first_review <= next_review
not_before_prev = first_review >= prev_review

# Consistent with the neighbours in one of three ways: between both neighbours,
# or on the correct side of the only neighbour that has a review date. Rows
# where both neighbours lack a review date match none of the three cases.
fits_between_neighbours = (
    (not_after_next & not_before_prev)
    | (not_after_next & prev_review.isnull())
    | (not_before_prev & next_review.isnull())
)

cbs_v2.loc[has_review & not_after_update & fits_between_neighbours,
           'chapter_predicted_date_published_new'] = cbs_v2['review_date']


In [16]:
# cbs_v2[((cbs_v2['review_date'] <= cbs_v2['next_chapter_review_date']) & 
#         (cbs_v2['review_date'] >= cbs_v2['prev_chapter_review_date']))]

In [17]:
# A date_updated of -1 is taken to mean the story was never updated, i.e. all of
# its chapters were published together, so every chapter gets the story's
# publication date.
never_updated = cbs_v2['date_updated'] == -1
cbs_v2.loc[never_updated, 'chapter_predicted_date_published_new'] = cbs_v2['date_published']

In [18]:
# number of chapters where the review date is less than story update date but 
# current chapter review date is either greater than next chapter first review date 
# or less than previous chapter first review date 
# cbs_v2[(cbs_v2['review_date'] <= cbs_v2['date_updated']) & 
#        ((cbs_v2['review_date'] > cbs_v2['next_chapter_review_date']) | 
#         (cbs_v2['review_date'] < cbs_v2['prev_chapter_review_date']))]

In [19]:
# cbs_v2[cbs_v2['chapter_predicted_date_published_new'].isnull()]
# None of them are last chapters - n have no reviews and 
# m have reviews where the first review date on the chapter is greater than story update date 

In [20]:
# Fraction of chapter rows that still have no predicted publication date.
unpredicted_rows = cbs_v2[cbs_v2['chapter_predicted_date_published_new'].isnull()]
len(unpredicted_rows) / len(cbs_v2)

0.16521400338081982

In [21]:
# Persist the per-chapter predictions; the DataFrame index is written as the
# first (unnamed) column, which is the to_csv default.
cbs_v2.to_csv(path_or_buf='chapters_prediction.csv', index=True)