In [1]:

from variables import sourcepath
import os
import pandas as pd
import numpy as np
import etl.dim_author as auth
import etl.dim_journal as jour
import etl.common_functions as cof
import etl.dim_paper as pape
import etl.database as db
from db_credentials import dwh_db_connection_params
import roman
import datetime


In [57]:
# Open the data-warehouse connection from the shared credential parameters.
# The second return value (psycop2connect) is not used by any visible cell.
eng, psycop2connect = db.initialize_engine(
    connection_params=dwh_db_connection_params,
)
# Pull the complete paragraph dimension from the warehouse.
paragraphs_in_dwh = db.load_full_table(eng, 'dim_paragraph')


In [53]:
# Read the raw sentence export and keep only the columns this ETL step uses.
source_sentences = cof.load_sourcefile('sentences.csv')[
    ['sentence_id', 'para_id', 'sentence', 'sentence_type']
]
source_sentences

Unnamed: 0,sentence_id,para_id,sentence,sentence_type
0,1_0_1,1_0,,EMPTY
1,1_1_2,1_0,START_TITLE_TAG,TAG
2,1_2_23,1_0,Examining interdependence between product user...,PARAGRAPH
3,1_23_24,1_1,START_ABSTRACT_TAG,TAG
4,1_24_25,1_1,,EMPTY
...,...,...,...,...
7457421,7484_13601_13602,7484_139,,EMPTY
7457422,7484_13602_13603,7484_139,END_TABLE_TAG,TAG
7457423,7484_13603_13604,7484_139,,EMPTY
7457424,7484_13604_13605,7484_139,END_TABLE_TAG,TAG


In [54]:
# Row/column count of the raw sentence frame, before any filtering is applied.
source_sentences.shape

(7457426, 4)

In [55]:
# Sentence types that most likely carry no meaningful entities (markup tags,
# tables, figures, formulas, bare hypothesis / research-question numbers).
non_entity_sentence_types = ['TAG', 'TABLE', 'EMPTY', 'FORMULA', 'TABLE_HEADER', 'FIGURE_HEADER', 'FIGURE', 'HYP_NUMBER', 'RQ_NUMBER']

# Drop rows without a sentence. Reassigning instead of using inplace=True
# avoids mutating a frame that was derived by column selection.
source_sentences = source_sentences.dropna(axis=0, subset=['sentence'])
# Drop sentences that consist only of whitespace.
# NOTE(review): assumes every remaining `sentence` value is a string — confirm.
source_sentences = source_sentences[~source_sentences.sentence.str.isspace()]
# Drop the non-entity sentence types. `isin` is vectorized and keeps the same
# rows as the previous per-row lambda (values not in the list, including NaN,
# are retained).
source_sentences = source_sentences[~source_sentences.sentence_type.isin(non_entity_sentence_types)]

In [58]:
# Citation export: which sentence cites which reference (by citekey).
citations = cof.load_sourcefile('citations.csv')[['sentence_id', 'reference_citekey']]
# Papers already present in the warehouse, reduced to the lookup columns.
papers_in_dwh = db.load_full_table(eng, 'dim_paper')[['citekey', 'paper_pk']]

# Resolve each cited citekey to its warehouse paper_pk; citations without a
# matching paper keep a missing paper_pk (left join).
citations_with_pk = citations.merge(
    papers_in_dwh,
    how='left',
    left_on='reference_citekey',
    right_on='citekey',
)[['sentence_id', 'paper_pk']]

# Attach the cited papers to the sentences. A sentence with several citations
# appears once per cited paper; sentences without citations are kept.
sentences_with_reference_pk = source_sentences.merge(
    citations_with_pk,
    how='left',
    on='sentence_id',
)

In [66]:
# Look up paragraph_pk (foreign key) through the paragraph's source id.
paragraphs_in_dwh = db.load_full_table(eng, 'dim_paragraph')[['paragraph_pk', 'para_source_id']]

sentences_with_para_pk = sentences_with_reference_pk.merge(
    paragraphs_in_dwh,
    how='left',
    left_on='para_id',
    right_on='para_source_id',
)
# The join keys are not needed once paragraph_pk is attached.
sentences_with_para_pk = sentences_with_para_pk.drop(columns=['para_id', 'para_source_id'])

In [81]:
# Placeholder values for missing data: text columns get a sentinel string,
# key columns fall back to 0.
missing_value_defaults = {
    'sentence_id': '0',
    'sentence': 'MISSING',
    'sentence_type': 'MISSING',
    'paper_pk': 0,
    'paragraph_pk': 0,
}
sentences_with_para_pk = sentences_with_para_pk.fillna(missing_value_defaults)

In [84]:
# Name the fully transformed frame. This is a plain assignment, so both names
# refer to the same DataFrame object (no copy is made).
transformed_sentences=sentences_with_para_pk

In [89]:
# Source ids of the sentences that are already stored in dim_sentence.
sentences_in_dwh = (
    db.load_full_table(eng, 'dim_sentence')
    ['sentence_source_id']
)

In [86]:
# Inspect the transformed sentences (one row per sentence / cited-paper pair).
transformed_sentences

Unnamed: 0,sentence_id,sentence,sentence_type,paper_pk,paragraph_pk
0,1_2_23,Examining interdependence between product user...,PARAGRAPH,0.0,1.0
1,1_25_26,A,ABSTRACT,0.0,2.0
2,1_26_29,B S T,ABSTRACT,0.0,2.0
3,1_29_31,R A,ABSTRACT,0.0,2.0
4,1_31_51,C TFirm - sponsored online user communities ha...,ABSTRACT,0.0,2.0
...,...,...,...,...,...
3620500,7484_13487_13503,"In addition , he has published over 100 articl...",CAPTION,0.0,0.0
3620501,7484_13503_13545,His work has also been featured by a number of...,CAPTION,0.0,0.0
3620502,7484_13545_13568,He has been interviewed by the press on outlet...,CAPTION,0.0,0.0
3620503,7484_13568_13586,Dr. Desouza has received over $ 1.2 million of...,CAPTION,0.0,0.0


In [91]:
# Show the sentence source ids already loaded into the warehouse.
# NOTE(review): this materializes every id as a Python list; the recorded run
# returned an empty list, but the output grows with the table.
sentences_in_dwh.to_list()

[]

In [98]:
# Keep only the sentences whose source id is not yet present in dim_sentence.
# `isin` does one vectorized membership test; the previous lambda rebuilt
# `sentences_in_dwh.to_list()` and scanned it linearly for every row.
# `.copy()` makes the result an independent frame so that later column
# assignments do not act on a slice of `transformed_sentences`.
delta_sentences = transformed_sentences[
    ~transformed_sentences.sentence_id.isin(sentences_in_dwh)
].copy()

In [99]:
# Assign citationgroup_pk: all rows of one sentence share one group number.
# NOTE(review): the offset is hard-coded; presumably it should be read from the
# warehouse. If 123 is the highest key already in use, the first new group
# (ngroup() == 0) would reuse it — confirm whether `+ 1` is needed.
max_citationgroup_pk=123
# `assign` returns a new frame, so no value is set on a filtered slice
# (which raised SettingWithCopyWarning with direct column assignment).
delta_sentences = delta_sentences.assign(
    citationgroup_pk=delta_sentences.groupby(by='sentence_id').ngroup(ascending=True) + max_citationgroup_pk
)

In [105]:
# Split off the bridge between citation groups and the papers they cite.
bridge_sentence_citation = delta_sentences.loc[:, ['citationgroup_pk', 'paper_pk']]

In [108]:
# Size of the delta, still with one row per sentence / cited-paper pair.
delta_sentences.shape

(3620505, 6)

In [110]:
# Shape the delta for the dim_sentence table: remove the per-citation column,
# collapse the rows that were repeated once per cited paper, and rename the
# columns to the names used by the database table.
delta_sentences = (
    delta_sentences
    .drop(columns=['paper_pk'])
    .drop_duplicates()
    .rename(columns={'sentence_id': 'sentence_source_id', 'sentence': 'sentence_string'})
)

Unnamed: 0,sentence_source_id,sentence_string,sentence_type,paragraph_pk,citationgroup_pk
0,1_2_23,Examining interdependence between product user...,PARAGRAPH,1.0,501308
1,1_25_26,A,ABSTRACT,2.0,501292
2,1_26_29,B S T,ABSTRACT,2.0,501296
3,1_29_31,R A,ABSTRACT,2.0,501307
4,1_31_51,C TFirm - sponsored online user communities ha...,ABSTRACT,2.0,501317
...,...,...,...,...,...
3620500,7484_13487_13503,"In addition , he has published over 100 articl...",CAPTION,0.0,3198511
3620501,7484_13503_13545,His work has also been featured by a number of...,CAPTION,0.0,3198513
3620502,7484_13545_13568,He has been interviewed by the press on outlet...,CAPTION,0.0,3198514
3620503,7484_13568_13586,Dr. Desouza has received over $ 1.2 million of...,CAPTION,0.0,3198515


In [104]:
# Show every sentence that occurs in more than one row (keep=False marks all
# members of a duplicate group), ordered by sentence id.
# NOTE(review): this cell carries execution count 104, i.e. it was run before
# the cell above that drops `paper_pk` and renames `sentence_id` to
# `sentence_source_id`. Executed top to bottom, the `sentence_id` column no
# longer exists at this point.
delta_sentences[delta_sentences.duplicated(subset=['sentence_id'], keep=False)].sort_values(by='sentence_id')

Unnamed: 0,sentence_id,sentence,sentence_type,paper_pk,paragraph_pk,citationgroup_pk
464192,1000_11082_11116,"The reverse is usually the case ( see , for ex...",PARAGRAPH,40967.0,111414.0,181
464193,1000_11082_11116,"The reverse is usually the case ( see , for ex...",PARAGRAPH,2798.0,111414.0,181
463823,1000_1325_1388,Several studies have failed to provide evidenc...,PARAGRAPH,40969.0,111331.0,297
463820,1000_1325_1388,Several studies have failed to provide evidenc...,PARAGRAPH,40971.0,111331.0,297
463821,1000_1325_1388,Several studies have failed to provide evidenc...,PARAGRAPH,40991.0,111331.0,297
...,...,...,...,...,...,...
51115,99_9448_9540,It is important to consider and perhaps counte...,PARAGRAPH,11560.0,12843.0,3324169
51123,99_9754_9796,"In general , women face more gender - related ...",PARAGRAPH,11535.0,12845.0,3324178
51124,99_9754_9796,"In general , women face more gender - related ...",PARAGRAPH,11554.0,12845.0,3324178
51126,99_9823_9882,Other reasons for this may be that men and wom...,PARAGRAPH,11535.0,12845.0,3324181


In [33]:
# Spot check of the first sentence in the frame.
# NOTE(review): execution count 33 precedes the load/cleaning cells above; the
# recorded whitespace-only output presumably comes from a run before the
# whitespace filter was applied — confirm.
source_sentences.iloc[0].sentence

'  '