This notebook joins corpus data downloaded from Sketch Engine (www.sketchengine.eu) with the metadata about each hospital and report, which was downloaded from the CQC (www.cqc.org.uk) and formatted. The joined dataset that is exported at the end will be used for modelling.

## Loading Libraries

In [1]:
import pandas as pd
import numpy as np

## Corpus Information File

In [3]:
# Read the CQC corpus metadata table; the first CSV column is used as the index.
file_info = pd.read_csv("_CQC_Corpus_Info.csv", index_col=0)

# List the column names that are available for the join and for modelling.
print(file_info.columns.values)

# Preview the first two rows.
file_info.head(2)

['providerId' 'locationId' 'organisationType' 'type' 'name' 'region'
 'postalCode' 'onspdLatitude' 'onspdLongitude' 'rating_overall'
 'reportDate' 'rating_caring' 'rating_effective' 'rating_responsive'
 'rating_safe' 'rating_wellled' 'URL' 'Location_type' 'Location_subtype'
 'Report_URL' 'Corpus_duplication_check']


Unnamed: 0,providerId,locationId,organisationType,type,name,region,postalCode,onspdLatitude,onspdLongitude,rating_overall,...,rating_caring,rating_effective,rating_responsive,rating_safe,rating_wellled,URL,Location_type,Location_subtype,Report_URL,Corpus_duplication_check
1,1-101675619,9999,Provider,Independent Healthcare Org,Healthcare at Home Ltd,West Midlands,DE14 1SZ,52.806769,-1.62564,Good,...,Good,Good,Good,Good,Good,http://www.cqc.org.uk/provider/1-101675619,Independent Healthcare Org,Community health - NHS & Independent,https://www.cqc.org.uk/sites/default/files/new...,included
2,1-102643363,9999,Provider,Independent Healthcare Org,St Andrew's Healthcare,East Midlands,NN1 5DG,52.238142,-0.873701,Requires improvement,...,Good,Good,Good,Requires improvement,Requires improvement,http://www.cqc.org.uk/provider/1-102643363,Independent Healthcare Org,Mental health - community & hospital - indepen...,https://www.cqc.org.uk/sites/default/files/new...,included


In [4]:
# Keep only the rows that have a report URL, because the join key
# ('filename') is derived from that URL.
# .copy() makes the filtered result an independent DataFrame, so adding the
# 'filename' column to it afterwards does not raise pandas'
# SettingWithCopyWarning.
file_info = file_info[file_info['Report_URL'].notna()].copy()

file_info.shape

(1123, 21)

In [5]:
# creating the file name from URL
def find_partURL(column):
    """Return the report file name, without '.pdf', taken from a report URL.

    Parameters
    ----------
    column : str
        A single report URL, e.g. ``".../new_reports/AAAJ3228.pdf"``.

    Returns
    -------
    str
        The text after the last '/' in the URL, cut at the first '.pdf'.
        If that last segment contains no '.pdf', the whole segment is
        returned unchanged.
    """
    url = column
    # Only look at the last path segment; rfind returns -1 when there is no
    # '/', so the slice then starts at 0 and covers the whole string.
    last_segment = url[url.rfind('/') + 1:]
    ext_pos = last_segment.find('.pdf')
    # str.find returns -1 when '.pdf' is absent; slicing with -1 would
    # silently drop the last character of the name, so handle it explicitly.
    if ext_pos == -1:
        return last_segment
    return last_segment[:ext_pos]

# Derive the join key for every row from its report URL.
report_urls = file_info['Report_URL']
file_info['filename'] = report_urls.apply(find_partURL)

file_info.head(5)
file_info.shape

(1123, 22)

## Corpus File

In [7]:
# loading corpus for CQC text
file = '_CQC_Text.txt'
# cqc_documents_v2.txt

# Read the corpus once, inside a context manager so the file handle is
# closed as soon as the lines are in memory. `file` stays bound to the path
# (it is no longer rebound to an open, never-closed handle).
# Only the trailing newline is stripped from each line.
with open(file, encoding="utf8") as corpus:
    lines = [line.rstrip('\n') for line in corpus]

# For every line that contains a '<doc url=' tag, record
# [position of the line in `lines`, the line itself].
doc_index = [
    [counter, value]
    for counter, value in enumerate(lines)
    if "<doc url=" in value
]

In [10]:
# One row per document: the position of its '<doc url=' line and that line's text.
df_1 = pd.DataFrame(doc_index, columns=("doc_index_first_line", "doc_details"))

# A document ends where the next one starts; the last document ends at the
# end of the "lines" list.
first_lines = df_1["doc_index_first_line"].tolist()
df_1["doc_index_last_line"] = first_lines[1:] + [len(lines)]
last_lines = df_1["doc_index_last_line"].tolist()

# Join the lines of each document (its '<doc url=' line included) into a
# single space-separated string.
list_fullStr = [
    ' '.join(lines[start:stop])
    for start, stop in zip(first_lines, last_lines)
]
df_1["full_text"] = list_fullStr

# Count missing values in the two text columns.
print('Number of full text empty:', df_1['full_text'].isna().sum())
print('Number of doc details empty:', df_1['doc_details'].isna().sum())

df_1.head(2)

Number of full text empty: 0
Number of doc details empty: 0


Unnamed: 0,doc_index_first_line,doc_details,doc_index_last_line,full_text
0,0,"<doc url=""https://www.cqc.org.uk/sites/default...",1200,"<doc url=""https://www.cqc.org.uk/sites/default..."
1,1200,"<doc url=""https://www.cqc.org.uk/sites/default...",2270,"<doc url=""https://www.cqc.org.uk/sites/default..."


In [11]:
# creating the filename for the join to the data info
def find_filename(column):
    """Return the value of the ``filename="..."`` attribute, without '.pdf'.

    Parameters
    ----------
    column : str
        One '<doc ...>' header line from the corpus.

    Returns
    -------
    str or None
        The attribute value with a trailing '.pdf' removed, or ``None`` when
        the line has no ``filename="`` attribute (so the row cannot be
        matched by accident on a garbage key).
    """
    marker = 'filename="'
    start = column.find(marker)
    # str.find returns -1 when the marker is absent; slicing from
    # -1 + len(marker) would return an arbitrary piece of the line.
    if start == -1:
        return None
    start += len(marker)
    # Read up to the attribute's closing quote, so the result does not depend
    # on 'filename' being the last attribute of the tag.
    end = column.find('"', start)
    value = column[start:] if end == -1 else column[start:end]
    if value.endswith('.pdf'):
        value = value[:-len('.pdf')]
    return value
# Derive the join key for every document from its '<doc ...>' header line.
doc_headers = df_1['doc_details']
df_1['filename'] = doc_headers.apply(find_filename)

In [12]:
# Attach the report metadata to each document's text. 'filename' is the only
# column the two tables share, so it is the join key; the inner join keeps
# only documents that appear in both tables.
df = df_1.merge(file_info, on='filename', how='inner')

# Example: the merged record for one report.
df[df['filename'] == 'AAAJ3228']

Unnamed: 0,doc_index_first_line,doc_details,doc_index_last_line,full_text,filename,providerId,locationId,organisationType,type,name,...,rating_caring,rating_effective,rating_responsive,rating_safe,rating_wellled,URL,Location_type,Location_subtype,Report_URL,Corpus_duplication_check
621,1871839,"<doc url=""https://www.cqc.org.uk/sites/default...",1873961,"<doc url=""https://www.cqc.org.uk/sites/default...",AAAJ3228,1-101727990,1-3811878726,Location,Independent Healthcare Org,Spamedica Limited,...,Good,Outstanding,Good,Good,Good,http://www.cqc.org.uk/location/1-3811878726,Independent Healthcare Org,Acute hospital - Independent non-specialist,https://www.cqc.org.uk/sites/default/files/new...,included


In [None]:
# export

# df.to_csv('CQC_documents_df_V2.csv')

In [None]:
# The same process was applied to the Welsh corpus and the Welsh details document.
# Attachments:
# - CQC corpus: _CQC_Text
# - Welsh corpus: _hiw_Text
# - CQC details document: "_CQC_Corpus_Info.csv"