In [1]:
import pandas as pd
import numpy as np

%matplotlib inline
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
import nltk

import re

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
nltk.download('wordnet')

from nltk.corpus import stopwords

from nlppreprocess import NLP
import pandas as pd

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MA069ja\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Load data

In [2]:
df = pd.read_csv("CQC_documents_df_V2.csv", index_col = 0)
df.shape

(927, 25)

In [3]:
df[:2]

Unnamed: 0,doc_index_first_line,doc_details,doc_index_last_line,full_text,filename,providerId,locationId,organisationType,type,name,...,reportDate,rating_caring,rating_effective,rating_responsive,rating_safe,rating_wellled,URL,Location_type,Location_subtype,Report_URL
0,0,"<doc url=""https://www.cqc.org.uk/sites/default...",1200,"<doc url=""https://www.cqc.org.uk/sites/default...",AAAA1704,RXR,RXR78,Location,NHS Healthcare Organisation,Blackburn Birthing Centre,...,7/9/2014,Good,Good,Good,Requires improvement,Good,http://www.cqc.org.uk/location/RXR78,NHS Healthcare Organisation,Acute hospital - NHS non-specialist,https://www.cqc.org.uk/sites/default/files/new...
1,1200,"<doc url=""https://www.cqc.org.uk/sites/default...",2270,"<doc url=""https://www.cqc.org.uk/sites/default...",AAAA1791,RXH,RXH35,Location,NHS Healthcare Organisation,Bexhill Hospital,...,8/8/2014,Good,Good,Good,Good,Good,http://www.cqc.org.uk/location/RXH35,NHS Healthcare Organisation,Acute hospital - NHS non-specialist,https://www.cqc.org.uk/sites/default/files/new...


In [4]:
# full text column example
df.iloc[541,3]

'<doc url="https://www.cqc.org.uk/sites/default/files/new_reports/AAAJ0605.pdf" parent_folder="web1" id="file14083907" filename="AAAJ0605.pdf"> Centr Central al and North West London NHS Foundation Trust Inspection report Trust Headquarters, 350 Euston Road Regent\'s Place London NW1 3AX Tel: 02032145700 www.cnwl.nhs.uk Date of inspection visit: 16 Jan – 2 Apr 2019 Date of publication: 04/06/2019 We plan our next inspections based on everything we know about services, including whether they appear to be getting better or worse. Each report explains the reason for the inspection. This report describes our judgement of the quality of care provided by this trust. We based it on a combination of what we found when we inspected and other information available to us. It included information given to us from people who use the service, the public and other organisations. This report is a summary of our inspection findings. You can find more detailed information about the service and what we f

## Documents count
Key stats about the size and columns of the document

In [5]:
df.shape

(927, 25)

In [6]:
df.columns.values

array(['doc_index_first_line', 'doc_details', 'doc_index_last_line',
       'full_text', 'filename', 'providerId', 'locationId',
       'organisationType', 'type', 'name', 'region', 'postalCode',
       'onspdLatitude', 'onspdLongitude', 'rating_overall', 'reportDate',
       'rating_caring', 'rating_effective', 'rating_responsive',
       'rating_safe', 'rating_wellled', 'URL', 'Location_type',
       'Location_subtype', 'Report_URL'], dtype=object)

In [7]:
df.describe()

Unnamed: 0,doc_index_first_line,doc_index_last_line,onspdLatitude,onspdLongitude
count,927.0,927.0,927.0,927.0
mean,1431062.0,1433914.0,52.338929,-1.213135
std,753609.4,753190.4,1.078296,1.215762
min,0.0,1200.0,50.122056,-5.542976
25%,820068.0,821154.5,51.460578,-2.139327
50%,1433289.0,1436002.0,52.238142,-1.249739
75%,2105796.0,2107494.0,53.370084,-0.171286
max,2643416.0,2643813.0,55.18431,1.75463


## Dataset by CQC classes

In [8]:
df['rating_overall'].value_counts()

Good                    598
Requires improvement    218
Outstanding              85
Inadequate               26
Name: rating_overall, dtype: int64

## Mental health flag
Creating mental health flag

In [9]:
df['Mental_flag'] = np.where(df['Location_subtype'].str.contains ('Mental'), 1, 0)

In [10]:
df['Mental_flag'].value_counts()

0    630
1    297
Name: Mental_flag, dtype: int64

## Basic EDA

In [11]:
# get a word count per sentence column

def word_count(text):
    # Find the total number of words in the tweet
    total_words = len(word_tokenize(text))
    
    # Return the word counts
    return total_words

def sentence_count(text):
    # Find the total number of words in the tweet
    total_words = len(sent_tokenize(text))
    
    # Return the word counts
    return total_words

def distinct_word_count(text):
    
    # Find the total number of DISTINCT words in the tweet
    total_distinct_words = len(set(w for w in word_tokenize(text)))
    
    return total_distinct_words

def avg_word_len(text):
    
    avg_word_len = np.mean([len(w) for w in word_tokenize(text) ])
    if np.isnan(avg_word_len):
        avg_word_len = 0
    else:
        avg_word_len = avg_word_len
    
    return avg_word_len 

In [12]:
df['word_count'] = df['full_text'].apply(word_count)

In [13]:
df['sentence_count'] = df['full_text'].apply(sentence_count)

In [14]:
df['distinct_word_count'] = df['full_text'].apply(distinct_word_count)

In [15]:
df['avg_word_len'] = df['full_text'].apply(avg_word_len)

## String transformation 
1. Amending stopwords list to INCLUDE word 'not'
2. Limit dataset (remove intial document x lines that are related to the genral information about repors and CQC)
3. Applying text transfomations to full text and on limited texts
4. Additional transformations (creating lists, bigrams, pre-processing by class)

## Amending the stopwords list

In [16]:
set(stopwords.words('english'))
new_stopwords = set(stopwords.words('english')) 
new_stopwords.update('a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides',
           'between',
           'beyond',
           'bill',
           'both',
           'bottom',
           'but',
           'by',
           'call',
           'can',
           'cannot',
           'cant',
           'co',
           'computer',
           'con',
           'could',
           'couldnt',
           'cry',
           'de',
           'describe',
           'detail',
           'did',
           'didn',
           'do',
           'does',
           'doesn',
           'doing',
           'don',
           'done',
           'down',
           'due',
           'during',
           'each',
           'eg',
           'eight',
           'either',
           'eleven',
           'else',
           'elsewhere',
           'empty',
           'enough',
           'etc',
           'even',
           'ever',
           'every',
           'everyone',
           'everything',
           'everywhere',
           'except',
           'few',
           'fifteen',
           'fifty',
           'fill',
           'find',
           'fire',
           'first',
           'five',
           'for',
           'former',
           'formerly',
           'forty',
           'found',
           'four',
           'from',
           'front',
           'full',
           'further',
           'get',
           'give',
           'go',
           'had',
           'has',
           'hasnt',
           'have',
           'he',
           'hence',
           'her',
           'here',
           'hereafter',
           'hereby',
           'herein',
           'hereupon',
           'hers',
           'herself',
           'him',
           'himself',
           'his',
           'how',
           'however',
           'hundred',
           'i',
           'ie',
           'if',
           'in',
           'inc',
           'indeed',
           'interest',
           'into',
           'is',
           'it',
           'its',
           'itself',
           'just',
           'keep',
           'kg',
           'km',
           'last',
           'latter',
           'latterly',
           'least',
           'less',
           'ltd',
           'made',
           'make',
           'many',
           'may',
           'me',
           'meanwhile',
           'might',
           'mill',
           'mine',
           'more',
           'moreover',
           'most',
           'mostly',
           'move',
           'much',
           'must',
           'my',
           'myself',
           'name',
           'namely',
           'neither',
           'never',
           'nevertheless',
           'next',
           'nine',
           'no',
           'nobody',
           'none',
           'noone',
           'nor',
           'not',
           'nothing',
           'now',
           'nowhere',
           'of',
           'off',
           'often',
           'on',
           'once',
           'one',
           'only',
           'onto',
           'or',
           'other',
           'others',
           'otherwise',
           'our',
           'ours',
           'ourselves',
           'out',
           'over',
           'own',
           'part',
           'per',
           'perhaps',
           'please',
           'put',
           'quite',
           'rather',
           're',
           'really',
           'regarding',
           'same',
           'say',
           'see',
           'seem',
           'seemed',
           'seeming',
           'seems',
           'serious',
           'several',
           'she',
           'should',
           'show',
           'side',
           'since',
           'sincere',
           'six',
           'sixty',
           'so',
           'some',
           'somehow',
           'someone',
           'something',
           'sometime',
           'sometimes',
           'somewhere',
           'still',
           'such',
           'system',
           'take',
           'ten',
           'than',
           'that',
           'the',
           'their',
           'them',
           'themselves',
           'then',
           'thence',
           'there',
           'thereafter',
           'thereby',
           'therefore',
           'therein',
           'thereupon',
           'these',
           'they',
           'thick',
           'thin',
           'third',
           'this',
           'those',
           'though',
           'three',
           'through',
           'throughout',
           'thru',
           'thus',
           'to',
           'together',
           'too',
           'top',
           'toward',
           'towards',
           'twelve',
           'twenty',
           'two',
           'un',
           'under',
           'unless',
           'until',
           'up',
           'upon',
           'us',
           'used',
           'using',
           'various',
           'very',
           'via',
           'was',
           'we',
           'well',
           'were',
           'what',
           'whatever',
           'when',
           'whence',
           'whenever',
           'where',
           'whereafter',
           'whereas',
           'whereby',
           'wherein',
           'whereupon',
           'wherever',
           'whether',
           'which',
           'while',
           'whither',
           'who',
           'whoever',
           'whole',
           'whom',
           'whose',
           'why',
           'will',
           'with',
           'within',
           'without',
           'would',
           'yet',
           'you',
           'your',
           'yours',
           'yourself',
           'yourselves')

In [17]:
new_stopwords = new_stopwords - {'not'}

In [18]:
new_stopwords

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'b',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'c',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'e',
 'each',
 'f',
 'few',
 'for',
 'from',
 'further',
 'g',
 'h',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'j',
 'just',
 'k',
 'l',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'n',
 'needn',
 "needn't",
 'no',
 'nor',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',

In [19]:
# check how WordNetLemmatizer is working  
from nltk.stem import WordNetLemmatizer 
  
lemmatizer = WordNetLemmatizer() 
  
print("rocks :", lemmatizer.lemmatize("rocks")) 
print("corpora :", lemmatizer.lemmatize("corpora")) 
  
# a denotes adjective in "pos" 
print("better :", lemmatizer.lemmatize("better", pos ="a"))
print("better :", lemmatizer.lemmatize("worked", pos ="a"))

rocks : rock
corpora : corpus
better : good
better : worked


## Functions for text pre-processing

In [20]:
def preprocess_baseline_models(text):
    tagged_sentence = nltk.tag.pos_tag(text.split())
    edited_sentence = [word for word,tag in tagged_sentence if tag != 'NNP' and tag != 'NNPS']
    text = (' '.join(edited_sentence))
    text = re.sub(r'\s\s+', ' ', text)
    text = text.lstrip(' ')  
    text = text.lower()
    text = text.replace('&', ' and ')
    text = re.sub(r'\s\s+', ' ', text)
    text = text.replace('@', ' at ')

    tokenized_words = word_tokenize(text.lower())
    tokenized_words = [nltk.stem.WordNetLemmatizer().lemmatize(w) for w in tokenized_words]
    tokenized_words = [word for word in tokenized_words if word not in new_stopwords]
    tokenized_words = [word for word in tokenized_words if len(word)>2]
    tokenized_words = [word for word in tokenized_words if not word.isnumeric()]
    tokenized_words = " ".join(tokenized_words)   
    text = tokenized_words.replace("n't", 'not')
    return text

In [24]:
example = "I've got a bit of a working machine, & 2016 I.I must ensure, and not always look down. don't know what to do with it. Nuffield Hospital thinks it was not very good to keep it the original way."

In [25]:
preprocess_baseline_models(example)

'got bit working machine must ensure not always look not know think not good keep original way'

In [27]:
# new column with pre-processed text
df['full_text_preprocess'] = df['full_text'].apply(preprocess_baseline_models)

## Limit text data (finding key words and phrases)

In [28]:
# functions to find the phrases in the text (beginning) and return the position

def find_phrase_5 (column):
    if column.find('Overall summary')<3500: 
        start = column.find('Overall summary')
    else:
        start = -1
    return start

def find_phrase_6 (column):
    if column.find('Summary of findings')<5000: 
        start = column.find('Summary of findings')
    else:
        start = -1
    return start

def find_phrase_7 (column):
    if column.find('Overall  Information')<3500: 
        start = column.find('Overall  Information')
    else:
        start = -1
    return start

df['start_5'] = df['full_text'].apply(find_phrase_5)
df['start_6'] = df['full_text'].apply(find_phrase_6)
df['start_7'] = df['full_text'].apply(find_phrase_7)

# check for errors - if happens that text doesnt have any of the above phrases
print(df.loc[(df.start_5 == -1) & (df.start_6 == -1) & (df.start_7 == -1) ])



Empty DataFrame
Columns: [doc_index_first_line, doc_details, doc_index_last_line, full_text, filename, providerId, locationId, organisationType, type, name, region, postalCode, onspdLatitude, onspdLongitude, rating_overall, reportDate, rating_caring, rating_effective, rating_responsive, rating_safe, rating_wellled, URL, Location_type, Location_subtype, Report_URL, Mental_flag, word_count, sentence_count, distinct_word_count, avg_word_len, full_text_preprocess, start_5, start_6, start_7]
Index: []

[0 rows x 34 columns]


In [44]:
df['full_text_preprocess'][0]

'doc url= http //www.cqc.org.uk/sites/default/files/new_reports/aaaa1704.pdf parent_folder= web1 id= file14084844 filename= aaaa1704.pdf 3nx www.elht.nhs.uk inspection visit publication report describes judgement quality care hospital based combination found inspected information intelligent system information given patient public organisation rating hospital family planning finding one seven hospital care centre form location provides maternity service free-standing birth centre woman problem not anticipated give birth baby relaxed home-from-home atmosphere although registered name known locally referred way rest report unit comprises four delivery room four-bed post-natal bay allow mother baby remain period delivery approximately baby born year carried comprehensive inspection flagged high risk monitoring system look wide range data including patient staff survey hospital performance information view public local partner organisation inspection took place 2014. birth centre providing

In [40]:
# create a start column fro the new start position for a text

start = []
for i in range(len(df['start_5'])):
    one_value = max(df.iloc[i, 31:33])
    start.append(one_value)

df['start'] = start

# function to limit text from the start posiont

def limit_text(df, row):
#     df.iloc[2, 23]
    start = df.iloc[row, 34]
    column = df.iloc[row, 3]
    return column[start:]

text_list = []
for row in range(len(df['start_5'])):
    one_text = (limit_text(df, row))
    text_list.append(one_text)
len(text_list)

df['full_text_limited'] = text_list

# cleaning dataset

df = df.drop(['start_5', 'start_6', 'start_7' ], axis = 1)

df[:5]

Unnamed: 0,doc_index_first_line,doc_details,doc_index_last_line,full_text,filename,providerId,locationId,organisationType,type,name,...,Location_subtype,Report_URL,Mental_flag,word_count,sentence_count,distinct_word_count,avg_word_len,full_text_preprocess,start,full_text_limited
0,0,"<doc url=""https://www.cqc.org.uk/sites/default...",1200,"<doc url=""https://www.cqc.org.uk/sites/default...",AAAA1704,RXR,RXR78,Location,NHS Healthcare Organisation,Blackburn Birthing Centre,...,Acute hospital - NHS non-specialist,https://www.cqc.org.uk/sites/default/files/new...,0,7741,366,1398,4.831675,doc url= http //www.cqc.org.uk/sites/default/f...,817,Summary of findings Letter from the Chief Insp...
1,1200,"<doc url=""https://www.cqc.org.uk/sites/default...",2270,"<doc url=""https://www.cqc.org.uk/sites/default...",AAAA1791,RXH,RXH35,Location,NHS Healthcare Organisation,Bexhill Hospital,...,Acute hospital - NHS non-specialist,https://www.cqc.org.uk/sites/default/files/new...,0,6792,269,1233,4.695671,doc url= http //www.cqc.org.uk/sites/default/f...,806,Summary of findings Letter from the Chief Insp...
2,2270,"<doc url=""https://www.cqc.org.uk/sites/default...",4073,"<doc url=""https://www.cqc.org.uk/sites/default...",AAAA1812,RXQ,RXQ51,Location,NHS Healthcare Organisation,Amersham Hospital,...,Acute hospital - NHS non-specialist,https://www.cqc.org.uk/sites/default/files/new...,0,11024,507,1763,4.864478,doc url= http //www.cqc.org.uk/sites/default/f...,914,Summary of findings Letter from the Chief Insp...
3,4073,"<doc url=""https://www.cqc.org.uk/sites/default...",11841,"<doc url=""https://www.cqc.org.uk/sites/default...",AAAA2909,R1F,R1FAV,Location,NHS Healthcare Organisation,St Mary's Hospital (Mental Health Management),...,Mental health - community & residential - NHS,https://www.cqc.org.uk/sites/default/files/new...,1,55730,2310,3445,4.649955,doc url= http //www.cqc.org.uk/sites/default/f...,1505,Overall summary 4 The five questions we ask ...
4,11841,"<doc url=""https://www.cqc.org.uk/sites/default...",17390,"<doc url=""https://www.cqc.org.uk/sites/default...",AAAA2910,R1F,R1FX5,Location,NHS Healthcare Organisation,"Community Healthcare Services, St Mary's Hospital",...,Community health - NHS & Independent,https://www.cqc.org.uk/sites/default/files/new...,0,41845,1740,3373,4.776604,doc url= http //www.cqc.org.uk/sites/default/f...,1207,Overall summary 4 The five questions we ask ...


## My transformation on full text limit preprocess

In [41]:
# pre-processing limit text 
df['full_text_limited_preprocess'] = df['full_text_limited'].apply(preprocess_baseline_models)


## NLP process transformation

'gentler' transformation, suitable more for the deep learning models

In [42]:
def preprocess_nlpprocess_models(text):
    tagged_sentence = nltk.tag.pos_tag(text.split())
    edited_sentence = [word for word,tag in tagged_sentence if tag != 'NNP' and tag != 'NNPS']
    tokenized_words = [word for word in edited_sentence if not word.isnumeric()]
    text = " ".join(tokenized_words) 
    
    return text

In [43]:
df['full_text_nlpprocess'] = df['full_text'].apply(preprocess_nlpprocess_models)
df['full_text_limited_nlpprocess'] = df['full_text_limited'].apply(preprocess_nlpprocess_models)

In [None]:
example = "I've got a bit of a working machine, 2016 I , Why the full-stop don't know what to do with it. Nuffield Hospital thinks it was not very good to keep it the original way."

In [None]:
preprocess_nlpprocess_models(example)

## Other text details, word 'not'

In [None]:
def count_not(text):
    count_not = len(re.findall('not', text))
    return count_not

df['word_count_nlpprocess'] = df['full_text_nlpprocess'].apply(word_count)

df['count_not'] = df['full_text_nlpprocess'].apply(count_not)

def prop_not(text):
    prop_not = len(re.findall('not', text))/df['word_count_nlpprocess'].shape[0]
    return prop_not

df['prop_not'] = df['full_text_nlpprocess'].apply(prop_not)

In [None]:
df.groupby('rating_overall')['prop_not'].mean()

In [None]:
# df.to_csv('CQC_documents_df_revised_tableau_v1.csv')

## Uni and Bigrams for Tableau Vizualisation

In [None]:
def bigramReturner (text):
    text = text.lower()
    bigramFeatureVector = []
    for item in nltk.bigrams(text.split()):
        bigramFeatureVector.append(' '.join(item))
    return bigramFeatureVector

In [None]:
df['bigram_full_text_preprocess'] = df['full_text_preprocess'].apply(bigramReturner)

In [None]:
df_Good_score  = df.loc[df['rating_overall']== 'Good']
df_RI_score  = df.loc[df['rating_overall']== 'Requires improvement']
df_Outstanding_score  = df.loc[df['rating_overall']== 'Outstanding']
df_Inadequate_score  = df.loc[df['rating_overall']== 'Inadequate']

In [None]:
fd_bigrams_clean_Good= nltk.FreqDist(np.concatenate(list(df_Good_score['bigram_full_text_preprocess'])))
fd_bigrams_clean_RI= nltk.FreqDist(np.concatenate(list(df_RI_score['bigram_full_text_preprocess'])))
fd_bigrams_clean_Outstanding= nltk.FreqDist(np.concatenate(list(df_Outstanding_score['bigram_full_text_preprocess'])))
fd_bigrams_clean_Inadequate= nltk.FreqDist(np.concatenate(list(df_Inadequate_score['bigram_full_text_preprocess'])))

In [None]:
fullStr_Good = df_Good_score['full_text'].str.cat(sep= ' ')
fullStr_RI = df_RI_score['full_text'].str.cat(sep= ' ')
fullStr_Outstanding = df_Outstanding_score['full_text'].str.cat(sep= ' ')
fullStr_Inadequate = df_Inadequate_score['full_text'].str.cat(sep= ' ')

fullStr_Good_clean = preprocess_baseline_models(fullStr_Good)
fd_words_clean_Good = nltk.FreqDist([w.lower() for w in word_tokenize(fullStr_Good_clean)])
# fd_words_clean_Good.plot(20)

fullStr_RI_clean = preprocess_baseline_models(fullStr_RI)
fd_words_clean_RI = nltk.FreqDist([w.lower() for w in word_tokenize(fullStr_RI_clean)])
# fd_words_clean_RI.plot(20)

fullStr_Outstanding_clean = preprocess_baseline_models(fullStr_Outstanding)
fd_words_clean_Outstanding = nltk.FreqDist([w.lower() for w in word_tokenize(fullStr_Outstanding_clean)])
# fd_words_clean_Outstanding.plot(20)

fullStr_Inadequate_clean = preprocess_baseline_models(fullStr_Inadequate)
fd_words_clean_Inadequate = nltk.FreqDist([w.lower() for w in word_tokenize(fullStr_Inadequate_clean)])
# fd_words_clean_Inadequate.plot(20)


list_of_words_Good = list(set(word_tokenize(fullStr_Good_clean.lower())))  
print('Number of words in Good category:', len(list_of_words_Good))
list_of_words_RI = list(set(word_tokenize(fullStr_RI_clean.lower())))  
print('Number of words in Requires Improvement category:', len(list_of_words_RI))
list_of_words_Outstanding = list(set(word_tokenize(fullStr_Outstanding_clean.lower())))  
print('Number of words in Outstanding category:', len(list_of_words_Outstanding))
list_of_words_Inadequate = list(set(word_tokenize(fullStr_Inadequate_clean.lower())))  
print('Number of words in Inadequate category:', len(list_of_words_Inadequate))

In [None]:
df_Good = pd.DataFrame([dict(fd_words_clean_Good)]).T.reset_index()
df_Inadequate = pd.DataFrame([dict(fd_words_clean_Inadequate)]).T.reset_index()
df_Outstanding = pd.DataFrame([dict(fd_words_clean_Outstanding)]).T.reset_index()
df_RI = pd.DataFrame([dict(fd_words_clean_RI)]).T.reset_index()

df_Good['Score'] = 'Good'
df_Inadequate['Score'] = 'Inadequate'
df_Outstanding['Score'] = 'Outstanding'
df_RI['Score'] = 'RI'

frames = [df_Good, df_Inadequate, df_Outstanding, df_RI]
df_Word_Frequency_by_class_unigrams = pd.concat(frames).rename(columns = {0: 'Frequency', 'index':'Word'})
df_Word_Frequency_by_class_unigrams['type'] = 'unigram'

In [None]:
df_Good = pd.DataFrame([dict(fd_bigrams_clean_Good)]).T.reset_index()
df_Inadequate = pd.DataFrame([dict(fd_bigrams_clean_Inadequate)]).T.reset_index()
df_Outstanding = pd.DataFrame([dict(fd_bigrams_clean_Outstanding)]).T.reset_index()
df_RI = pd.DataFrame([dict(fd_bigrams_clean_RI)]).T.reset_index()

df_Good['Score'] = 'Good'
df_Inadequate['Score'] = 'Inadequate'
df_Outstanding['Score'] = 'Outstanding'
df_RI['Score'] = 'RI'

frames = [df_Good, df_Inadequate, df_Outstanding, df_RI]
df_Word_Frequency_by_class_bigrams = pd.concat(frames).rename(columns = {0: 'Frequency', 'index':'Word'})
df_Word_Frequency_by_class_bigrams['type'] = 'bigram'

In [None]:
frames = [df_Word_Frequency_by_class_unigrams, df_Word_Frequency_by_class_bigrams]
df_Word_Frequency_by_class = pd.concat(frames)

In [None]:
# Exporting information about word frequency
# df_Word_Frequency_by_class.to_csv('Word_Freq_by_class_tableau_v1.csv')