# Exploring Clinic Letter Corpus

###
    author: naa
    created: 2023-04-03
    version: 0.1.0

   Basic NLP exploratory data analysis (EDA) and pre-processing of sentences from the clinic letter corpus.

In [None]:
# import relevant packages
import math
import os
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
# Project root directory. Defaults to the original JupyterHub location; set the
# NHSX_NLP_ROOT environment variable to run the notebook from another checkout
# without editing this cell.
dir_root = os.environ.get("NHSX_NLP_ROOT", "/home/jovyan/nhsx_nlp")

In [None]:
# Render matplotlib figures inline, directly below the cell that produces them
%matplotlib inline

# Load data

In [None]:
# Read the raw neurology letters CSV from the project data folder and display the frame
path_data = f"{dir_root}/data/raw/neurology_letters_2023_03_18.csv"
df = pd.read_csv(path_data)
df

# Describe data

In [None]:
# Print a concise summary of the frame: columns, dtypes, non-null counts and memory usage
df.info()

In [None]:
def _word_count(doc):
    """Return the number of whitespace-separated tokens in the stringified document."""
    return len(str(doc).split())


# document length in characters (each value is converted to a string first)
doctext_as_str = df['doctext'].astype(str)
df['char_len'] = doctext_as_str.map(len)

# document length in whitespace-separated words
df['doc_len'] = df['doctext'].map(_word_count)

# display the frame with the two new length columns
df

## Look at unique values

In [None]:
# Count the distinct values in each column (missing values are excluded by default)
df.nunique()

In [None]:
# Open question about the unique-value counts, kept as a bare string so it is echoed as the cell output.
# TODO(review): consider moving this note into a markdown cell.
'''seems that all documents are unique, but how do we relate this to the clinic letters and patients?'''

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max); by default pandas reports
# numeric columns only when any are present, which here includes char_len and doc_len
df.describe()

## Visualise data

### Histogram of word counts per document

In [None]:
# Reset Seaborn to its defaults, then set the figure size and the "white" style for the plots
sns.reset_defaults()
sns.set(style="white", rc={'figure.figsize': (25, 30)})

In [None]:
# Histogram of 'Count of Documents vs Document Length by Number of Words'
# (a density estimate is available via df['doc_len'].plot(kind="kde") instead)
# Series.hist returns the Axes it drew on, so labels are set on that Axes directly;
# font sizes are given as numbers rather than numeric strings.
ax = df['doc_len'].hist(bins=100)
ax.set_title('Count of Documents vs Document Length by Number of Words', fontsize=20)
ax.set_xlabel('Document Length by Number of Words', fontsize=18)
ax.set_ylabel('Count of Documents', fontsize=18)
plt.show()

### Histogram of character lengths per document

In [None]:
# Re-apply the plot settings (same figure size and "white" style as the word-count histogram)
sns.reset_defaults()
sns.set(style="white", rc={'figure.figsize': (25, 30)})

In [None]:
# Histogram of 'Count of Documents vs Document Length by Number of Characters'
# Series.hist returns the Axes it drew on, so labels are set on that Axes directly;
# font sizes are given as numbers rather than numeric strings.
ax = df['char_len'].hist(bins=100)
ax.set_title('Count of Documents vs Document Length by Number of Characters', fontsize=20)
ax.set_xlabel('Document Length by Number of Characters', fontsize=18)
ax.set_ylabel('Count of Documents', fontsize=18)
plt.show()

# Word-level analysis

## Create one text document of all words in the corpus

In [None]:
# Build one corpus-wide string from all documents (joined by single spaces), then
# split it on whitespace into a flat list of word tokens.
# Missing documents are dropped first: str.join raises TypeError on a float NaN,
# which is how pandas represents an empty cell.
text = ' '.join(df['doctext'].dropna().astype(str))
words_list = text.split()

In [None]:
# Create dictionary of each word and its count.
# Counter tallies every token in a single pass over words_list; calling
# list.count() once per distinct word rescans the whole list each time.
word_freq = dict(Counter(words_list))


In [None]:
# Turn the word -> count mapping into a two-column DataFrame, one row per distinct word
word_count_pairs = list(word_freq.items())
df_word = pd.DataFrame(word_count_pairs, columns=['word', 'count'])

In [None]:
# number of characters in each word
df_word['word_len'] = df_word['word'].map(len)

# order rows from most to least frequent and renumber the index from zero
df_word = df_word.sort_values(by='count', ascending=False)
df_word = df_word.reset_index(drop=True)

# display the sorted word table
df_word

In [None]:
# Plot the most frequent words as a horizontal bar chart.
# top_n controls how many words are shown; df_word is already sorted by descending
# count, so head(top_n) selects the most frequent ones.
top_n = 30
df_top = df_word.head(top_n)
# with data= supplied, the columns are referenced by name
sns.barplot(data=df_top, x='count', y='word')
plt.title(f'Top {top_n} Most Frequent Words', fontsize=20)
plt.ylabel('Word', fontsize=20)
plt.xlabel('Count', fontsize=20)
plt.tick_params(axis='both', which='major', labelsize=18)
plt.show()