In [None]:
%pip install nltk wordcloud matplotlib

In [None]:
#import libraries

import os
import nltk
import wordcloud
import matplotlib.pyplot as plt
import seaborn as sns
import ssl

In [None]:
# Fetch the NLTK data packages used by the cells below:
# the Punkt tokenizer data (requested under both the 'punkt' and 'punkt_tab' names)
# and the stop-word lists that are loaded later through nltk.corpus.stopwords.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


In [None]:
# Load the complete text of Great Expectations into a single string
with open('data/great_expectations.txt', encoding='utf-8') as file:
    filedata = file.read()

# Sanity check: show the first 500 characters
print(filedata[:500])

In [None]:
# Load the same novel through NLTK's plain-text corpus reader
from nltk.corpus import PlaintextCorpusReader

# The corpus root is the data folder under the current working directory
corpus_root = os.getcwd() + '/data/'
corpus = PlaintextCorpusReader(corpus_root, 'great_expectations.txt')

# Show the first 500 characters of the raw corpus text
opening_excerpt = corpus.raw()[:500]
print(opening_excerpt)

In [None]:
# Pull the file list and the paragraph / sentence / word views out of the corpus
file_ids = corpus.fileids()
paragraphs = corpus.paras('great_expectations.txt')
sentences = corpus.sents('great_expectations.txt')
words = corpus.words()

# Report the file list, then the size of each view
print('Files in this corpus:', file_ids)
for label, view in (
    ('No. of paragraphs:', paragraphs),
    ('No. of sentences:', sentences),
    ('No. of words:', words),
):
    print(label, len(view))

In [None]:
# Tokenize the raw corpus text with NLTK's word tokenizer
token_list = nltk.word_tokenize(corpus.raw('great_expectations.txt'))
print('No. of tokens:', len(token_list))
# The label matches the slice: 20 tokens are shown, as in the following cells
print('First 20 tokens:', token_list[0:20])

In [None]:
# Keep only the tokens whose PunktToken reports a truthy is_non_punct

token_list2 = [
    token
    for token in token_list
    if nltk.tokenize.punkt.PunktToken(token).is_non_punct
]

print('Total tokens without punctuation:', len(token_list2))
print('Some tokens:', token_list2[:20])

In [None]:
# Normalize case: lower-case every remaining token
token_list3 = list(map(str.lower, token_list2))
print('Some tokens in lower case:', token_list3[:20])

In [None]:
# Keep only tokens that are at least three characters long
token_list4 = list(filter(lambda word: len(word) >= 3, token_list3))
print('Some tokens after removing short words:', token_list4[:20])

In [None]:
# Count how often each remaining token occurs
freq_dist = nltk.FreqDist()
freq_dist.update(token_list4)
print('Most common words:', freq_dist.most_common(10))

In [None]:
#get stop words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
# A set has no stable iteration order, so sort before sampling; this keeps the
# printed sample identical across kernel restarts.
print('Some stop words:', sorted(stop_words)[0:20])


#remove stop words from the token list
token_list5 = [word for word in token_list4 if word not in stop_words]
print('Some tokens after removing stop words:', token_list5[0:20])
print('Total words after removing stop words:', len(token_list5))

In [None]:
# visualization of the cumulative distribution of the top 50 words
# freq_dist was built from token_list4, i.e. before stop-word removal, so it
# would still be dominated by stop words. Recount on token_list5 so the plot
# reflects the tokens produced by the stop-word filtering step.
freq_dist_no_stop = nltk.FreqDist(token_list5)
plt.figure(figsize=(12,6))
freq_dist_no_stop.plot(50, cumulative=True)
plt.show()

In [None]:
# Extra words to filter out in addition to the standard English stop words
new_stop_words = ['miss', 'mr', 'said', 'one', 'upon', 'come', 'little', 'know', 'away', 'good', 'great']
# merge them into the existing stop-word set in place
stop_words |= set(new_stop_words)

In [None]:
# Filter again, now that the stop-word set also holds the additional words
token_list6 = [word for word in token_list5 if word not in stop_words]
print('Some tokens after removing additional stop words:', token_list6[:20])
print('Total words after removing additional stop words:', len(token_list6))

In [None]:
#represent the novel as a visual
from wordcloud import WordCloud

#generate the wordcloud data
# The result is bound to its own name: assigning it to `wordcloud` would
# replace the `wordcloud` module imported at the top of the notebook.
novel_cloud = WordCloud(stopwords = stop_words, max_words=100, background_color="white").generate(' '.join(token_list6))

# render the cloud as an image without axis ticks
plt.imshow(novel_cloud)
plt.axis("off")
plt.show()

In [None]:
###### Lemmatization
from nltk.stem import WordNetLemmatizer
# download the 'wordnet' and 'omw-1.4' NLTK data packages before lemmatizing
nltk.download('wordnet')
nltk.download('omw-1.4')

# the WordNetLemmatizer imported above is used in the next cell to map each token to its lemma


In [None]:
# Map every token to its lemma using the lemmatizer's default settings
lemmatizer = WordNetLemmatizer()
token_list7 = list(map(lemmatizer.lemmatize, token_list6))

print('Total tokens after lemmatization:', len(token_list7))
print('Some tokens after lemmatization:', token_list7[:20])


In [None]:
from nltk.util import ngrams
from collections import Counter

In [None]:
# Count every pair of adjacent lemmas
bigrams = ngrams(token_list7, 2)
bigram_freq = Counter()
bigram_freq.update(bigrams)

print('Most common bigrams:', bigram_freq.most_common(10))

In [None]:
# Count every run of three adjacent lemmas and show the ten most frequent
trigrams = ngrams(token_list7, 3)
trigram_freq = Counter()
trigram_freq.update(trigrams)
print('Most common trigrams:', trigram_freq.most_common(10))

In [None]:
### Parts of Speech Tagging
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')

# Tag the original token stream (token_list) instead of token_list6.
# token_list6 has been lower-cased and stripped of punctuation, short words and
# stop words, which removes the capitalization and neighbouring-word context
# the tagger draws on, so its tags would be unreliable.
pos_tags = nltk.pos_tag(token_list)
print('Some parts of speech tags:', pos_tags[0:20])

#tag and display the first 20 tokens on their own
nltk.pos_tag(token_list[0:20])


In [None]:
#sentiment analysis

from nltk.tokenize import sent_tokenize
# Replace line breaks under a new name so the raw text in `filedata` is not
# overwritten and remains exactly what the loading cell read from disk.
flattened_text = filedata.replace('\n', ' ')
# split the flattened text into sentence strings
sentences = sent_tokenize(flattened_text)

In [None]:
# preview the first ten sentence strings
print(sentences[:10])

In [None]:
# Build a one-column DataFrame holding one sentence per row

import pandas as pd
df = pd.DataFrame({'sentence': sentences})
print(df.head())

In [None]:
# (rows, columns) of the sentence frame
df.shape

In [None]:
# first rows of the sentence frame, shown with the notebook's rich display
df.head()

In [None]:
# last rows of the sentence frame
df.tail()

In [None]:
# install the vaderSentiment package that the next cell imports
%pip install vaderSentiment

In [None]:
#### Sentiment Analysis using VADER
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
# create one analyzer instance; the scoring cell below reuses it for every sentence
analyzer = SentimentIntensityAnalyzer()

In [None]:
# Score each sentence exactly once. polarity_scores returns a dict with the
# keys 'neg', 'neu', 'pos' and 'compound'; collecting the dicts into a frame
# avoids re-running the analyzer separately for every column.
score_columns = ['compound', 'neg', 'neu', 'pos']
sentence_scores = pd.DataFrame(
    [analyzer.polarity_scores(text) for text in df['sentence']],
    index=df.index,          # align the scores with the sentence rows
    columns=score_columns,   # also keeps the columns defined when df is empty
)
# Add the columns one at a time, in the same order as before; plain column
# assignment keeps the cell safe to re-run.
for column in score_columns:
    df[column] = sentence_scores[column]
df.head(10)

In [None]:
# Split the sentences by the sign of their compound score
compound = df['compound']
pos_sent = df.loc[compound.gt(0)]
neg_sent = df.loc[compound.lt(0)]
neu_sent = df.loc[compound.eq(0)]

In [None]:
# overall frame shape, then the positive / negative / neutral sentence counts
print(df.shape)
for subset in (pos_sent, neg_sent, neu_sent):
    print(len(subset))


In [None]:
# Histogram (with KDE overlay) of the per-sentence compound scores
import matplotlib.pyplot as plt
import seaborn as sns
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['compound'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Compound Sentiment Scores')
ax.set_xlabel('Compound Score')
ax.set_ylabel('Frequency')
plt.show()