In [1]:
import re
import nltk
import textblob
from bleach import clean
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

In [10]:
def preprocessing(text):
    # convert to lowercase
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text, flags=re.I|re.A)
    tokens = word_tokenize(text)
    clean_tokens = []
    # remove stopwords
    stop_words = set(stopwords.words('english'))
    for i, token in enumerate(tokens):
        if token not in stop_words:
            clean_tokens.append(token)

    lemmatizer = WordNetLemmatizer()
    for i, token in enumerate(clean_tokens):
        clean_tokens[i] = lemmatizer.lemmatize(token).lower().strip()

    return ' '.join(clean_tokens)


In [93]:
sample_text = "The film suffers from poor writing, with an underdeveloped plot and dragged-out, unnecessary dialogue that makes it feel unbearably long and boring; even the decent cinematography and occasional good performance can't save it from being a forgettable experience."
print("===Sample Text===")
print(sample_text, '\n')
print("===Cleaned Text===")
print(preprocessing(sample_text))

===Sample Text===
The film suffers from poor writing, with an underdeveloped plot and dragged-out, unnecessary dialogue that makes it feel unbearably long and boring; even the decent cinematography and occasional good performance can't save it from being a forgettable experience. 

===Cleaned Text===
['The', 'film', 'suffers', 'poor', 'writing', ',', 'underdeveloped', 'plot', 'dragged-out', ',', 'unnecessary', 'dialogue', 'makes', 'feel', 'unbearably', 'long', 'boring', ';', 'even', 'decent', 'cinematography', 'occasional', 'good', 'performance', 'ca', "n't", 'save', 'forgettable', 'experience', '.']


# Part 2 Sentiment Analysis
...is a technique that involves the process of determining and extracting sentiment or emotional information from text data. The primary goal of sentiment analysis is to assess and quantify the subjective aspects of a piece of text, typically in the form of opinions, attitudes, emotions, or polarity (positive, negative, or neutral).

In [13]:
from textblob import TextBlob

In [16]:
input_text = sample_text

blob = TextBlob(input_text)

# get sentiment using sentiment function
sentiment = blob.sentiment
print(sentiment)

# get polarity and subjectivity
polarity = sentiment.polarity
subjectivity = sentiment.subjectivity

if polarity > 0:
    sentiment_lbl = "Positive"
elif polarity < 0:
    sentiment_lbl = "Negative"
else:
    sentiment_lbl = "Neutral"

# Print the results
print(f"Text: {input_text}")
print(f"Sentiment: {sentiment_lbl}")
print(f"Polarity: {polarity}")
print(f"Subjectivity: {subjectivity}")

Sentiment(polarity=0.6678571428571428, subjectivity=0.6928571428571428)
Text: Check out this amazing website: https://www.example.com. It has some great resources! 😃

I can't believe it's already summer ☀️. Time flies when you're having fun!

Did you see the latest news about the space mission to Mars? 🚀 It's truly fascinating.

Don't forget to visit our website at https://www.example.com for more information. Have a great day! 🌟
Sentiment: Positive
Polarity: 0.6678571428571428
Subjectivity: 0.6928571428571428


# Part 3 NER
Named Entity Recognition, or NER, is a fundamental technique in natural language processing. It automatically finds and classifies important items in text, such as names of people, organizations, places, dates, and monetary values. NER works by first identifying the boundaries of these entities and then assigning them to categories like PERSON, ORGANIZATION, or LOCATION.

This process turns unstructured text into structured data, which is useful for tasks like extracting information, answering questions, and summarizing documents. For example, in the sentence “Apple Inc. is opening a store in New York,” NER would identify “Apple Inc.” as an organization and “New York” as a location.

We use the spaCy library for NER. With spaCy, you load a language model, process your text, and it automatically finds and labels named entities. You can then print out each entity and its label, such as ‘PERSON’ or ‘ORG’, to see what has been extracted and categorized. This makes it easy to pull structured information from large amounts of text.

it can recognise such entities like persons, locations, orgs, dates, quantities, money, and more.
it can be used for Information Extraction: Structuring unstructured text data for further analysis, Question Answering: Enhancing search engines to provide direct answers to questions, Language Translation: Improving the quality of translation by preserving named entities, Entity Linking: Associating recognised entities with external knowledge bases or databases, text summaries, and even news/social media analysis

In [None]:
# must download model first
# python -m spacy download en_core_web_sm

In [84]:
# Named Entity Recognition
import spacy
# Load the spaCy English language model
nlp = spacy.load('en_core_web_sm')

# Input text (replace this with your desired text)
input_text = """
From the garage to the Googleplex.

The Google story begins in 1995 at Stanford University. Larry Page was considering Stanford for grad school and Sergey Brin, a student there, was assigned to show him around. By some accounts, they disagreed about nearly everything during that first meeting, but by the following year they struck a partnership. Working from their dorm rooms, they built a search engine that used links to determine the importance of individual pages on the World Wide Web. They called this search engine Backrub. 

Soon after, Backrub was renamed Google (phew). The name was a play on the mathematical expression for the number 1 followed by 100 zeros and aptly reflected Larry and Sergey's mission to organise the world’s information and make it universally accessible and useful. Over the next few years, Google caught the attention of not only the academic community, but Silicon Valley investors as well. In August 1998, Sun co-founder Andy Bechtolsheim wrote Larry and Sergey a check for $100,000, and Google Inc. was officially born. With this investment, the newly incorporated team made the upgrade from the dorms to their first office: a garage in suburban Menlo Park, California, owned by Susan Wojcicki (employee no.16 and former CEO of YouTube). Clunky desktop computers, a ping pong table and bright blue carpet set the scene for those early days and late nights.(The tradition of keeping things colourful continues to this day.)

Even in the beginning, things were unconventional: from Google’s initial server (made of Lego) to the first 'Doodle' in 1998: a stick figure in the logo announcing to site visitors that the
entire staff was playing hooky at the Burning Man Festival. 'Don't be evil' captured the spirit of our intentionally unconventional methods. In the years that followed, the company expanded
rapidly – hiring engineers, building a sales team and introducing the first company dog, Yoshka. Google outgrew the garage and eventually moved to its current headquarters (aka'The Googleplex') in Mountain View, California. The spirit of doing things differently made the move. So did Yoshka.

The relentless search for better answers continues to be at the core of everything we do. Today, Google makes hundreds of products used by billions of people across the globe,
 from YouTube and Android to Gmail and, of course, Google Search. Although we’ve ditched the Lego servers and added just a few more company dogs, our passion for building technology
for everyone has stayed with us – from the dorm room to the garage and to this very day.
"""

# Process the text using spaCy
doc = nlp(input_text)

# Extract named entities and their labels
entities = [(entity.text, entity.label_) for entity in doc.ents]
print(doc)
# Print the named entities and their labels
for entity, label in entities:
    print(f"Entity: {entity}, Label: {label}")


From the garage to the Googleplex
The Google story begins in 1995 at Stanford University. Larry Page was considering Stanford for grad school and Sergey Brin, a student there, was assigned to show him around. By some accounts, they disagreed about nearly everything during that first meeting, but by the following year they struck a partnership. Working from their dorm rooms, they built a search engine that used links to determine the importance of individual pages on the World Wide Web. They called this search engine Backrub. 

Soon after, Backrub was renamed Google (phew). The name was a play on the mathematical expression for the number 1 followed by 100 zeros and aptly reflected Larry and Sergey's mission to organise the world’s information and make it universally accessible and useful. Over the next few years, Google caught the attention of not only the academic community, but Silicon Valley investors as well. In August 1998, Sun co-founder Andy Bechtolsheim wrote Larry and Sergey 

# Part 4 Text Summarisation

Text summarisation in Natural Language Processing (NLP) is the process of automatically generating a concise, coherent, and informative summary from a longer document, such as an article or report. The goal is to preserve the essential points and main ideas while omitting redundant or less relevant content.

There are two main types of text summarisation:

Extractive Summarisation: This approach selects and combines the most important sentences or phrases directly from the original text, typically using algorithms that score sentence importance. The summary consists of verbatim sections from the source, chosen to best represent the overall content.

Abstractive Summarisation: This method generates new sentences that paraphrase the source material, using advanced natural language generation techniques. Abstractive summaries can capture the meaning of the original text in a more human-like and coherent manner, but are technically more challenging to implement.

Applications of text summarisation include:
- Information retrieval: Helping users quickly understand the content of lengthy documents or web pages.
- News summarisation: Producing brief summaries of news articles for rapid consumption.
- Document summarisation: Creating abstracts or executive summaries for research papers, business reports, or legal documents.
- Content generation: Generating short descriptions for products, search results, or digital assistants.

In [85]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import networkx as nx
import nltk
from nltk.corpus import stopwords
from sklearn. metrics.pairwise import cosine_similarity

In [92]:
def preprocess_sentence(text):

    # tokenize and remove stopwrods
    stop_words = set(stopwords.words('english'))
    sentence = nltk.word_tokenize(text)
    cleaned_sentence = []
    for i, token in enumerate(sentence):
        if token not in stop_words:
            cleaned_sentence.append(token)

    return cleaned_sentence

def build_similarity_matrix(sentences):
    """Build a similarity matrix for the sentences using TF-IDF"""
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(sentences)
    similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
    
    return similarity_matrix

def summarise_text(text, max_sentences=6):
    ranked_sentences = [] 
    summarised_text = ""
    sentences = nltk.sent_tokenize(text)
    preprocessed_sentences = [preprocess_sentence(sentence) for sentence in sentences]
    
    similarity_matrix = build_similarity_matrix([" ".join(sentence) for sentence in preprocessed_sentences])
    
    # generate a matrix of rnakings
    sentence_similarity_graph = nx.from_numpy_array(similarity_matrix)
    scores = nx.pagerank(sentence_similarity_graph)

    # sort and rank top sentences
    # ranked_sentences = sorted(((scores[i], s) for i, s in enumerate(sentences)), reverse=True)
    for i, s in enumerate(sentences):
        ranked_sentences.append((scores[i], s.strip().replace("\n", ". ")))
    sorted_ranked_sentences = sorted(ranked_sentences, reverse=True)
    # join the ranked sentences to the summarised text, up to the max sentence provided parameter
    for ranked_sentence in sorted_ranked_sentences[:max_sentences]:
        summarised_text += "".join(ranked_sentence[1])
    
    return summarised_text

summarise_text(input_text, 5)

"From the garage to the Googleplex. The Google story begins in 1995 at Stanford University.The spirit of doing things differently made the move.In the years that followed, the company expanded. rapidly – hiring engineers, building a sales team and introducing the first company dog, Yoshka.The relentless search for better answers continues to be at the core of everything we do.Google outgrew the garage and eventually moved to its current headquarters (aka'The Googleplex') in Mountain View, California."

In [47]:
preprocessed_text = preprocessing(input_text)

In [48]:
print(summarise_text(preprocessed_text, 4))

TypeError: expected string or bytes-like object, got 'list'