# All of the following is code that is typically used with NLP and Text Analysis

# 1. Text Cleaning / NLP Pipeline


In [None]:
'''
Importing all required libraries for text cleaning.
Includes libraries for text processing, web scraping, tokenization, and more.
'''

import re  # For regular expressions
import string  # For string operations
import nltk  # For natural language processing
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup  # For web scraping (if needed)
import contractions  # For expanding contractions (e.g., can't -> cannot)
import spacy  # For advanced NLP tasks
from nltk.tokenize.toktok import ToktokTokenizer  # Toktok tokenizer for tokenization
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK resources needed by the cells below: the stopword lists
# (read through nltk.corpus.stopwords) and the 'punkt' tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')

# Spacy model download command (run once if the model is not already installed)
# !python -m spacy download en_core_web_sm

# Load the Spacy English language model; `nlp` is used by lemmatize_text
nlp = spacy.load('en_core_web_sm')

# Initialize the Toktok tokenizer; `tokenizer` is used by remove_stopwords
tokenizer = ToktokTokenizer()


In [None]:
'''
Function to convert all text to lowercase for consistency.
'''
def lowercase_text(text):
    """Return a copy of ``text`` with every character lowercased."""
    lowered = text.lower()
    return lowered

text = "Natural Language Processing is Fun!"
lowercased_text = lowercase_text(text)
print(lowercased_text)


In [None]:
'''
Function to remove punctuation from the text.
'''
def remove_punctuation(text):
    """Strip every character listed in ``string.punctuation`` from ``text``.

    All other characters (letters, digits, whitespace) are kept as they are.
    """
    # The third argument of str.maketrans lists the characters to delete.
    deletion_table = str.maketrans('', '', string.punctuation)
    stripped = text.translate(deletion_table)
    return stripped

text = "Hello! How are you? NLP is great."
cleaned_text = remove_punctuation(text)
print(cleaned_text)


In [None]:
'''
Another way to remove specific items, using the re library.
Useful for locating text with regular expressions. Tip: ChatGPT can help here. Ask how to remove text matching a regular expression in Python using the re library, and for the syntax.
'''
# NOTE(review): `soup` is not defined in this section; it is presumably a
# BeautifulSoup object created in an earlier cell -- confirm before running.
stripped_text = soup.get_text()
# Collapse any run of carriage returns / line feeds into a single newline.
# Inside a character class `|` is a literal pipe, not alternation, so the
# class must list only the two line-ending characters.
stripped_text = re.sub(r'[\r\n]+', '\n', stripped_text)
# Drop every character that is not an apostrophe, word character,
# whitespace or period.
stripped_text = re.sub(r"[^'\w\s\.]", '', stripped_text)
# Remove every run of digits.
stripped_text = re.sub(r'\d+', '', stripped_text)

In [None]:
'''
Function to remove any numbers from the text.
In the pattern, backslash-d matches any digit and the + means one or more.
'''
def remove_numbers(text):
    """Delete every run of digits from ``text``.

    Only the digits are removed; whitespace around them is left in place.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

text = "I have 2 cats and 1 dog."
no_numbers_text = remove_numbers(text)
print(no_numbers_text)

In [None]:
'''
Function to remove extra spaces from the text.
'''
def remove_extra_spaces(text):
    """Collapse each run of whitespace in ``text`` to one space and trim both ends."""
    # str.split() with no argument splits on any whitespace and drops empty pieces.
    pieces = text.split()
    return ' '.join(pieces)

text = "This  is   an example   with  extra spaces."
cleaned_text = remove_extra_spaces(text)
print(cleaned_text)

In [None]:
'''
Function to expand contractions (e.g., can't -> cannot).
'''
def expand_contractions(text):
    """Expand contractions in ``text`` (e.g., can't -> cannot).

    Delegates to ``contractions.fix`` and returns its result.
    """
    expanded = contractions.fix(text)
    return expanded

text = "I can't believe it's happening."
expanded_text = expand_contractions(text)
print(expanded_text)


In [None]:
'''
Function to remove stop words from the text using the Toktok tokenizer and NLTK stopwords list.
'''
def remove_stopwords(text):
    """Remove English stop words from ``text``.

    The text is tokenized with the module-level Toktok ``tokenizer``; tokens
    whose lowercase form is in NLTK's English stopword list are dropped, and
    the remaining tokens are joined back together with single spaces.
    """
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in tokenizer.tokenize(text):
        # Compare in lowercase so capitalized stop words are removed too.
        if token.lower() in english_stops:
            continue
        kept.append(token)
    return ' '.join(kept)

text = "This is an example sentence showing stopword removal."
cleaned_text = remove_stopwords(text)
print(cleaned_text)

In [None]:
'''
Function to tokenize text into words using NLTK's word tokenizer.
'''
def tokenize_text(text):
    """Split ``text`` into tokens using NLTK's ``word_tokenize`` and return them."""
    word_tokens = word_tokenize(text)
    return word_tokens

text = "Natural language processing with Python."
tokens = tokenize_text(text)
print(tokens)

In [None]:
'''
Function to lemmatize the text using Spacy.
Lemmatization converts words to their base form (e.g., "running" -> "run").
'''
def lemmatize_text(text):
    """Return ``text`` with each token replaced by its lemma.

    The text is processed with the module-level Spacy pipeline ``nlp``; the
    ``lemma_`` of every token is collected and joined with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

text = "The striped bats are hanging on their feet for best."
lemmatized_text = lemmatize_text(text)
print(lemmatized_text)

- Function that combines all the processes above

In [None]:
'''
Function that combines all the text cleaning steps into one.
Includes lowercasing, punctuation removal, number removal, extra space removal, contraction expansion, stop word removal, and lemmatization.
'''
def clean_text(text):
    """Run the full text-cleaning pipeline on ``text`` and return the result.

    Steps, applied in this order:
      1. expand contractions
      2. lowercase
      3. remove punctuation
      4. remove numbers
      5. remove extra spaces
      6. remove stop words
      7. lemmatize

    Contractions are expanded first: punctuation removal strips the
    apostrophes that mark a contraction, so running it earlier would leave
    forms such as "its" (from "it's") that can no longer be expanded.
    """
    pipeline = (
        expand_contractions,   # must run while apostrophes are still present
        lowercase_text,
        remove_punctuation,
        remove_numbers,
        remove_extra_spaces,   # tidy the gaps left by the removals above
        remove_stopwords,
        lemmatize_text,
    )
    for step in pipeline:
        text = step(text)
    return text

text = "Natural Language Processing can't be ignored in 2024! It's crucial."
cleaned_text = clean_text(text)
print(cleaned_text)


# 2. Basic Text Statistics

* **word_tokenize()**: Breaks a string into individual words (tokens).
* **sent_tokenize()**: Splits a string into individual sentences.
* **Counter()**: Creates a frequency distribution for tokens or characters.
* **most_common()**: `Counter` method that returns the most frequent items together with their counts.
* **set()**: Stores unique elements, automatically removing duplicates.
* **re.sub()**: Substitutes all occurrences of a regex pattern with another string.

- Libraries

In [None]:
'''
Importing necessary libraries for text statistics.
- nltk: Used for tokenization and linguistic processing.
- re: Regular expressions for pattern matching in text.
- collections: Provides useful data structures like Counter to calculate word frequencies.
- pandas: Used for organizing data into DataFrames (tabular format).
'''

import nltk
import re
from collections import Counter
import pandas as pd
from nltk.tokenize import word_tokenize, sent_tokenize

# Download NLTK resources
nltk.download('punkt')


- Word counter


In [None]:
'''
Function to count the total number of words in a text.
- Tokenizing: We use nltk's word_tokenize to break the text into individual words.
- len(): Counts the number of tokens (words) in the list.

Input: A string of text.
Output: The total number of words.
'''
def word_count(text):
    """Return the number of tokens ``word_tokenize`` produces for ``text``.

    Every token returned by the tokenizer is counted.
    """
    return len(word_tokenize(text))

text = "Natural language processing is fun. Let's count the words!"
total_words = word_count(text)
print("Total word count:", total_words)


- Sentence counter

In [None]:
'''
Function to count the number of sentences in a text.
- sent_tokenize: A function from nltk that splits the text into sentences.
- len(): Counts the number of sentences in the list.

Input: A string of text.
Output: The total number of sentences.
'''
def sentence_count(text):
    """Return the number of sentences ``sent_tokenize`` finds in ``text``."""
    return len(sent_tokenize(text))

total_sentences = sentence_count(text)
print("Total sentence count:", total_sentences)


- Average word length

In [None]:
'''
Function to calculate the average length of words in the text.
- Tokenizing text into words.
- Using list comprehension to calculate the length of each word.
- sum(): Adds all word lengths together.
- len(): Gives the number of words; the summed lengths are divided by it to get the average.

Input: A string of text.
Output: Average word length (float).
'''
def average_word_length(text):
    """Return the mean token length of ``text`` as a float.

    The text is tokenized with ``word_tokenize``; the lengths of all returned
    tokens are summed and divided by the number of tokens.

    Returns 0.0 when tokenization yields no tokens (e.g. an empty string),
    instead of raising ZeroDivisionError.
    """
    tokens = word_tokenize(text)
    if not tokens:
        # Nothing to average; avoid dividing by zero.
        return 0.0
    return sum(len(token) for token in tokens) / len(tokens)

avg_word_length = average_word_length(text)
print("Average word length:", avg_word_length)


- Word Frequency (Top N Words)

In [None]:
'''
Function to calculate word frequency and return the top N most common words.
- Counter(): Creates a dictionary where the keys are the words, and the values are the word counts.
- most_common(): A method in Counter that returns the N most common words.

Input: A string of text and the number N of top words to return.
Output: List of tuples containing the top N words and their counts.
'''
def word_frequency(text, N=5):
    """Return the ``N`` most common tokens in ``text`` with their counts.

    The text is tokenized with ``word_tokenize`` and tallied with a
    ``Counter``; the result is a list of ``(token, count)`` tuples as
    returned by ``Counter.most_common``.
    """
    tally = Counter()
    tally.update(word_tokenize(text))
    return tally.most_common(N)

top_words = word_frequency(text, 3)
print("Top 3 words:", top_words)
