# Text Preprocessing:


### Imports Needed

In [30]:
#pip install nltk  --- use pip install if nltk is not present.

In [34]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Removing Noise:
Noise removal involves getting rid of unwanted characters, symbols, and formatting from the text.

In [36]:
def remove_noise1(text):
    # Remove special characters, digits, and replaces multiple spaces with a single space
    cleaned_text = re.sub(r'[^\w\s]', '', text)
    cleaned_text = re.sub(r'\d+', '', cleaned_text)
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()
    return cleaned_text


In [38]:
# Example usage
text = "Hellooo.    How r u???&^&^ Welcome##### 43434"
cleaned_text = remove_noise1(text)
print(cleaned_text)  

Hellooo How r u Welcome


# Convert Text to Lowercase:

In [39]:
def convert_to_lowercase(text):
    return text.lower()


In [40]:
# Example usage
text = "HELLLOOOOOOO.    How R UUUUUU ???&^&^ Welcome##### 43434"
cleaned_text = convert_to_lowercase(text)
print(cleaned_text)  

helllooooooo.    how r uuuuuu ???&^&^ welcome##### 43434


# Removing Numerical Digits:

In [41]:
def remove_digits(text):
    return re.sub(r'\d+', '', text)


In [42]:
# Example usage
text = "HELLLOOOOOOO.    How R UUUUUU ???&^&^ Welcome##### 43434"
cleaned_text = remove_digits(text)
print(cleaned_text) 

HELLLOOOOOOO.    How R UUUUUU ???&^&^ Welcome##### 


# Removing URLs:

In [43]:
def remove_urls(text):
    return re.sub(r'http\S+|www\S+', '', text)

In [44]:
# Example usage
text = "HELLLOOOOOOO.    How R UUUUUU ???&^&^ Welcome##### 43434 http://www.google.in"
cleaned_text = remove_urls(text)
print(cleaned_text)

HELLLOOOOOOO.    How R UUUUUU ???&^&^ Welcome##### 43434 


# Removing HTML Tags:

In [45]:
def remove_html_tags(text):
    from bs4 import BeautifulSoup
    return BeautifulSoup(text, "html.parser").get_text()


In [46]:
# Example usage
text = "HELLLOOOOOOO.    How R UUUUUU ???&^&^ Welcome##### 43434  <B> WOW <B>"
cleaned_text = remove_html_tags(text)
print(cleaned_text)

HELLLOOOOOOO.    How R UUUUUU ???&^&^ Welcome##### 43434   WOW 


# Handling Stopwords:
Stopwords are common words like "the," "and," "is," etc., which often don't add much meaning to the text and can be removed.

In [47]:

#nltk.download('stopwords')


def remove_stopwords(tokens):
    stop_words = set(stopwords.words("english"))
    filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
    return filtered_tokens



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/pallavisatsangi/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [48]:
# Example usage
tokens = ['Tokenization', 'breaks', 'the', 'text', 'into', 'individual', 'words', 'or', 'tokens', '.']
filtered_tokens = remove_stopwords(tokens)
print(filtered_tokens)  # Output: ['Tokenization', 'breaks', 'text', 'individual', 'words', 'tokens', '.']


['Tokenization', 'breaks', 'text', 'individual', 'words', 'tokens', '.']


# Tokenization:
Tokenization breaks the text into individual words or tokens.

In [50]:

#nltk.download('punkt')

def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens


In [51]:
# Example usage
text = "Tokenization breaks the text into individual words or tokens."
tokens = tokenize_text(text)
print(tokens)  # Output: ['Tokenization', 'breaks', 'the', 'text', 'into', 'individual', 'words', 'or', 'tokens', '.']


['Tokenization', 'breaks', 'the', 'text', 'into', 'individual', 'words', 'or', 'tokens', '.']


# Stemming:
Stemming reduces words to their root or base form by removing suffixes.

In [52]:

def stem_text(tokens):
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens



In [54]:
# Example usage
tokens = ['Tokenization', 'breaks', 'the', 'text', 'into', 'individual', 'words', 'or', 'tokens', '.']
stemmed_tokens = stem_text(tokens)
print(stemmed_tokens)  # Output: ['token', 'break', 'the', 'text', 'into', 'individu', 'word', 'or', 'token', '.']


['token', 'break', 'the', 'text', 'into', 'individu', 'word', 'or', 'token', '.']


# Lemmatization:
Lemmatization reduces words to their base or dictionary form (lemma).

In [56]:

#nltk.download('wordnet')

def lemmatize_text(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens



In [57]:
# Example usage
tokens = ['Tokenization', 'breaks', 'the', 'text', 'into', 'individual', 'words', 'or', 'tokens', '.']
lemmatized_tokens = lemmatize_text(tokens)
print(lemmatized_tokens)  # Output: ['Tokenization', 'break', 'the', 'text', 'into', 'individual', 'word', 'or', 'token', '.']


['Tokenization', 'break', 'the', 'text', 'into', 'individual', 'word', 'or', 'token', '.']


# Compare,Combine, remove duplicate words:

In [58]:
def compare_strings(string1, string2):
    return string1 == string2

def combine_two_strings(string1, string2):
    return string1 + " " + string2

def combine_strings_remove_duplicates(string1, string2):
    words1 = set(string1.split())
    words2 = set(string2.split())
    combined_words = words1.union(words2)
    return " ".join(combined_words)

# Example usage
string1 = "hello world"
string2 = "hello world"
if compare_strings(string1, string2):
    print("Strings are equal.")
else:
    print("Strings are not equal.")

combined_string = combine_two_strings(string1, string2)
print("Combined two strings:", combined_string)

combined_without_duplicates = combine_strings_remove_duplicates(string1, string2)
print("Combined strings without duplicates:", combined_without_duplicates)


Strings are equal.
Combined two strings: hello world hello world
Combined strings without duplicates: world hello
