In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Make sure punkt and stopwords are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess(text):
    # Tokenize text (split into words)
    print("Original text:", text)  # Debug: Check input text
    words = word_tokenize(text.lower())  # Convert text to lowercase
    print("Tokenized words:", words)  # Debug: Check tokenized words

    # Remove stop words
    stop_words = set(stopwords.words('english'))
    filtered_words = [word for word in words if word not in stop_words]
    print("Filtered words:", filtered_words)  # Debug: Check filtered words

    # Stem words
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in filtered_words]
    print("Stemmed words:", stemmed_words)  # Debug: Check stemmed words

    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    print("Lemmatized words:", lemmatized_words)  # Debug: Check lemmatized words

    return stemmed_words, lemmatized_words  # Return both stemmed and lemmatized words

# Example usage:
text = "Your sample text here."
stemmed, lemmatized = preprocess(text)
#print("Stemmed:", stemmed)
#print("Lemmatized:", lemmatized)


Original text: Your sample text here.
Tokenized words: ['your', 'sample', 'text', 'here', '.']
Filtered words: ['sample', 'text', '.']
Stemmed words: ['sampl', 'text', '.']
Lemmatized words: ['sample', 'text', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#or for nltk

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer, SnowballStemmer
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

text = "A Natural Language Processing enables computers to analyze and understand human language."

def tokenize_text(text):
    return nltk.word_tokenize(text)

def remove_stop_words(text):
    words = text.split()
    stop_words = set(stopwords.words('english'))
    return ' '.join([word for word in words if word.lower() not in stop_words])

def stem_text(text):
    text_no_stopwords = remove_stop_words(text)
    snowball_stemmer = SnowballStemmer("english")
    words = text_no_stopwords.split()
    return ' '.join([snowball_stemmer.stem(word) for word in words])

def lemmatize_text(text):
    text_no_stopwords = remove_stop_words(text)
    lemmatizer = WordNetLemmatizer()
    words = text_no_stopwords.split()
    return ' '.join([lemmatizer.lemmatize(word) for word in words])

def switch(ch):
    if ch == 1:
        return tokenize_text(text)
    elif ch == 2:
        return remove_stop_words(text)
    elif ch == 3:
        return stem_text(text)
    elif ch == 4:
        return lemmatize_text(text)
    else:
        return "Invalid choice. Please enter a number between 1 and 4."

print("Choose a preprocessing method by entering the corresponding number:")
print("1 - Tokenization")
print("2 - Stop Words Removal")
print("3 - Stemming (after removing stopwords)")
print("4 - Lemmatization (after removing stopwords)")
ch = int(input("Enter your choice: "))

# Display result based on user input
result = switch(ch)
print("\nOriginal Text:\n", text)
print("\nProcessed Text:\n", result)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Choose a preprocessing method by entering the corresponding number:
1 - Tokenization
2 - Stop Words Removal
3 - Stemming (after removing stopwords)
4 - Lemmatization (after removing stopwords)
Enter your choice: 3

Original Text:
 A Natural Language Processing enables computers to analyze and understand human language.

Processed Text:
 natur languag process enabl comput analyz understand human language.


In [15]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Sample random text (100 words)
random_text = """
Data processing encompasses a series of operations that convert raw data into structured
and organized information. This process begins with data collection, where data is gathered
from various sources such as sensors, databases, forms, or external systems. Once collected,
the data can be in various formats, including text, numbers, images, or multimedia.
The next step in data processing is data cleaning and validation. This involves identifying
and correcting errors, inconsistencies, and missing values in the data. Clean and accurate data
is essential for reliable analysis and decision-making. Data cleaning often involves techniques
like outlier detection and data imputation.
After data cleaning, data transformation is performed. This includes tasks like data normalization,
aggregation, and summarization. Normalization ensures that data is on a consistent scale, while
aggregation and summarization reduce data complexity by generating statistics or aggregating data into meaningful groups.
Data processing also includes data integration, where data from multiple sources is combined
into a unified dataset. Integration can be challenging due to differences in data structures and
formats. Techniques like data mapping and data warehousing are used to facilitate integration.
"""

# Tokenize the text into words
words = word_tokenize(random_text)

# Initialize the NLTK Porter Stemmer and WordNet Lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Get the English stop words
stop_words = set(stopwords.words("english"))

# Initialize a list to store the preprocessed words
preprocessed_words = []

# Perform text preprocessing
for word in words:
    # Remove punctuation and convert to lowercase
    word = word.lower()
    word = word.strip('.,?!-()[]{}"\'')

    # Check if the word is not a stop word
    if word not in stop_words:
        # Stem the word
        stemmed_word = stemmer.stem(word)

        # Lemmatize the stemmed word
        lemmatized_word = lemmatizer.lemmatize(stemmed_word)

        # Add the lemmatized word to the list
        preprocessed_words.append(lemmatized_word)

# Join the preprocessed words back into a text
preprocessed_text = " ".join(preprocessed_words)

# Print the original text and preprocessed text
print("Original Text:")
print(random_text)
print("\nPreprocessed Text:")
print(preprocessed_text)


Original Text:

Data processing encompasses a series of operations that convert raw data into structured
and organized information. This process begins with data collection, where data is gathered
from various sources such as sensors, databases, forms, or external systems. Once collected,
the data can be in various formats, including text, numbers, images, or multimedia.
The next step in data processing is data cleaning and validation. This involves identifying
and correcting errors, inconsistencies, and missing values in the data. Clean and accurate data
is essential for reliable analysis and decision-making. Data cleaning often involves techniques
like outlier detection and data imputation.
After data cleaning, data transformation is performed. This includes tasks like data normalization,
aggregation, and summarization. Normalization ensures that data is on a consistent scale, while
aggregation and summarization reduce data complexity by generating statistics or aggregating data into

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#security code

In [1]:
import hashlib  # For hashing to ensure data integrity
from cryptography.fernet import Fernet  # For encryption
import random  # For simulating sensor data

In [3]:
# Generate an encryption key
key = Fernet.generate_key()
cipher = Fernet(key)

In [4]:
def get_sensor_data():
    return random.uniform(50, 100)  # Random temperature between 50°C and 100°C

# Encrypt the sensor data
def encrypt_data(data, cipher):
    return cipher.encrypt(data.encode())

# Decrypt the sensor data
def decrypt_data(encrypted_data, cipher):
    return cipher.decrypt(encrypted_data).decode()

# Verify data integrity using hashing
def verify_data_integrity(data):
    return hashlib.sha256(data.encode()).hexdigest()

In [5]:
sensor_data = f"Temperature: {get_sensor_data():.2f}°C"
print("Original Data:", sensor_data)

# Encrypt and then decrypt the data
encrypted_data = encrypt_data(sensor_data, cipher)
print("Encrypted Data:", encrypted_data)

decrypted_data = decrypt_data(encrypted_data, cipher)
print("Decrypted Data:", decrypted_data)

Original Data: Temperature: 96.15°C
Encrypted Data: b'gAAAAABnK53w3ew634WnyKtg7nTvBBO55us2kTmSLZDOm4AZoGFN2w8i1SX8gJ9V2MgBXNdqoK0dZKG0xuU7WNqtLBUUsCEwwRaegA1QR02FadabwjwEEak='
Decrypted Data: Temperature: 96.15°C


In [6]:
original_hash = verify_data_integrity(sensor_data)
decrypted_hash = verify_data_integrity(decrypted_data)
if original_hash == decrypted_hash:
    print("Data Integrity Verified: Hashes match.")
else:
    print("Data Integrity Issue: Hashes do not match.")

Data Integrity Verified: Hashes match.
