In [1]:
!pip install scikit-learn nltk




In [2]:
import nltk
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string

nltk.download('punkt')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
doc1 = "Natural Language Processing enables computers to understand human language."
doc2 = "NLP helps machines process and analyze large amounts of natural language data."


In [5]:
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [6]:
# 1.Similarity
def preprocess(text):
    tokens = word_tokenize(text.lower())  # Tokenization and lowercasing
    tokens = [word for word in tokens if word not in stopwords.words('english') and word not in string.punctuation]
    return " ".join(tokens)

doc1_clean = preprocess(doc1)
doc2_clean = preprocess(doc2)


In [7]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform([doc1_clean, doc2_clean])
cosine_sim = cosine_similarity(vectors[0], vectors[1])[0][0]

print(f"Cosine Similarity: {cosine_sim:.4f}")


Cosine Similarity: 0.1843


In [8]:
def jaccard_similarity(doc1, doc2):
    words_doc1 = set(doc1.split())
    words_doc2 = set(doc2.split())
    intersection = words_doc1.intersection(words_doc2)
    union = words_doc1.union(words_doc2)
    return len(intersection) / len(union)

jaccard_sim = jaccard_similarity(doc1_clean, doc2_clean)
print(f"Jaccard Similarity: {jaccard_sim:.4f}")


Jaccard Similarity: 0.1333


In [9]:
!pip install textblob




In [10]:
from textblob import TextBlob


In [11]:
text1 = "I love this product! It's absolutely amazing and works perfectly."
text2 = "This product is terrible. It broke within two days of use."
text3 = "The product is okay, not too great but not too bad either."


In [12]:
# 2. Analyze Sentiment Using TextBlob
def analyze_sentiment(text):
    blob = TextBlob(text)
    polarity = blob.sentiment.polarity
    subjectivity = blob.sentiment.subjectivity
    return polarity, subjectivity

for text in [text1, text2, text3]:
    polarity, subjectivity = analyze_sentiment(text)
    print(f"Text: {text}\nPolarity: {polarity:.2f}, Subjectivity: {subjectivity:.2f}\n")


Text: I love this product! It's absolutely amazing and works perfectly.
Polarity: 0.74, Subjectivity: 0.83

Text: This product is terrible. It broke within two days of use.
Polarity: -1.00, Subjectivity: 1.00

Text: The product is okay, not too great but not too bad either.
Polarity: 0.20, Subjectivity: 0.64



In [22]:
# 3.Sentiment Analysis Using Bayesian Classification
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.metrics import accuracy_score, classification_report
from nltk.stem import PorterStemmer

In [23]:
# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [24]:
# Load dataset (IMDb reviews from NLTK)
from nltk.corpus import movie_reviews
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


True

In [25]:
def load_data():
    documents = [(movie_reviews.raw(fileid), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    df = pd.DataFrame(documents, columns=['text', 'label'])
    df['label'] = df['label'].map({'pos': 1, 'neg': 0})
    return df

In [26]:
# Text preprocessing
def preprocess_text(text):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    text = text.lower()  # Lowercasing
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = word_tokenize(text)  # Tokenization
    tokens = [stemmer.stem(word) for word in tokens if word not in stop_words]  # Stopword removal & Stemming
    return ' '.join(tokens)

In [27]:
# Load and preprocess data
df = load_data()
df['clean_text'] = df['text'].apply(preprocess_text)

In [28]:
# Split dataset
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['label'], test_size=0.2, random_state=42)

In [29]:
# Feature extraction (TF-IDF & Bag of Words)
vectorizers = {
    "TF-IDF": TfidfVectorizer(),
    "Bag of Words": CountVectorizer()
}


In [32]:
for name, vectorizer in vectorizers.items():
    print(f"\nUsing {name}:")
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    print("Train Shape:", X_train_vec.shape)
    print("Test Shape:", X_test_vec.shape)



Using TF-IDF:
Train Shape: (1600, 28880)
Test Shape: (400, 28880)

Using Bag of Words:
Train Shape: (1600, 28880)
Test Shape: (400, 28880)


In [35]:
for name, vectorizer in vectorizers.items():
    print(f"\nUsing {name}:")

    # Transform the dataset
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)

    # Train Naïve Bayes classifier
    nb_classifier = MultinomialNB()
    nb_classifier.fit(X_train_vec, y_train)

    # Make predictions
    y_pred = nb_classifier.predict(X_test_vec)

    # Convert 0 and 1 to "Negative" and "Positive"
    sentiment_labels = {0: "Negative", 1: "Positive"}

    # Reset index to avoid KeyError and convert X_test to list
    X_test = X_test.reset_index(drop=True)
    X_test_list = X_test.tolist()  # Convert Series to List if necessary

    # Print the first 10 sentences with predictions
    print("\nSample Predictions:")
    for i in range(min(10, len(X_test_list))):  # Ensure we don't exceed dataset size
        print(f"Sentence: {X_test_list[i]}\nPredicted Sentiment: {sentiment_labels[y_pred[i]]}\n")



Using TF-IDF:

Sample Predictions:
Sentence: verdict spinechil drama horror maestro stephen king featur outstand oscarwin perform kathi bate geez french saunder field day set work parodi sorri nonbritish reader may familiar french saunder apolog pair british comedienn jennif saunder later went becom edina monsoon absolut fabul seri film spoof year back includ alien exorcist miseri needless say amidst chucklesom imperson kathi bate resembl quit uncanni dawn french got pretti nasti sledgehamm reach jennif saunder leg despit linger memori sketch although ive seen film coupl time hobbl scene less disturb im still left scream telli revuls may memor scene certainli worth watch stephen king whose film tv adapt tend vari qualiti strike gold simpl yet strikingli compel tale nice craft psycholog horror effortlessli succe draw plight writer phil sheldon jame caan rescu car accid anni wilk bate introduc writer number one fan soon becom prison script kathi bate beauti master perform oscar work arm

In [37]:
    # ✅ Evaluation Metrics
    print("\nEvaluation Metrics")
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}")

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred, target_names=["Negative", "Positive"]))


Evaluation Metrics
Accuracy: 0.8100

Classification Report:
              precision    recall  f1-score   support

    Negative       0.80      0.83      0.81       199
    Positive       0.82      0.79      0.81       201

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.81       400
weighted avg       0.81      0.81      0.81       400

