<a href="https://colab.research.google.com/github/nishita-an/NLP-basics/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install spacy




In [None]:
import spacy

# English pipeline (small model)
nlp = spacy.load("en_core_web_sm")

# Run the sample sentence through the pipeline
doc = nlp("Apple is looking at buying U.K. startup for $1 billion.")

# Tokenization: keep the raw text of every token in document order
tokens = [tok.text for tok in doc]

print(tokens)





['Apple', 'is', 'looking', 'at', 'buying', 'U.K.', 'startup', 'for', '$', '1', 'billion', '.']


In [None]:
# Named Entity Recognition (NER): pair each entity span found in `doc`
# with its label string.
entities = [(span.text, span.label_) for span in doc.ents]

print(entities)


[('Apple', 'ORG'), ('U.K.', 'GPE'), ('$1 billion', 'MONEY')]


In [None]:
# Part-of-Speech (POS) tagging: pair every token of `doc` with its
# coarse-grained part-of-speech label.
pos_tags = [(tok.text, tok.pos_) for tok in doc]

print(pos_tags)

[('Apple', 'PROPN'), ('is', 'AUX'), ('looking', 'VERB'), ('at', 'ADP'), ('buying', 'VERB'), ('U.K.', 'PROPN'), ('startup', 'NOUN'), ('for', 'ADP'), ('$', 'SYM'), ('1', 'NUM'), ('billion', 'NUM'), ('.', 'PUNCT')]


In [None]:
!pip install nltk



In [None]:
 # Fetch the NLTK data packages needed by the NLTK cells that follow.
 # Each call downloads one named resource; the value of the last call is what
 # the notebook displays as this cell's result.
 import nltk
 nltk.download('punkt_tab')  # tokenizer data (presumably what word_tokenize relies on)
 nltk.download('stopwords')  # stop-word lists, read later through nltk.corpus.stopwords
 nltk.download('averaged_perceptron_tagger_eng')  # tagger model (presumably what nltk.pos_tag relies on)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

Tokenization


In [None]:
from nltk.tokenize import word_tokenize
# Sample sentence to split into word and punctuation tokens with NLTK.
text = "Natural Language Processing is fun!"
tokens = word_tokenize(text)  # NOTE: rebinds `tokens`, previously set by the spaCy tokenization cell
print(tokens)


['Natural', 'Language', 'Processing', 'is', 'fun', '!']


Stop words

In [None]:
from nltk.corpus import stopwords
# Build the English stop-word set once so membership checks are cheap.
stop_words = set(stopwords.words('english'))

# Keep only tokens whose lowercase form is not a stop word
# (the original casing of the surviving tokens is preserved).
filtered_tokens = [tok for tok in tokens if tok.lower() not in stop_words]
print(filtered_tokens)

['Natural', 'Language', 'Processing', 'fun', '!']


Stemming

In [None]:
from nltk.stem import PorterStemmer
# Reduce every remaining token to its Porter stem.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, filtered_tokens))
print(stemmed_words)


['natur', 'languag', 'process', 'fun', '!']


POS Tagging

In [None]:
from nltk import pos_tag
# Tag the full NLTK token list (`tokens`, not the stop-word-filtered list)
# with part-of-speech labels.
pos_tags = pos_tag(tokens)  # NOTE: rebinds `pos_tags`, previously set by the spaCy POS cell
print(pos_tags)

[('Natural', 'JJ'), ('Language', 'NNP'), ('Processing', 'NNP'), ('is', 'VBZ'), ('fun', 'NN'), ('!', '.')]


In [None]:
from transformers import pipeline
# Pin the checkpoint and revision explicitly instead of relying on the
# library's default for the task: the default can change between releases,
# which would silently change the results. These are the model and revision
# the unpinned call resolved to when this notebook was last run.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)
text = "I love this product! It's amazing."
result = classifier(text)
print(result)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu


[{'label': 'POSITIVE', 'score': 0.9998866319656372}]


In [None]:
from transformers import pipeline
# Pin the checkpoint and revision explicitly instead of relying on the
# library's default for the task, so the result stays reproducible across
# library releases. These are the model and revision the unpinned call
# resolved to when this notebook was last run.
classifier = pipeline(
    "sentiment-analysis",
    model="distilbert/distilbert-base-uncased-finetuned-sst-2-english",
    revision="714eb0f",
)
text = "I hate the book."
result = classifier(text)
print(result)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'label': 'NEGATIVE', 'score': 0.9996179342269897}]


Multilingual

In [None]:
# Download spaCy's "xx_ent_wiki_sm" package, which the next cell loads.
# NOTE(review): a runtime restart may be needed before the freshly installed
# package can be loaded — confirm in your environment.
!python -m spacy download xx_ent_wiki_sm


Collecting xx-ent-wiki-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/xx_ent_wiki_sm-3.7.0/xx_ent_wiki_sm-3.7.0-py3-none-any.whl (11.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m44.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xx-ent-wiki-sm
Successfully installed xx-ent-wiki-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('xx_ent_wiki_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import spacy
# Multilingual model (rebinds `nlp` and `doc` from the English cells)
nlp = spacy.load("xx_ent_wiki_sm")
doc = nlp("Bonjour! Comment ça va?")

# Show each token next to the language code spaCy reports for it.
print([(tok.text, tok.lang_) for tok in doc])




[('Bonjour', 'xx'), ('!', 'xx'), ('Comment', 'xx'), ('ça', 'xx'), ('va', 'xx'), ('?', 'xx')]


N-grams


In [None]:
import random
from collections import defaultdict, Counter

class NGramLanguageModel:
    """Count-based n-gram language model with weighted random generation.

    Sentences are split on whitespace, padded with ``n - 1`` ``"<s>"`` start
    markers and terminated with a single ``"</s>"`` end marker.

    Attributes:
        n: Order of the model (1 = unigram, 2 = bigram, ...).
        ngram_counts: Maps a context tuple of ``n - 1`` words to a ``Counter``
            of the words observed immediately after that context.
        context_counts: Maps a context tuple to the total number of times it
            was observed during training.
    """

    def __init__(self, n, seed=None):
        """Create an untrained model.

        Args:
            n: Order of the model; must be at least 1.
            seed: Optional seed for a private random generator. When omitted,
                sampling uses the global ``random`` module state, so callers
                that rely on ``random.seed(...)`` keep working.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError(f"n must be at least 1, got {n!r}")
        self.n = n
        self.ngram_counts = defaultdict(Counter)
        self.context_counts = defaultdict(int)
        # Keep None (rather than the `random` module itself) when unseeded so
        # that instances remain picklable.
        self._rng = None if seed is None else random.Random(seed)

    def train(self, corpus):
        """Accumulate n-gram counts from a corpus.

        Args:
            corpus: Iterable of sentences (strings). A single bare string is
                treated as one sentence instead of being iterated character
                by character.
        """
        if isinstance(corpus, str):
            corpus = [corpus]
        for sentence in corpus:
            words = ["<s>"] * (self.n - 1) + sentence.split() + ["</s>"]
            for i in range(len(words) - self.n + 1):
                ngram = tuple(words[i:i + self.n])
                context = ngram[:-1]
                word = ngram[-1]
                self.ngram_counts[context][word] += 1
                self.context_counts[context] += 1

    def predict_next_word(self, context):
        """Sample the next word for a context, weighted by observed counts.

        Args:
            context: Sequence of the ``n - 1`` preceding words. Tuples, lists
                and other iterables are accepted; a bare string is treated as
                a one-word context.

        Returns:
            A word drawn in proportion to how often it followed ``context``
            during training, or ``"</s>"`` when the context was never seen.
        """
        if isinstance(context, str):
            context = (context,)
        else:
            # Counts are keyed by tuples; coerce so lists are usable too.
            context = tuple(context)
        # .get() avoids creating an empty entry in the defaultdict.
        possible_words = self.ngram_counts.get(context)
        if not possible_words:
            return "</s>"  # Return end-of-sentence token if context is unknown
        rng = self._rng if self._rng is not None else random
        candidates = list(possible_words)
        weights = list(possible_words.values())
        return rng.choices(candidates, weights=weights, k=1)[0]

    def generate_sentence(self, max_length=20):
        """Generate a random sentence from the trained model.

        Args:
            max_length: Maximum number of words to emit.

        Returns:
            The generated words joined by single spaces. Generation stops
            early when the end-of-sentence token is sampled.
        """
        context = ("<s>",) * (self.n - 1)
        sentence = []
        for _ in range(max_length):
            next_word = self.predict_next_word(context)
            if next_word == "</s>":
                break
            sentence.append(next_word)
            # Append first, then drop the oldest entry, so the context always
            # keeps exactly n - 1 words (an empty tuple for a unigram model).
            context = (*context, next_word)[1:]
        return " ".join(sentence)

# Example usage:
corpus = [
    "the cat sat on the mat",
    "the dog barked at the cat",
    "the cat chased the mouse",
    "the dog sat on the rug"
]

# Seed the global random generator so the sampled sentence is the same on
# every run of this cell (generation draws words at random).
random.seed(42)

model = NGramLanguageModel(n=2)  # Bigram model
model.train(corpus)

# Generate a sentence
generated_sentence = model.generate_sentence()
print("Generated sentence:", generated_sentence)


Generated sentence: the cat sat on the cat


Extracting email addresses with regular expressions

In [None]:
import re

def extract_emails(text):
    """Return all email-like substrings of ``text`` in order of appearance.

    An address is matched as a local part (letters, digits and ``._%+-``),
    an ``@``, a domain (letters, digits, dots and hyphens) and a final dot
    followed by at least two letters.
    """
    email_regex = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}')
    return [found.group(0) for found in email_regex.finditer(text)]

# Demo: the sample sentence contains two addresses, both of which should be returned.
print(extract_emails("Contact us at info@example.com or support@site.org"))


['info@example.com', 'support@site.org']


Cosine similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cosine_sim(doc1, doc2):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    A fresh ``TfidfVectorizer`` is fitted on just the two documents, so the
    score depends only on this pair.
    """
    tfidf_rows = TfidfVectorizer().fit_transform([doc1, doc2])
    first_row, second_row = tfidf_rows[0], tfidf_rows[1]
    # cosine_similarity returns a 1x1 matrix for two single-row inputs.
    score_matrix = cosine_similarity(first_row, second_row)
    return score_matrix[0][0]

# Baseline pair: the two sentences have only the word "NLP" in common.
print(cosine_sim("I love NLP.", "NLP is amazing!"))


0.2605556710562624


In [None]:
# The second sentence contains both "love" and "NLP" from the first one.
print(cosine_sim("I love NLP.", "NLP is my love!"))

0.5797386715376658


In [None]:
# "lovely" is a different word form from "love"; presumably the vectorizer
# treats them as unrelated tokens, leaving "NLP" as the only shared term.
print(cosine_sim("I love NLP.", "NLP is lovely!"))

0.2605556710562624


Sentiment analysis using TF-IDF features and scikit-learn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Expanded dataset
data = [
    ("I love this product", "positive"),
    ("This is an amazing movie", "positive"),
    ("I feel great about this!", "positive"),
    ("I hate this movie", "negative"),
    ("This is the worst product ever", "negative"),
    ("I am not happy with this", "negative"),
    ("This product is wonderful", "positive"),
    ("The movie was fantastic", "positive"),
    ("The product broke after a week", "negative"),
    ("I would never buy this again", "negative"),
    ("This is a great phone", "positive"),
    ("I cannot stand this product", "negative"),
    ("Best purchase I ever made", "positive"),
    ("Worst movie ever", "negative"),
    ("Such a bad experience", "negative"),
    ("I am so happy with this purchase", "positive"),
]

# Split data into text and labels
texts, labels = zip(*data)

# Step 1: Split the RAW texts into training and testing sets first, so that no
# test sentence can influence the vocabulary or IDF weights learned below.
# Stratifying keeps the positive/negative ratio equal in both splits, which
# matters on a dataset this small.
texts_train, texts_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.33, random_state=42, stratify=labels
)

# Step 2: Convert text data into numeric features using TfidfVectorizer.
# The vectorizer is fitted on the training texts only and then reused,
# unchanged, for the test texts and any new samples.
# NOTE(review): the built-in English stop-word list may drop negations such as
# "not" or "never" — confirm whether that is desired for sentiment analysis.
vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
X_train = vectorizer.fit_transform(texts_train)
X_test = vectorizer.transform(texts_test)
# Feature matrix for the whole dataset, kept under its original name in case
# other cells read it; it is not used for training or evaluation here.
X = vectorizer.transform(texts)

# Step 3: Train a Naive Bayes classifier (MultinomialNB)
# (rebinds `classifier`, which earlier cells used for the transformers pipeline)
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

# Step 4: Make predictions
y_pred = classifier.predict(X_test)

# Step 5: Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Step 6: Test on a new sample
sample_text = ["This product is fantastic!"]
sample_vectorized = vectorizer.transform(sample_text)
prediction = classifier.predict(sample_vectorized)
print(f"Prediction for '{sample_text[0]}': {prediction[0]}")


Accuracy: 50.00%
Prediction for 'This product is fantastic!': positive


In [None]:
# Try a sentence that mixes a positive clause with a negative one.
# `vectorizer` and `classifier` are the fitted TF-IDF vectorizer and Naive
# Bayes model from the training cell (not the transformers pipeline that
# earlier cells also bound to `classifier`).
sample_text = ["This product is fantastic,but my friend is not happy with it"]
sample_vectorized = vectorizer.transform(sample_text)  # reuse the fitted vocabulary; do not refit
prediction = classifier.predict(sample_vectorized)
print(f"Prediction for '{sample_text[0]}': {prediction[0]}")

Prediction for 'This product is fantastic,but my friend is not happy with it': positive


Embedding of the sentences

In [None]:
pip install sentence-transformers torch


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
from sentence_transformers import SentenceTransformer, util

# Load the embedding model
# (rebinds `model`, which the n-gram cell used for its language model)
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Define sentences
sentence1 = "Hi how are you doing?"
sentence2 = "Hello I'm fine"

# Get embeddings
embedding1 = model.encode(sentence1, convert_to_tensor=True)
embedding2 = model.encode(sentence2, convert_to_tensor=True)

# Compute cosine similarity (`cos_sim` is the current name of the helper;
# `pytorch_cos_sim` is its legacy alias)
similarity = util.cos_sim(embedding1, embedding2)

# Print results: show the shape and a short preview instead of dumping every
# component of each vector into the notebook output.
PREVIEW_DIMS = 5  # number of leading components to display
print(
    f"Embedding for Sentence 1: shape={tuple(embedding1.shape)}, "
    f"first {PREVIEW_DIMS} values={embedding1[:PREVIEW_DIMS].tolist()}"
)
print(
    f"Embedding for Sentence 2: shape={tuple(embedding2.shape)}, "
    f"first {PREVIEW_DIMS} values={embedding2[:PREVIEW_DIMS].tolist()}"
)
print(f"Cosine Similarity: {similarity.item():.4f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling%2Fconfig.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding for Sentence 1: tensor([-7.8131e-03,  2.3613e-02,  6.0112e-02,  6.3286e-02, -2.4763e-02,
        -5.0248e-02,  6.5444e-02, -8.1628e-03, -5.7389e-02,  1.3779e-02,
        -5.4936e-02,  6.5671e-03, -3.0208e-02, -6.0082e-03,  4.7923e-02,
        -6.3706e-03,  3.0688e-02, -6.7537e-02, -1.1745e-01,  3.6270e-03,
        -1.7806e-02, -1.0759e-02,  2.1873e-02,  7.5496e-02, -1.5065e-02,
         4.5704e-03,  2.8684e-03,  3.5725e-02,  1.1380e-02, -7.7622e-02,
        -3.9394e-02,  3.1968e-02, -2.2039e-02, -7.8054e-03, -2.8224e-02,
         4.9844e-02, -1.6806e-02, -1.5534e-01,  2.2300e-02, -1.8420e-02,
         2.8157e-02, -4.7300e-02, -1.5212e-02,  1.4286e-03,  9.3600e-02,
        -6.7231e-02,  4.4301e-02,  3.0689e-02,  1.2586e-01,  1.8397e-02,
        -1.1343e-01, -2.6016e-02,  8.3442e-04,  3.5665e-02,  6.2278e-02,
         4.2765e-02, -5.5186e-02, -4.1812e-02,  3.7567e-02, -4.9586e-02,
         3.9685e-03,  3.8407e-02, -6.3546e-02,  5.6174e-03,  2.6568e-02,
        -2.3607e-02, -6.5