In [1]:
#Natural Language Processing with NLTK, SpaCy, Word2Vec, and TF-IDF
#Objective: In this assignment, you will learn how to perform Natural Language Processing tasks using Python libraries such as NLTK, SpaCy, Word2Vec, and TF-IDF. You will practice techniques such as tokenization, stemming, lemmatization, and document similarity calculation using these libraries. You will use the BBC News dataset available on Kaggle.
#Tasks:
#1. Import the necessary libraries: Start by importing the required libraries, including NLTK, SpaCy, gensim, and scikit-learn.
#2. Load the dataset: Load the BBC News dataset (BBC_DATA.csv) into a pandas DataFrame using the read_csv() function. The dataset contains 2,225 rows and 2 columns, with the first column containing the text of the news articles.
#3. Tokenization with NLTK: Implement tokenization using NLTK's word_tokenize() and sent_tokenize() functions. Apply these functions to a sample news article from the dataset.
#4. Stemming and Lemmatization with NLTK: Implement stemming and lemmatization using NLTK's PorterStemmer and WordNetLemmatizer functions. Apply these functions to a sample news article from the dataset.
#5. Named Entity Recognition with SpaCy: Use SpaCy's pre-trained model to perform named entity recognition on a sample news article from the dataset. Visualize the named entities using displaCy.
#6. Word2Vec with gensim: Implement Word2Vec using gensim's Word2Vec function on the entire dataset. Train the model and get the vector representation of a sample word.
#7. TF-IDF with scikit-learn: Implement TF-IDF using scikit-learn's TfidfVectorizer function on the entire dataset. Transform the dataset using the fitted vectorizer and calculate the cosine similarity between two news articles.

In [2]:
#Import the necessary libraries: Start by importing the required libraries, including NLTK, SpaCy, gensim, and scikit-learn.

In [3]:
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import spacy
from spacy import displacy
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
#Load the dataset: Load the BBC News dataset (BBC_DATA.csv)

In [5]:
# Read the BBC News CSV (path is relative to the notebook's working directory).
df = pd.read_csv("BBC_DATA.csv")

In [6]:
# Render the loaded DataFrame to inspect its columns and a sample of rows.
df

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment
1487,1590,weak dollar hits reuters revenues at media gro...,business
1488,1587,apple ipod family expands market apple has exp...,tech


In [7]:
#Tokenization with NLTK: Implement tokenization using NLTK's word_tokenize() and sent_tokenize() functions. Apply these functions to a sample news article from the dataset.

In [8]:
# Take the first article in the "Text" column as the running example.
sample_article = df["Text"].iloc[0]

# Split the article into sentences, then into word-level tokens.
sentences = sent_tokenize(sample_article)
tokens = word_tokenize(sample_article)

In [9]:
# Show the sentences that sent_tokenize produced for the sample article.
sentences

['worldcom ex-boss launches defence lawyers defending former worldcom chief bernie ebbers against a battery of fraud charges have called a company whistleblower as their first witness.',
 'mr ebbers has pleaded not guilty to charges of fraud and conspiracy.',
 'prosecution lawyers have argued that mr ebbers orchestrated a series of accounting tricks at worldcom  ordering employees to hide expenses and inflate revenues to meet wall street earnings estimates.',
 'but ms cooper  who now runs her own consulting business  told a jury in new york on wednesday that external auditors arthur andersen had approved worldcom s accounting in early 2001 and 2002. she said andersen had given a  green light  to the procedures and practices used by worldcom.',
 'mr ebber s lawyers have said he was unaware of the fraud  arguing that auditors did not alert him to any problems.',
 'ms cooper also said that during shareholder meetings mr ebbers often passed over technical questions to the company s finance

In [10]:
#Stemming and Lemmatization with NLTK: Implement stemming and lemmatization using NLTK's PorterStemmer and WordNetLemmatizer functions. Apply these functions to a sample news article from the dataset.

In [11]:
# Word normalisers reused by the stemming and lemmatization cells.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

In [12]:
# Stem every word-level token of the sample article.
stemmed_words = [stemmer.stem(word) for word in tokens]

# PorterStemmer.stem() works on a single word; handing it a whole sentence
# treats the sentence as one long "word" and only alters its tail. Tokenise
# each sentence, stem word by word, and join the stems back together so that
# stemmed_sents remains a list of strings (one per sentence).
stemmed_sents = [
    " ".join(stemmer.stem(word) for word in word_tokenize(sent))
    for sent in sentences
]

In [13]:
import nltk
# Fetch the WordNet corpus through NLTK's downloader before the lemmatization
# cell runs; the recorded output reports the package is already up-to-date.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mithilkatkar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
import nltk
# Fetch the 'omw-1.4' package through NLTK's downloader; the recorded output
# reports the package is already up-to-date.
# NOTE(review): presumably needed by WordNetLemmatizer on this NLTK version — confirm.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/mithilkatkar/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [15]:
# Lemmatize every word-level token of the sample article (default POS is noun).
lemmatized_words = [lemmatizer.lemmatize(word) for word in tokens]

# WordNetLemmatizer.lemmatize() looks up a single word; a whole sentence is
# not a WordNet entry, so it would come back unchanged. Tokenise each
# sentence, lemmatize word by word, and re-join so that lemmatized_sents
# remains a list of strings (one per sentence).
lemmatized_sents = [
    " ".join(lemmatizer.lemmatize(word) for word in word_tokenize(sent))
    for sent in sentences
]

In [16]:
#Named Entity Recognition with SpaCy: Use SpaCy's pre-trained model to perform named entity recognition on a sample news article from the dataset. Visualize the named entities using displaCy.

In [17]:
# Load spaCy's small English pipeline and run it over the sample article.
nlp = spacy.load("en_core_web_sm")
doc = nlp(sample_article)

# Highlight the recognised named entities inline in the notebook.
displacy.render(doc, style="ent", jupyter=True)

In [18]:
#Word2Vec with gensim: Implement Word2Vec using gensim's Word2Vec function on the entire dataset. Train the model and get the vector representation of a sample word.

In [19]:
# Tokenise every article in the dataset. The result is stored under its own
# name so that `sentences` (the sample article's sentence strings from the
# tokenization cell) is not silently replaced by a list of token lists, which
# would break the stemming/lemmatization cells if they were re-run.
tokenized_articles = [word_tokenize(article) for article in df["Text"]]

# Train Word2Vec on the whole corpus: 100-dimensional vectors, context window
# of 5, every token kept (min_count=1), sg=0 selects the CBOW architecture.
model = Word2Vec(tokenized_articles, vector_size=100, window=5, min_count=1, sg=0)

In [20]:
# Look up the trained embedding for one vocabulary word.
sample_word = "example"
vector = model.wv.get_vector(sample_word)

In [21]:
# Display the embedding retrieved for sample_word (the model was trained with vector_size = 100).
vector

array([-0.3316333 ,  0.25226447,  0.07201868,  0.1148843 , -0.14090411,
       -0.7096622 ,  0.15487424,  0.60415477, -0.18694656, -0.3533997 ,
       -0.05384324, -0.25216907, -0.15324293,  0.09913897,  0.11254311,
       -0.1686951 ,  0.19000964, -0.39049652, -0.26436657, -0.5225271 ,
        0.29189017, -0.05866744,  0.3447942 , -0.25948232, -0.05789165,
        0.02609388, -0.24864127,  0.00295673, -0.3011712 ,  0.06883724,
        0.40149847, -0.08262256,  0.30749995, -0.46056345, -0.159174  ,
        0.43351027,  0.19000438, -0.01564637, -0.17157827, -0.36175594,
       -0.02863765, -0.4259641 , -0.15508172, -0.089826  ,  0.30286372,
       -0.14766453, -0.3774476 ,  0.00304294,  0.15840791,  0.20998606,
        0.30018795, -0.44000307, -0.04516641, -0.11745839, -0.06962018,
        0.20081423,  0.3487805 ,  0.00761655, -0.2908488 ,  0.03304713,
        0.2264801 ,  0.00533899, -0.07253095, -0.10413266, -0.3293481 ,
        0.31042853, -0.1825267 ,  0.30042624, -0.26368874,  0.34

In [22]:
#TF-IDF with scikit-learn: Implement TF-IDF using scikit-learn's TfidfVectorizer function on the entire dataset. Transform the dataset using the fitted vectorizer and calculate the cosine similarity between two news articles.

In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [24]:
# Fit TF-IDF on every article and transform the corpus in one step.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df["Text"])

# Row positions of the two articles to compare.
article1, article2 = 0, 1

# Cosine similarity between the two TF-IDF rows.
cosine_sim = cosine_similarity(tfidf_matrix[article1], tfidf_matrix[article2])

In [25]:
# Pull the single similarity score out of the result array.
cosine_sim[0, 0]

0.07875931547482325

In [26]:
#Bonus: Choose a different dataset and perform NLP tasks using different techniques. Be creative!
#Sentiment Analysis with NLTK:

In [27]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Fetch the VADER lexicon through NLTK's downloader; the recorded output
# reports the package is already up-to-date.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mithilkatkar/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [28]:
# First sample review; it is scored by sia.polarity_scores in a later cell.
review = "The movie was fantastic! I loved it."

In [29]:
# Analyzer instance from nltk.sentiment.vader, reused for both sample reviews.
sia = SentimentIntensityAnalyzer()

In [30]:
# Score the first review; the 'compound' entry of the result is read in the next cell.
sentiment_scores = sia.polarity_scores(review)

In [31]:
# Map VADER's compound score to a label. The previous `> 0 else 'negative'`
# rule reported neutral text (compound == 0 or close to it) as negative;
# use the conventional VADER cut-offs of +/-0.05 with an explicit neutral band.
compound_score = sentiment_scores['compound']
if compound_score >= 0.05:
    sentiment = 'positive'
elif compound_score <= -0.05:
    sentiment = 'negative'
else:
    sentiment = 'neutral'
print(f"Sentiment: {sentiment}")

Sentiment: positive


In [32]:
# Second sample review; it is scored by sia.polarity_scores in the next cell.
rev = "The movie is boring i didn't like it"

In [33]:
# Score the second review; the 'compound' entry of the result is read in the next cell.
sentiment_score = sia.polarity_scores(rev)

In [34]:
# Map VADER's compound score to a label. The previous `> 0 else 'negative'`
# rule reported neutral text (compound == 0 or close to it) as negative;
# use the conventional VADER cut-offs of +/-0.05 with an explicit neutral band.
compound_score = sentiment_score['compound']
if compound_score >= 0.05:
    sentiment = 'positive'
elif compound_score <= -0.05:
    sentiment = 'negative'
else:
    sentiment = 'neutral'
print(f"Sentiment: {sentiment}")

Sentiment: negative
