Natural Language Processing (NLP) is a branch of artificial intelligence (AI) that focuses on
the interaction between computers and human (natural) languages. It enables machines to read,
understand, interpret, and generate human language in a valuable and meaningful way.

🔍 Key Areas of NLP:
1. Text Processing:

-->Tokenization

-->Lemmatization/Stemming

-->Stopword Removal

2. Syntactic Analysis:

-->Part-of-Speech (POS) Tagging

-->Parsing (Dependency & Constituency)

3. Semantic Analysis:

-->Named Entity Recognition (NER)

-->Word Sense Disambiguation

-->Sentiment Analysis

4. Discourse & Pragmatics:

-->Co-reference Resolution

-->Discourse Structure

5. Applications:

-->Machine Translation (e.g., Google Translate)

-->Chatbots and Virtual Assistants (e.g., Siri, Alexa, ChatGPT)

-->Text Summarization

-->Text Classification (e.g., spam detection)

-->Question Answering

-->Language Modeling (e.g., GPT, BERT)



In [3]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


True

In [4]:
# Multi-sentence sample text used by the sentence- and word-tokenisation cells.
document = """It was a very pleasant day. The weather was cool and there were light showers. 
I went to the market to buy some fruits."""

# Single noisy sentence (digits, commas, a double space and an e-mail address)
# used by the regex-cleaning, stop-word and stemming cells.
sentence = "Send all the 50 documents  related to chapters 1,2,3 at shubham@cb.com"

In [5]:
from nltk.tokenize import sent_tokenize,word_tokenize

In [None]:
# Split the sample paragraph into a list of sentence strings.
sents = sent_tokenize(document)
print(sents)

['It was a very pleasant day.', 'The weather was cool and there were light showers.', 'I went to the market to buy some fruits.']


In [7]:
# Split the sample paragraph into word-level tokens; punctuation such as '.'
# comes back as a separate token.
words = word_tokenize(document)
print(words)

['It', 'was', 'a', 'very', 'pleasant', 'day', '.', 'The', 'weather', 'was', 'cool', 'and', 'there', 'were', 'light', 'showers', '.', 'I', 'went', 'to', 'the', 'market', 'to', 'buy', 'some', 'fruits', '.']


In [None]:
import re #re-regular expression

In [9]:
# Hand-rolled tokeniser: split on any run of commas and/or whitespace, so
# "1,2,3" yields three tokens and the double space yields no empty token.
re.split(r'[,\s]+',sentence)

['Send',
 'all',
 'the',
 '50',
 'documents',
 'related',
 'to',
 'chapters',
 '1',
 '2',
 '3',
 'at',
 'shubham@cb.com']

In [10]:
# Delete every character that is not an ASCII letter, whitespace, '@' or '.',
# which removes the digits and commas but keeps the e-mail address intact.
# The result is only displayed here; `sentence` itself is not modified.
re.sub(r'[^a-zA-Z\s@.]+','',sentence)

'Send all the  documents  related to chapters  at shubham@cb.com'

In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [13]:
from nltk.corpus import stopwords

# English stop-word list as a set, used below for membership tests when
# filtering tokens.
sw = set(stopwords.words('english'))

In [14]:
print(sw)

{'if', 'why', 'will', "needn't", 'out', 've', 'mightn', 'whom', 'hadn', 'any', 'between', 'its', "she'd", 'haven', 'such', 'into', 'or', 'that', "it's", 'him', "don't", 'during', 'yourselves', "we'd", 'here', 'shan', 'from', 'more', 'won', 'how', 'couldn', 'don', "i'm", "didn't", 'over', "they're", 'ourselves', 'do', "she's", "weren't", 'herself', 'been', 'wouldn', 'until', "she'll", 'myself', "haven't", 'their', "doesn't", 'for', 'itself', 'her', 'theirs', 'isn', 'once', 'we', 'all', 'she', 'some', "mightn't", 'didn', 'when', 'those', 'm', 'each', "won't", 'himself', "i'd", 's', 'below', "you're", 'not', 'weren', 'd', "it'd", "you'll", 'aren', 'what', 'as', 'own', 'few', 'our', 't', 'in', 'were', 'while', 'other', 'me', 'has', 'very', 'of', 'my', 'then', 'hasn', "they'd", 'before', 'doing', 'down', "we've", 'only', "wasn't", 'had', 'further', 'i', 'again', 'ma', 'on', 'mustn', "should've", 'against', 'than', 'who', "hasn't", 'was', 'because', 'too', 'and', 'it', 'by', 'is', 'now', 'mo

In [15]:
# Clean the sample sentence and drop stop words.
#
# The cleaned text goes into its own variable instead of overwriting
# `sentence`, so the earlier demo cells still see the raw sentence (digits
# included) when they are re-run.
cleaned_sentence = re.sub(r'[^a-zA-Z\s@.]+', '', sentence)

# - `if t` drops the empty strings re.split() produces when the text starts
#   or ends with a separator.
# - The stop-word list is lower-case, so compare the lower-cased token;
#   otherwise a capitalised stop word such as "The" would slip through.
tokens = [
    t
    for t in re.split(r'[,\s]+', cleaned_sentence)
    if t and t.lower() not in sw
]
tokens

['Send', 'documents', 'related', 'chapters', 'shubham@cb.com']

In [16]:
from nltk.stem.snowball import SnowballStemmer, PorterStemmer
from nltk.stem.lancaster import LancasterStemmer

In [17]:
# English Snowball stemmer; reused by the preprocessing functions further down.
sst = SnowballStemmer('english')
# The stemmer strips the "-ed" suffix without checking that the word exists.
# NOTE(review): 'deafeated' looks like a typo for 'defeated' - confirm.
sst.stem('deafeated')

'deafeat'

In [18]:
sst.stem('curl')

'curl'

In [19]:
# Irregular forms are not mapped to their base word: 'ran' comes back unchanged.
sst.stem('ran')

'ran'

In [20]:
ps = PorterStemmer()

In [21]:
ps.stem('curl')

'curl'

In [22]:
# A stem need not be a dictionary word: 'curly' is reduced to 'curli'.
ps.stem('curly')

'curli'

In [23]:
# Clean the sample sentence, drop stop words and stem what is left.
#
# The cleaned text is kept in its own variable so the shared `sentence`
# variable is not overwritten and this cell gives the same result on re-run.
cleaned_sentence = re.sub(r'[^a-zA-Z\s@.]+', '', sentence)

# - `if t` drops the empty strings re.split() produces when the text starts
#   or ends with a separator.
# - The stop-word list is lower-case, so compare the lower-cased token;
#   otherwise a capitalised stop word such as "The" would be stemmed and kept.
tokens = [
    sst.stem(t)
    for t in re.split(r'[,\s]+', cleaned_sentence)
    if t and t.lower() not in sw
]
tokens

['send', 'document', 'relat', 'chapter', 'shubham@cb.com']

In [25]:
import nltk
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [26]:
# Part-of-speech tagging on one pre-tokenised sentence. pos_tag_sents takes a
# list of token lists and returns, per sentence, a list of (token, tag) pairs.
nltk.pos_tag_sents([['I', 'made' ,'10', 'runs']])

[[('I', 'PRP'), ('made', 'VBD'), ('10', 'CD'), ('runs', 'NNS')]]

In [28]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\asus\AppData\Roaming\nltk_data...


True

In [29]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# Initialize the WordNet-based lemmatizer
lemmatizer = WordNetLemmatizer()

# Example words: two verb forms, an adjective and a plural noun
words = ["ran", "flies", "better", "geese"]

# Lemmatize every word as a verb. Because the part of speech is fixed to
# VERB, only the verb forms are reduced ('ran' -> 'run', 'flies' -> 'fly');
# 'better' and 'geese' come back unchanged in the output below.
lemmatized_words = [lemmatizer.lemmatize(word, pos=wordnet.VERB) for word in words]
print("Lemmatized Words:", lemmatized_words)

Lemmatized Words: ['run', 'fly', 'better', 'geese']


In [30]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import pandas as pd

In [31]:
def preprocess_txt(sent, stemmer=None, stop_words=None):
    """Normalise one sentence into a space-separated string of stemmed tokens.

    Steps: lower-case, delete every character that is not an ASCII letter or
    whitespace, split on whitespace, drop stop words, stem the rest.

    Args:
        sent: Input text.
        stemmer: Object with a ``stem(word)`` method. Defaults to the
            notebook-level Snowball stemmer ``sst``.
        stop_words: Container of lower-case words to discard. Defaults to the
            notebook-level stop-word set ``sw``.

    Returns:
        The surviving stems joined by single spaces. The result is the empty
        string when nothing survives, and never has leading, trailing or
        doubled spaces.
    """
    # Resolve the notebook globals lazily so one-argument calls keep working.
    if stemmer is None:
        stemmer = sst
    if stop_words is None:
        stop_words = sw

    sent = re.sub(r'[^a-zA-Z\s]+', '', sent.lower())
    # str.split() with no argument discards empty strings, unlike
    # re.split(r'\s+'), which yields '' for leading/trailing whitespace; those
    # empty tokens used to be stemmed and joined, leaving stray spaces.
    return ' '.join(stemmer.stem(w) for w in sent.split() if w not in stop_words)

In [32]:
# Toy corpus: preprocess two short sentences, then build a bag-of-words
# matrix over unigrams and bigrams.
sents = [
    preprocess_txt(raw)
    for raw in ('I like to eat Pizza', 'I donot like to watch movies')
]
print(sents)

cv = CountVectorizer(ngram_range=(1, 2))
counts = cv.fit_transform(sents)
# Densify only because the toy matrix is tiny and easier to read that way.
vec = counts.toarray()
print(vec)

['like eat pizza', 'donot like watch movi']
[[0 0 1 1 1 1 0 0 1 0 0]
 [1 1 0 0 1 0 1 1 0 1 1]]


In [33]:
# Vocabulary learned by the vectorizer (unigrams and bigrams); entry i names
# column i of the count matrix printed above.
cv.get_feature_names_out()

array(['donot', 'donot like', 'eat', 'eat pizza', 'like', 'like eat',
       'like watch', 'movi', 'pizza', 'watch', 'watch movi'], dtype=object)

In [35]:
# Load the e-mail dataset from the current working directory. Later cells use
# its 'text' column as input and its 'spam' column as the label.
data = pd.read_csv('emails.csv')

In [36]:
data


Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1
...,...,...
5723,Subject: re : research and development charges...,0
5724,"Subject: re : receipts from visit jim , than...",0
5725,Subject: re : enron case study update wow ! a...,0
5726,"Subject: re : interest david , please , call...",0


In [37]:
def process_text(text, stemmer=None, stop_words=None):
    """Turn one raw e-mail into a space-separated string of stemmed tokens.

    Steps: lower-case, remove a leading "subject:" marker if present, delete
    every character that is not an ASCII letter or whitespace, split on
    whitespace, drop stop words, stem the rest.

    Args:
        text: Raw e-mail text, e.g. "Subject: ...".
        stemmer: Object with a ``stem(word)`` method. Defaults to the
            notebook-level Snowball stemmer ``sst``.
        stop_words: Container of lower-case words to discard. Defaults to the
            notebook-level stop-word set ``sw``.

    Returns:
        The surviving stems joined by single spaces, with no leading, trailing
        or doubled spaces.
    """
    # Resolve the notebook globals lazily so one-argument calls
    # (e.g. Series.apply(process_text)) keep working.
    if stemmer is None:
        stemmer = sst
    if stop_words is None:
        stop_words = sw

    text = text.lower()
    # Remove the marker only when it is really there. An unconditional [9:]
    # slice would cut real content from any text that lacks the prefix.
    prefix = 'subject:'
    if text.startswith(prefix):
        text = text[len(prefix):]

    text = re.sub(r'[^a-zA-Z\s]+', '', text)
    # str.split() discards empty strings, so text that starts with removed
    # characters (e.g. "4 color ...") no longer produces a leading space.
    return ' '.join(stemmer.stem(w) for w in text.split() if w not in stop_words)

In [38]:
# Replace the raw e-mail text with its preprocessed form.
# NOTE(review): this reads and writes the same column, so re-running the cell
# applies process_text to already-processed text; reload `data` before
# re-running.
data['text'] = data['text'].apply(process_text)

In [39]:
data['text']

0       natur irresist corpor ident lt realli hard rec...
1       stock trade gunsling fanni merril muzo colza a...
2       unbeliev new home made easi im want show homeo...
3        color print special request addit inform clic...
4       money get softwar cds softwar compat great gro...
                              ...                        
5723    research develop charg gpg forward shirley cre...
5724    receipt visit jim thank invit visit lsu shirle...
5725    enron case studi updat wow day super thank muc...
5726    interest david pleas call shirley crenshaw ass...
5727    news aurora updat aurora version fastest model...
Name: text, Length: 5728, dtype: object

In [40]:
# Model inputs (preprocessed e-mail strings) and labels (the 'spam' column)
# as NumPy arrays.
X = data['text'].values
y = data['spam'].values

In [41]:
# Fresh unigram + bigram vectorizer for the e-mail corpus. This rebinds `cv`,
# which previously held the vectorizer fitted on the two toy sentences.
cv = CountVectorizer(ngram_range=(1,2))

In [42]:
# Learn the vocabulary from every e-mail and build the sparse document-term
# count matrix (one row per e-mail).
x_data = cv.fit_transform(X)

In [43]:
x_data.shape

(5728, 323076)

In [44]:
x_data

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 1141453 stored elements and shape (5728, 323076)>

In [45]:
print(x_data[0])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 204 stored elements and shape (1, 323076)>
  Coords	Values
  (0, 187281)	1
  (0, 145101)	1
  (0, 59931)	1
  (0, 133560)	1
  (0, 165352)	1
  (0, 233151)	1
  (0, 124873)	1
  (0, 234592)	1
  (0, 51213)	3
  (0, 170898)	4
  (0, 112752)	1
  (0, 279224)	1
  (0, 138840)	1
  (0, 145334)	1
  (0, 119468)	2
  (0, 39196)	1
  (0, 163162)	4
  (0, 275541)	1
  (0, 271722)	1
  (0, 202274)	1
  (0, 311776)	2
  (0, 168017)	2
  (0, 282787)	1
  (0, 185039)	2
  (0, 82723)	1
  :	:
  (0, 80683)	1
  (0, 316129)	1
  (0, 288773)	1
  (0, 33842)	1
  (0, 67907)	1
  (0, 5756)	1
  (0, 170990)	1
  (0, 31495)	1
  (0, 168193)	1
  (0, 114531)	1
  (0, 33003)	1
  (0, 250525)	1
  (0, 123070)	1
  (0, 227310)	1
  (0, 300194)	1
  (0, 11023)	1
  (0, 41807)	1
  (0, 100365)	1
  (0, 103419)	1
  (0, 279498)	1
  (0, 165001)	1
  (0, 243157)	1
  (0, 47456)	1
  (0, 164104)	1
  (0, 215727)	1


In [46]:
from sklearn.linear_model import LogisticRegression

In [47]:
# Logistic-regression spam classifier with default settings (L2 penalty,
# C=1.0, lbfgs solver, max_iter=100, as shown in the output below).
clf = LogisticRegression()
# Fitted on every row of x_data; no rows are held out for evaluation.
clf.fit(x_data,y)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [48]:
# Scoring on the rows the model was fitted on only measures memorisation.
# With about 323k n-gram features for 5.7k e-mails it is trivially 1.0.
# Report 5-fold cross-validated accuracy instead.
#
# cross_val_score fits clones of `clf`, so the fitted `clf` is left untouched.
# Caveat: the CountVectorizer vocabulary was still learned from all rows; for
# a fully leak-free estimate, wrap the vectorizer and classifier in a Pipeline.
from sklearn.model_selection import cross_val_score

cross_val_score(clf, x_data, y, cv=5).mean()