In [19]:
import spacy

In [22]:
nlp = spacy.load("en_core_web_sm")

In [23]:
paragraph = """India runs on open source. From small startups to large enterprises, developers across the country rely on open source tools and libraries to build fast, reliable, and cost-effective systems."""

In [25]:
doc = nlp(paragraph)

In [26]:
tokens = [token.text for token in doc ]
print(tokens)

['India', 'runs', 'on', 'open', 'source', '.', 'From', 'small', 'startups', 'to', 'large', 'enterprises', ',', 'developers', 'across', 'the', 'country', 'rely', 'on', 'open', 'source', 'tools', 'and', 'libraries', 'to', 'build', 'fast', ',', 'reliable', ',', 'and', 'cost', '-', 'effective', 'systems', '.']


In [27]:
tokens_lower = [token.text.lower() for token in doc ]
tokens_lower

['india',
 'runs',
 'on',
 'open',
 'source',
 '.',
 'from',
 'small',
 'startups',
 'to',
 'large',
 'enterprises',
 ',',
 'developers',
 'across',
 'the',
 'country',
 'rely',
 'on',
 'open',
 'source',
 'tools',
 'and',
 'libraries',
 'to',
 'build',
 'fast',
 ',',
 'reliable',
 ',',
 'and',
 'cost',
 '-',
 'effective',
 'systems',
 '.']

In [32]:
tokens_stop = [token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha ]
tokens_stop

['india',
 'run',
 'open',
 'source',
 'small',
 'startup',
 'large',
 'enterprise',
 'developer',
 'country',
 'rely',
 'open',
 'source',
 'tool',
 'library',
 'build',
 'fast',
 'reliable',
 'cost',
 'effective',
 'system']

In [41]:
for token in doc: 
    print(token.text, '|', token.pos_,'|',token.tag_)

India | PROPN | NNP
runs | VERB | VBZ
on | ADP | IN
open | ADJ | JJ
source | NOUN | NN
. | PUNCT | .
From | ADP | IN
small | ADJ | JJ
startups | NOUN | NNS
to | ADP | IN
large | ADJ | JJ
enterprises | NOUN | NNS
, | PUNCT | ,
developers | NOUN | NNS
across | ADP | IN
the | DET | DT
country | NOUN | NN
rely | VERB | VBP
on | ADP | IN
open | ADJ | JJ
source | NOUN | NN
tools | NOUN | NNS
and | CCONJ | CC
libraries | NOUN | NNS
to | PART | TO
build | VERB | VB
fast | ADJ | JJ
, | PUNCT | ,
reliable | ADJ | JJ
, | PUNCT | ,
and | CCONJ | CC
cost | NOUN | NN
- | PUNCT | HYPH
effective | ADJ | JJ
systems | NOUN | NNS
. | PUNCT | .


In [42]:
print("clean token :", tokens_stop)

clean token : ['india', 'run', 'open', 'source', 'small', 'startup', 'large', 'enterprise', 'developer', 'country', 'rely', 'open', 'source', 'tool', 'library', 'build', 'fast', 'reliable', 'cost', 'effective', 'system']


In [44]:
clean_doc = " ".join(tokens_stop)
final_doc = nlp(clean_doc)

In [45]:
final_doc

india run open source small startup large enterprise developer country rely open source tool library build fast reliable cost effective system

In [46]:
for token in final_doc:
    print(token.text, '|', token.pos_, '|', token.tag_)

india | PROPN | NNP
run | VERB | VBD
open | ADJ | JJ
source | NOUN | NN
small | ADJ | JJ
startup | NOUN | NN
large | ADJ | JJ
enterprise | NOUN | NN
developer | NOUN | NN
country | NOUN | NN
rely | VERB | VBP
open | ADJ | JJ
source | NOUN | NN
tool | NOUN | NN
library | NOUN | NN
build | VERB | VBP
fast | ADV | RB
reliable | ADJ | JJ
cost | NOUN | NN
effective | ADJ | JJ
system | NOUN | NN


In [51]:
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer

vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(final_doc)

TypeError: 'int' object is not callable

In [54]:
cleaned_text = " ".join([token.lemma_ for token in final_doc if token.is_alpha and not token.is_stop])
final_doc_list = [cleaned_text]

In [55]:
print(final_doc_list)

['india run open source small startup large enterprise developer country rely open source tool library build fast reliable cost effective system']


In [56]:
vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(final_doc_list)

In [57]:
print("Bag of Words Vocabulary:", vectorizer.get_feature_names_out())
print("Bag of Words matrix:\n", X_counts.toarray())

Bag of Words Vocabulary: ['build' 'cost' 'country' 'developer' 'effective' 'enterprise' 'fast'
 'india' 'large' 'library' 'open' 'reliable' 'rely' 'run' 'small' 'source'
 'startup' 'system' 'tool']
Bag of Words matrix:
 [[1 1 1 1 1 1 1 1 1 1 2 1 1 1 1 2 1 1 1]]


In [58]:
tfidf = TfidfVectorizer()
X_tfidf = tfidf.fit_transform(final_doc_list)
print("TF-IDF Vocabulary:", tfidf.get_feature_names_out())
print("TF-IDF matrix:\n", X_tfidf.toarray().round(2))

TF-IDF Vocabulary: ['build' 'cost' 'country' 'developer' 'effective' 'enterprise' 'fast'
 'india' 'large' 'library' 'open' 'reliable' 'rely' 'run' 'small' 'source'
 'startup' 'system' 'tool']
TF-IDF matrix:
 [[0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.4 0.2 0.2 0.2 0.2 0.4 0.2 0.2
  0.2]]


In [59]:
import pandas as pd


In [60]:
df = pd.read_csv('spam.csv', encoding='latin-1')
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [63]:
df.column = ['v1', 'v2']
df['v1'] = df['v1'].map({'ham': 0, 'spam': 1})
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

x = df['v2']
y = df['v1']

In [68]:
df['label'] = df['label'].str.strip().map({'ham': 0, 'spam': 1})


AttributeError: Can only use .str accessor with string values!