In [1]:
import pandas as pd

# Toy SMS dataset, written as (label, message) rows so each text sits next
# to its class, then pivoted into the column-oriented dict pandas expects.
rows = [
    ("ham", "Hey are we meeting today"),
    ("spam", "Win cash now"),
    ("ham", "Call me when you are free"),
    ("spam", "Free entry in contest text WIN"),
    ("ham", "Lets have dinner tonight"),
]
labels, texts = zip(*rows)
data = {"label": list(labels), "message": list(texts)}

df = pd.DataFrame(data)

# index=False keeps the row index out of the file, so the CSV has exactly
# the two columns "label" and "message".
df.to_csv("spam_data.csv", index=False)

print("CSV file created!")


CSV file created!


In [2]:
# Read the dataset back from "spam_data.csv" (created by the first cell)
# into the DataFrame used by the rest of the notebook.
messages = pd.read_csv("spam_data.csv")
# Plain-text dump of the whole frame; acceptable here because it has only five rows.
print(messages)


  label                         message
0   ham        Hey are we meeting today
1  spam                    Win cash now
2   ham       Call me when you are free
3  spam  Free entry in contest text WIN
4   ham        Lets have dinner tonight


In [3]:
## Data Cleaning And Preprocessing
import re
import nltk
# Make sure NLTK's stopword corpus is available locally; when it is already
# installed the call only reports "already up-to-date" and returns True.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nikki\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Single Porter stemmer instance, reused for every token in the cleaning loop.
ps=PorterStemmer()

In [5]:
# Build the cleaned corpus. For every message: replace non-letters with
# spaces, lowercase, split on whitespace, drop English stopwords, Porter-stem
# the remaining tokens and join them back into one string.

# Build the stopword set once. Previously stopwords.words('english') was
# evaluated inside the comprehension, i.e. again for every single token, and
# membership was tested against a list instead of a set.
english_stopwords = set(stopwords.words('english'))

corpus = []
# Iterate over the column values directly rather than messages['message'][i]:
# the positional lookup by label breaks as soon as the frame has a
# non-default index (e.g. after filtering rows).
for message in messages['message']:
    # The class must be 'a-zA-Z'. The former 'a-zA-z' range also covered the
    # ASCII characters [ \ ] ^ _ ` that sit between 'Z' and 'a', so those
    # symbols were kept instead of being replaced by a space.
    review = re.sub('[^a-zA-Z]', ' ', message)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

In [6]:
# Inspect the cleaned, stemmed text: one string per original message.
corpus

['hey meet today',
 'win cash',
 'call free',
 'free entri contest text win',
 'let dinner tonight']

## Create Bag Of Words

scikit-learn is a machine-learning library for Python.

It provides ready-made tools for machine learning, such as the text vectorizers used below.

In [7]:
## Binary Bag-of-Words model (single-word features)
from sklearn.feature_extraction.text import CountVectorizer
# binary=True stores presence/absence (0/1) instead of raw term counts;
# max_features=100 caps the vocabulary at the 100 most frequent terms.
cv = CountVectorizer(
    binary=True,
    max_features=100,
)

In [8]:
# Learn the vocabulary from `corpus` and encode it as a dense array:
# one row per message, one column per vocabulary term.
X=cv.fit_transform(corpus).toarray()

In [9]:
import numpy as np
# Array display settings: show floats with 3 significant digits, allow very
# long lines so matrix rows are not wrapped, and keep up to 30 items at each
# edge of a dimension before NumPy starts eliding with "...".
np.set_printoptions(
    formatter={"float": lambda value: "%.3g" % value},
    linewidth=100000,
    edgeitems=30,
)
# Show the binary Bag-of-Words matrix (rows = messages, columns = vocabulary terms).
X

array([[0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0]])

### N-Grams

N-grams are contiguous sequences of N words. Using them as features (instead of, or in addition to, single words) lets a model capture local context such as negation (e.g. "not good"), which a unigram Bag of Words cannot capture because it treats every word independently.

In [10]:
# Learned vocabulary: maps each term to the index of its column in X.
cv.vocabulary_

{'hey': np.int64(6),
 'meet': np.int64(8),
 'today': np.int64(10),
 'win': np.int64(12),
 'cash': np.int64(1),
 'call': np.int64(0),
 'free': np.int64(5),
 'entri': np.int64(4),
 'contest': np.int64(2),
 'text': np.int64(9),
 'let': np.int64(7),
 'dinner': np.int64(3),
 'tonight': np.int64(11)}

In [11]:
## Binary Bag-of-Words model over word n-grams
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(2, 3) keeps only bigrams and trigrams (no single words);
# binary=True still records presence/absence rather than counts.
cv = CountVectorizer(
    ngram_range=(2, 3),
    binary=True,
    max_features=100,
)
# Refit on the same corpus; `cv` and `X` now refer to the n-gram model.
X = cv.fit_transform(corpus).toarray()

In [12]:
# N-gram vocabulary: maps each bigram/trigram to the index of its column in X.
cv.vocabulary_

{'hey meet': np.int64(8),
 'meet today': np.int64(12),
 'hey meet today': np.int64(9),
 'win cash': np.int64(14),
 'call free': np.int64(0),
 'free entri': np.int64(6),
 'entri contest': np.int64(4),
 'contest text': np.int64(1),
 'text win': np.int64(13),
 'free entri contest': np.int64(7),
 'entri contest text': np.int64(5),
 'contest text win': np.int64(2),
 'let dinner': np.int64(10),
 'dinner tonight': np.int64(3),
 'let dinner tonight': np.int64(11)}

In [13]:
# Binary n-gram matrix: one row per message, one column per n-gram in cv.vocabulary_.
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0]])

## Create TF-IDF And NGrams

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
# TF-IDF vectorizer with the vocabulary capped at 100 terms. No ngram_range
# is passed, so the features are single words only.
tfidf=TfidfVectorizer(max_features=100)
# Fit on the cleaned corpus and densify; X now holds float TF-IDF weights
# (this replaces the Bag-of-Words matrix previously stored in X).
X=tfidf.fit_transform(corpus).toarray()

In [16]:
import numpy as np
# Compact display for the TF-IDF matrix: floats are shown with 3 significant
# digits, lines may be very long so rows are not wrapped, and up to 30 items
# are kept at each edge of a dimension before NumPy elides with "...".
np.set_printoptions(
    formatter={"float": lambda value: "%.3g" % value},
    linewidth=100000,
    edgeitems=30,
)

In [17]:
# TF-IDF matrix (rows = messages, columns = terms), shown with 3 significant digits.
X

array([[0, 0, 0, 0, 0, 0, 0.577, 0, 0.577, 0, 0.577, 0, 0],
       [0, 0.778, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.628],
       [0.778, 0, 0, 0, 0, 0.628, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0.482, 0, 0.482, 0.389, 0, 0, 0, 0.482, 0, 0, 0.389],
       [0, 0, 0, 0.577, 0, 0, 0, 0.577, 0, 0, 0, 0.577, 0]])

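TF-IDF captures word importance but not word relationships.
N-grams are required to capture contextual meaning like negation.
Hence, TF-IDF with N-grams is preferred in sentiment analysis.
Note that the TF-IDF cells above use single words only; to combine the two, pass `ngram_range` to the vectorizer, e.g. `TfidfVectorizer(max_features=100, ngram_range=(1, 2))`.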