In [5]:
import pandas as pd


# Load the data
messages = pd.read_csv('spam2.csv', sep='\t', names=["message"], encoding='latin1')


In [6]:
messages

Unnamed: 0,message
0,"v1,v2,,,"
1,"ham,""Go until jurong point, crazy.. Available ..."
2,"ham,Ok lar... Joking wif u oni...,,,"
3,"spam,Free entry in 2 a wkly comp to win FA Cup..."
4,"ham,U dun say so early hor... U c already then..."
...,...
5570,"spam,""This is the 2nd time we have tried 2 con..."
5571,"ham,Will Ì_ b going to esplanade fr home?,,,"
5572,"ham,""Pity, * was in mood for that. So...any ot..."
5573,"ham,The guy did some bitching but I acted like..."


In [7]:
## Data Cleaning And Preprocessing
import re
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nagar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordlemmatize=WordNetLemmatizer()

In [9]:
corpus=[]
for i in range(0,len(messages)):
    review=re.sub('[^a-zA-z]',' ',messages['message'][i])
    review=review.lower()
    review=review.split()
    review=[wordlemmatize.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
tfidf=TfidfVectorizer(max_features=100)
X=tfidf.fit_transform(corpus).toarray()

In [12]:
import numpy as np
np.set_printoptions(edgeitems=30, linewidth=100000, 
    formatter=dict(float=lambda x: "%.3g" % x))

In [13]:
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.431, 0, 0, 0.458, 0.54, 0, 0.125, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.546, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.276, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.433, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0.313, 0, 0, 0, 0, 0.449, 0, 0, 0, 0, 0, 0, 0, 0.467, 0, 0, 0, 0, 0, 0, 0, 0.542, 0, 0, 0, 0, 0],
       [0.462, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.103, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
###### 
#   N - Grams

In [14]:
tfidf=TfidfVectorizer(max_features=100,ngram_range=(2,2))
X=tfidf.fit_transform(corpus).toarray()

In [15]:
tfidf.vocabulary_

{'ham go': np.int64(32),
 'ham ok': np.int64(49),
 'spam free': np.int64(92),
 'claim call': np.int64(8),
 'call claim': np.int64(2),
 'free call': np.int64(13),
 'chance win': np.int64(7),
 'spam urgent': np.int64(94),
 'ham oh': np.int64(48),
 'ham fine': np.int64(30),
 'ham going': np.int64(33),
 'ham lol': np.int64(44),
 'let know': np.int64(77),
 'ham yeah': np.int64(69),
 'ham tell': np.int64(61),
 'ham yup': np.int64(72),
 'ham see': np.int64(54),
 'ham hello': np.int64(38),
 'ham pls': np.int64(51),
 'please call': np.int64(83),
 'ham great': np.int64(36),
 'lt gt': np.int64(79),
 'ham call': np.int64(23),
 'ham get': np.int64(31),
 'ham know': np.int64(43),
 'ham sorry': np.int64(57),
 'sorry call': np.int64(91),
 'call later': np.int64(6),
 'ham yes': np.int64(70),
 'ham hi': np.int64(40),
 'ham still': np.int64(58),
 'hi hi': np.int64(74),
 'ham really': np.int64(52),
 'customer service': np.int64(10),
 'po box': np.int64(85),
 'ham please': np.int64(50),
 'ham thanks': np.i

In [16]:
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0

In [17]:
print("Tq")

Tq
