### Data preprocessing for Text

In [2]:
import numpy as np
import pandas as pd
import re

### Load data

In [3]:
# Load the SMS spam dataset; the file is decoded as latin1.
df = pd.read_csv("spam.csv", encoding="latin1")

In [4]:
# Inspect the column labels read from the CSV header.
df.columns

Index(['v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')

In [5]:
# Preview the first 10 rows of the raw frame.
df.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [6]:
# Count missing values per column.
df.isna().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [7]:
# (rows, columns) of the raw frame.
df.shape

(5572, 5)

In [8]:
# Keep only the label column (v1) and the message text (v2).
# The frame is rebound instead of mutated with inplace=True, and errors="ignore"
# makes the cell safe to re-run: a second run no longer raises KeyError because
# the columns are already gone.
# NOTE(review): the isna() counts above show a few dozen rows with non-null values
# in the "Unnamed" columns; whatever they hold is discarded here - confirm that is intended.
df = df.drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"], errors="ignore")

In [9]:
# Normalize: lowercase every message.
df["v2"] = df["v2"].str.lower()

In [10]:
# Preview the first rows after dropping the unused columns and lowercasing v2.
df.head()

Unnamed: 0,v1,v2
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


In [13]:
# Replace every character that is not an ASCII letter (digits, punctuation,
# symbols) with a space.
# regex=True is required: since pandas 2.0 Series.str.replace treats the pattern
# as a literal string by default, so without it nothing would be replaced.
df["v2"] = df["v2"].str.replace(r"[^A-Za-z]", " ", regex=True)
df.head(10)

  df.v2= df.v2.str.replace("[^A-Za-z]"," ")


Unnamed: 0,v1,v2
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final ...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don t think he goes to usf he lives arou...
5,spam,freemsg hey there darling it s been week s now...
6,ham,even my brother is not like to speak with me t...
7,ham,as per your request melle melle oru minnaminun...
8,spam,winner as a valued network customer you have b...
9,spam,had your mobile months or more u r entitled to...


In [14]:
# Collapse each run of whitespace into a single space.
# The pattern is a raw string so that \s is not an invalid escape sequence, and
# regex=True is passed explicitly because pandas >= 2.0 would otherwise search
# for the literal text "\s+" and leave the messages unchanged.
df["v2"] = df["v2"].str.replace(r"\s+", " ", regex=True)
df.head(10)

  df.v2= df.v2.str.replace("\s+"," ")


Unnamed: 0,v1,v2
0,ham,go until jurong point crazy available only in ...
1,ham,ok lar joking wif u oni
2,spam,free entry in a wkly comp to win fa cup final ...
3,ham,u dun say so early hor u c already then say
4,ham,nah i don t think he goes to usf he lives arou...
5,spam,freemsg hey there darling it s been week s now...
6,ham,even my brother is not like to speak with me t...
7,ham,as per your request melle melle oru minnaminun...
8,spam,winner as a valued network customer you have b...
9,spam,had your mobile months or more u r entitled to...


In [None]:
# (rows, columns) of the cleaned frame.
df.shape

# Bag of Words using CountVectorizer
- CountVectorizer converts a collection of strings into a frequency representation: one row per document, one column per vocabulary term, and each cell holds how many times the term occurs in that document.
- Limitation: it cannot distinguish more important words from less important ones.
- Limitation: it treats the words that are most abundant in the corpus as the most statistically significant.
- Limitation: it does not capture relationships between words, such as linguistic similarity.

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

**1. Simple unigram tokens: break into words**

In [16]:
# Create a vectorizer restricted to unigrams: ngram_range=(1, 1) means each
# feature is a single token.
cv= CountVectorizer(ngram_range=(1, 1))

In [18]:
# Learn the vocabulary from the messages and encode each message as a row of
# token counts. The result is a SciPy sparse matrix (see the repr below), not a
# dense array.
cv_transform = cv.fit_transform(df["v2"])
cv_transform

<5572x7682 sparse matrix of type '<class 'numpy.int64'>'
	with 71870 stored elements in Compressed Sparse Row format>

In [19]:
# Convert the sparse count matrix to a dense NumPy array so it can be wrapped in
# a DataFrame later. Every zero is materialised, so this is memory-hungry.
# The hasattr guard keeps the cell re-runnable: once cv_transform is already
# dense it has no .toarray() and an unguarded second run would raise AttributeError.
if hasattr(cv_transform, "toarray"):
    cv_transform = cv_transform.toarray()
cv_transform

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
cv_transform.shape # (rows, columns): one row per message, one column per vocabulary term

(5572, 7682)

In [21]:
# Number of vocabulary terms, i.e. the column count of the count matrix.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement.
len(cv.get_feature_names_out())

7682

In [22]:
# Wrap the dense count matrix in a DataFrame whose columns are the vocabulary terms.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
df_CountVectorizer = pd.DataFrame(cv_transform, columns=cv.get_feature_names_out())
df_CountVectorizer.head()

Unnamed: 0,aa,aah,aaniye,aaooooright,aathi,ab,abbey,abdomen,abeg,abel,...,zeros,zf,zhong,zindgi,zoe,zogtorius,zoom,zouk,zs,zyada
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
# Put the ham/spam label (v1) in front of the word counts, joining column-wise
# on the shared row index.
label_and_counts = [df["v1"], df_CountVectorizer]
df_CountVectorizer = pd.concat(label_and_counts, axis="columns")
df_CountVectorizer.head()

Unnamed: 0,v1,aa,aah,aaniye,aaooooright,aathi,ab,abbey,abdomen,abeg,...,zeros,zf,zhong,zindgi,zoe,zogtorius,zoom,zouk,zs,zyada
0,ham,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ham,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,spam,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,ham,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,ham,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Display the combined label + word-count frame.
df_CountVectorizer

In [24]:
# Total occurrences of each word over all messages. Column 0 is skipped by the
# positional slice; the word counts start at column 1.
word_totals = df_CountVectorizer.iloc[:, 1:].sum()
summary = word_totals.reset_index()

In [25]:
# Give the two columns produced by reset_index() readable names.
summary.columns=['word','count']

In [28]:
# Show the five most frequent words in the corpus.
by_count_desc = summary.sort_values("count", ascending=False)
by_count_desc.head(5)

Unnamed: 0,word,count
7637,you,2243
6798,to,2242
6666,the,1332
241,and,979
3185,in,902


**2. Using stopwords: remove stopwords**

In [29]:
# Unigram vectorizer that additionally discards scikit-learn's built-in English
# stop-word list.
cv_stopwords = CountVectorizer(
    ngram_range=(1, 1),
    stop_words="english",
)

In [30]:
# Fit the stop-word-filtered vectorizer on the messages and encode them as counts.
cv_stopwords_transform = cv_stopwords.fit_transform(df["v2"])

In [31]:
# Densify the sparse count matrix. The guard makes the cell idempotent: after
# the name has been rebound to a dense array (or later to a DataFrame) it no
# longer has .toarray(), and an unguarded re-run would raise AttributeError.
if hasattr(cv_stopwords_transform, "toarray"):
    cv_stopwords_transform = cv_stopwords_transform.toarray()

In [32]:
cv_stopwords_transform.shape # stop-word removal drops a few hundred columns compared with the plain unigram vocabulary

(5572, 7414)

In [33]:
# Print the vocabulary terms that disappear once English stop words are removed.
# The filtered vocabulary is fetched once and stored in a set: the original
# rebuilt the whole feature list and scanned it linearly on every iteration.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
kept_terms = set(cv_stopwords.get_feature_names_out())
for term in cv.get_feature_names_out():
    if term not in kept_terms:
        print(term, end=",")

about,above,across,after,afterwards,again,against,all,almost,alone,along,already,also,although,always,am,among,amongst,amount,an,and,another,any,anyhow,anyone,anything,anyway,anywhere,are,around,as,at,back,be,because,become,becomes,been,before,beforehand,behind,being,beside,between,beyond,bill,both,bottom,but,by,call,can,cannot,cant,co,could,cry,de,describe,detail,do,done,down,due,during,each,eg,eight,either,eleven,else,elsewhere,empty,enough,etc,even,ever,every,everyone,everything,everywhere,except,few,fifteen,fifty,fill,find,fire,first,five,for,found,four,from,front,full,further,get,give,go,had,has,hasnt,have,he,hence,her,here,herself,him,himself,his,how,however,hundred,ie,if,in,inc,indeed,interest,into,is,it,its,itself,keep,last,least,less,ltd,made,many,may,me,meanwhile,might,mine,more,most,mostly,move,much,must,my,myself,name,neither,never,next,no,nobody,none,nor,not,nothing,now,nowhere,of,off,often,on,once,one,only,onto,or,other,others,otherwise,our,ours,out,over,own,part,per,perh

In [34]:
# Wrap the dense counts in a DataFrame with one column per remaining term.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
cv_stopwords_transform = pd.DataFrame(
    cv_stopwords_transform,
    columns=cv_stopwords.get_feature_names_out(),
)

In [35]:
# Rank the remaining terms by total count across all messages; show the top 20.
# After reset_index() the totals live in the column labelled 0.
term_totals = cv_stopwords_transform.sum().reset_index()
term_totals.sort_values(0, ascending=False).head(20)

Unnamed: 0,index,0
6846,ur,385
3315,just,371
2668,gt,318
3716,lt,316
4404,ok,292
2377,free,288
3622,ll,269
3409,know,261
3569,like,245
2582,good,245


**3. Consider only tokens matching a pattern + remove stopwords**

In [36]:
# Keep only purely alphabetic tokens of at least 4 letters: the {4,} quantifier
# also drops 3-letter words, and English stop words are removed as before.
# NOTE(review): the variable name says "3letter" and the earlier comment said
# "at least 3 letter"; if 3-letter words were meant to be kept the quantifier
# should be {3,} - confirm the intent before changing it.
cv_stopwords_3letter= CountVectorizer(ngram_range=(1, 1),stop_words='english',token_pattern=r'\b[a-zA-Z]{4,}\b')

In [37]:
# Fit the pattern-restricted vectorizer on the messages and encode them as counts.
cv_stopwords_3letter_transform = cv_stopwords_3letter.fit_transform(df["v2"])

In [38]:
# Densify the sparse count matrix. The guard makes the cell idempotent: once the
# name holds a dense array (or later a DataFrame) there is no .toarray() and an
# unguarded re-run would raise AttributeError.
if hasattr(cv_stopwords_3letter_transform, "toarray"):
    cv_stopwords_3letter_transform = cv_stopwords_3letter_transform.toarray()

In [39]:
cv_stopwords_3letter_transform.shape # the minimum-length pattern removes roughly 1000 more features

(5572, 6448)

In [40]:
# Print the terms removed by the minimum-length token pattern.
# The surviving vocabulary is fetched once and stored in a set: the original
# rebuilt the feature list and scanned it linearly on every iteration.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
kept_terms = set(cv_stopwords_3letter.get_feature_names_out())
for term in cv_stopwords.get_feature_names_out():
    if term not in kept_terms:
        print(term, end=",")

aa,aah,ab,abi,abj,abt,ac,acc,acl,aco,act,ad,add,adi,adp,ads,ae,aft,ag,age,ago,ah,aha,aid,aig,ain,air,aj,ak,aka,al,ali,ami,amk,amp,ams,amt,amy,ana,ans,aom,apo,app,apr,apt,aq,ar,ard,arm,arr,art,asa,ask,asp,ass,ate,atm,av,ava,ave,avo,aww,ax,ay,ayn,ayo,ba,bac,bad,bag,bak,bam,bao,bar,bat,bay,bb,bbc,bbd,bbq,bc,bck,bcm,bcz,bec,bed,beg,ben,bet,bf,bid,big,bin,bit,biz,bk,blu,bmw,bob,boo,bot,box,boy,bp,bpo,brb,bro,bsn,bt,btw,bud,bus,buy,bw,bx,bye,cab,cal,cam,car,cat,cbe,cc,cd,cds,cer,ch,cha,chg,chk,cl,cld,cm,cme,cn,cnl,cnn,com,cos,coz,cps,cr,cro,cs,csc,csh,cst,cts,cud,cum,cup,cut,cuz,cw,cya,da,dad,dai,dan,dao,das,dat,day,db,dd,dec,def,del,dem,den,der,dey,dha,di,did,die,din,dip,dis,dl,dlf,dnt,dob,doc,dog,dom,don,dot,dps,dr,dry,dsn,dt,dub,dun,duo,dvd,dvg,dwn,dx,ea,ear,eat,ec,ed,edu,ee,eek,egg,ego,eh,el,ela,em,emc,en,enc,end,eng,epi,er,ere,erm,err,ese,eta,eva,eve,evn,evo,evr,ew,ex,exe,exp,ext,ey,eye,ez,fa,fab,fal,fan,far,fat,fav,fb,feb,fed,ff,fil,fit,fix,fl,fly,fm,fml,fne,fo,fox,fps,fr,fri,frm,fro,f

In [41]:
# Convert the dense counts to a DataFrame with one column per remaining term.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
cv_stopwords_3letter_transform = pd.DataFrame(
    cv_stopwords_3letter_transform,
    columns=cv_stopwords_3letter.get_feature_names_out(),
)

In [42]:
# Rank the pattern-filtered terms by total count; show the top 20.
# After reset_index() the totals live in the column labelled 0.
term_totals = cv_stopwords_3letter_transform.sum().reset_index()
term_totals.sort_values(0, ascending=False).head(20)

Unnamed: 0,index,0
2901,just,371
2115,free,288
2980,know,261
3111,like,245
2288,good,245
1077,come,230
5732,time,220
3221,love,209
4867,send,199
6148,want,195


**4. Consider only bigram tokens: pairs of words**

In [43]:
# Bigram-only vectorizer: ngram_range=(2, 2) emits pairs of consecutive tokens.
# Tokens must match the 4+ letter pattern, English stop words are removed, and
# min_df=2 keeps only bigrams that occur in at least two messages.
# NOTE(review): scikit-learn presumably builds n-grams after stop-word removal,
# so a pair such as "sorry later" need not be adjacent in the raw text - confirm.
cv_bigrams = CountVectorizer(
    ngram_range=(2, 2),
    stop_words="english",
    token_pattern=r"\b[a-zA-Z]{4,}\b",
    min_df=2,
)

In [44]:
# Fit the bigram vectorizer on the messages and encode them as bigram counts.
cv_bigrams_transform = cv_bigrams.fit_transform(df["v2"])

In [45]:
# Densify the sparse bigram counts. The guard makes the cell idempotent: once
# the name holds a dense array (or later a DataFrame) there is no .toarray() and
# an unguarded re-run would raise AttributeError.
if hasattr(cv_bigrams_transform, "toarray"):
    cv_bigrams_transform = cv_bigrams_transform.toarray()

In [46]:
# (rows, columns): one row per message, one column per retained bigram.
cv_bigrams_transform.shape

(5572, 3324)

In [47]:
# Wrap the dense bigram counts in a DataFrame with one column per bigram.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
cv_bigrams_transform = pd.DataFrame(
    cv_bigrams_transform,
    columns=cv_bigrams.get_feature_names_out(),
)

In [48]:
# Rank bigrams by total count across all messages; show the top 20.
# After reset_index() the totals live in the column labelled 0.
bigram_totals = cv_bigrams_transform.sum().reset_index()
bigram_totals.sort_values(0, ascending=False).head(20)

Unnamed: 0,index,0
2613,sorry later,39
1042,good morning,32
2475,send stop,22
2193,prize guaranteed,22
1898,national rate,20
2445,selected receive,19
1046,good night,19
115,await collection,19
549,customer service,19
3069,urgent mobile,18


# TF-IDF
- TF-IDF looks not only at how frequently a word occurs but also at how informative the word is across the corpus.
- "tf" (term frequency) is the count of a word in a single document (here, one message).
- "df" (document frequency) is the number of documents in the corpus that contain the word.
- TF-IDF is based on the idea that words which are abundant across the corpus are not statistically useful for finding a pattern, so their weight is scaled down by the inverse document frequency (idf). Note that idf on its own gives very rare words a *higher* weight; words that are too rare to be useful have to be filtered separately, e.g. with `min_df`.
- A higher TF-IDF value signifies higher importance of the word in a document, while lower values represent lower importance.

In [49]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [50]:
# TF-IDF vectorizer with the same tokenisation as the pattern-restricted count
# vectorizer: English stop words removed, tokens of 4+ letters only.
tf_idf = TfidfVectorizer(
    stop_words="english",
    token_pattern=r"\b[a-zA-Z]{4,}\b",
)


In [51]:
# Fit the TF-IDF vectorizer on the messages and encode them as TF-IDF weights.
tf_idf_transform = tf_idf.fit_transform(df["v2"])

In [52]:
# Densify the sparse TF-IDF matrix. The guard makes the cell idempotent: once
# the name holds a dense array (or later a DataFrame) there is no .toarray() and
# an unguarded re-run would raise AttributeError.
if hasattr(tf_idf_transform, "toarray"):
    tf_idf_transform = tf_idf_transform.toarray()

In [53]:
# (rows, columns): one row per message, one column per vocabulary term.
tf_idf_transform.shape

(5572, 6448)

In [54]:
# Wrap the dense TF-IDF weights in a DataFrame with one column per term.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
tf_idf_transform = pd.DataFrame(
    tf_idf_transform,
    columns=tf_idf.get_feature_names_out(),
)

In [55]:
# Rank terms by their TF-IDF weight summed over all messages; show the top 20.
# After reset_index() the summed weights live in the column labelled 0.
tfidf_totals = tf_idf_transform.sum().reset_index()
tfidf_totals.sort_values(0, ascending=False).head(20)

Unnamed: 0,index,0
2901,just,84.185419
1077,come,79.292683
2980,know,67.897897
2288,good,67.392938
5732,time,66.62338
2554,home,66.128679
3111,like,65.380595
3037,later,65.344062
5174,sorry,65.033585
2115,free,62.666897
