<p style="color:#153462; 
          font-weight: bold; 
          font-size: 30px; 
          font-family: Gill Sans, sans-serif; 
          text-align: center;">
          N-Grams</p>

<p style="text-align: justify; text-justify: inter-word;">
   <font size=3>
       Creates a document-term matrix in which counts still occupy the cells, but instead of each column representing a single term, the columns represent every sequence of n adjacent words found in your text. <br>
       <b>Example:</b> NLP is an interesting topic
       <table>
           <tr>
               <th>n</th>
               <th>Name</th>
               <th>Tokens</th>
           </tr>
           <tr>
               <td>2</td>
               <td>Bigram</td>
               <td>["NLP is", "is an", "an interesting", "interesting topic"]</td>
           </tr>
           <tr>
               <td>3</td>
               <td>Trigram</td>
               <td>["NLP is an", "is an interesting", "an interesting topic"]</td>
           </tr>
           <tr>
               <td>4</td>
               <td>Four-gram</td>
               <td>["NLP is an interesting", "is an interesting topic"]</td>
           </tr>
       </table>
       
   </font>
</p>


### Importing Packages

In [1]:
import pandas as pd
import string
import nltk
import re
from sklearn.feature_extraction.text import CountVectorizer

### Loading Data

In [8]:
# Porter stemmer shared by the cleaning step further down.
ps = nltk.PorterStemmer()

# Point NLTK at the locally downloaded corpora before touching nltk.corpus.
nltk.data.path.append(r"D:\Artificial_Intelligence\nat_lang_proc\nltk_data")
# The stopwords corpus file id is lowercase "english"; the capitalised
# spelling only resolves on case-insensitive filesystems.
stopwords = nltk.corpus.stopwords.words("english")

# The file is tab-separated and has no header row, so the two column names
# are assigned explicitly after loading.
data_df = pd.read_csv(r"D:/Artificial_Intelligence/nat_lang_proc/data/SMSSpamCollection.tsv", delimiter="\t", header=None)
data_df.columns = ["labels", "body_text"]
data_df.head()

Unnamed: 0,labels,body_text
0,ham,I've been searching for the right words to tha...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...
2,ham,"Nah I don't think he goes to usf, he lives aro..."
3,ham,Even my brother is not like to speak with me. ...
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!


### Data Cleaning

In [9]:
def clean_data(text):
    """Turn one raw message into a single string of stemmed tokens.

    Processing steps, in order:
      1. drop every character listed in ``string.punctuation``;
      2. lowercase the remaining characters;
      3. split the result into runs of word characters;
      4. discard tokens that appear in the module-level ``stopwords``;
      5. stem each surviving token with the module-level ``ps``;
      6. join the stems with single spaces.

    Parameters
    ----------
    text : str
        The raw message text.

    Returns
    -------
    str
        Space-separated stemmed tokens (empty string if nothing survives).
        A string is returned rather than a token list because the
        vectorizer later in this notebook is fed whole strings.
    """
    without_punc = "".join(
        char.lower() for char in text if char not in string.punctuation
    )
    # Raw string: "\w" in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    tokenized_text = re.findall(r"\w+", without_punc)
    stemmed_tokens = (
        ps.stem(word) for word in tokenized_text if word not in stopwords
    )
    return " ".join(stemmed_tokens)

In [10]:
# Run every raw message through clean_data and store the result alongside it.
data_df["clean_text"] = [clean_data(message) for message in data_df["body_text"]]

In [11]:
# Preview the first five rows, now including the cleaned text.
data_df.head(n=5)

Unnamed: 0,labels,body_text,clean_text
0,ham,I've been searching for the right words to tha...,ive search right word thank breather promis wo...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
2,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though
3,ham,Even my brother is not like to speak with me. ...,even brother like speak treat like aid patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,date sunday


### N-gram Implementation

In [13]:
# ngram_range=(min_n, max_n) selects which n-gram lengths become columns.
# (2, 2) keeps bigrams only; (1, 3) would keep unigrams, bigrams and trigrams.
ngram_vect = CountVectorizer(
    ngram_range=(2, 2),
)
x_counts = ngram_vect.fit_transform(data_df["clean_text"])
# Matrix shape first, then the learned bigram vocabulary, one per line.
print(x_counts.shape, ngram_vect.get_feature_names_out(), sep="\n")

(5568, 31275)
['008704050406 sp' '0089mi last' '0121 2025050' ... 'üll submit'
 'üll take' '〨ud even']


#### N-gram on smaller data

In [15]:
# Work on the first 20 messages only so the resulting matrix is small
# enough to inspect by eye.
data_sample = data_df.head(20)
ngram_vect_sample = CountVectorizer(
    ngram_range=(2, 2),
)
x_counts_sample = ngram_vect_sample.fit_transform(data_sample["clean_text"])
# Matrix shape first, then the bigram vocabulary of the sample.
print(x_counts_sample.shape, ngram_vect_sample.get_feature_names_out(), sep="\n")

(20, 209)
['09061701461 claim' '100 20000' '100000 prize' '11 month' '12 hour'
 '150pday 6day' '16 tsandc' '20000 pound' '2005 text' '21st may'
 '4txtú120 poboxox36504w45wq' '6day 16' '81010 tc' '87077 eg'
 '87077 trywal' '87121 receiv' '87575 cost' '900 prize' 'aft finish'
 'aid patent' 'anymor tonight' 'appli 08452810075over18' 'appli repli'
 'ard smth' 'around though' 'bless time' 'breather promis' 'brother like'
 'call 09061701461' 'call mobil' 'caller press' 'callertun caller'
 'camera free' 'cash 100' 'chanc win' 'claim 81010' 'claim call'
 'claim code' 'click httpwap' 'click wap' 'co free' 'code kl341'
 'colour mobil' 'comp win' 'copi friend' 'cost 150pday' 'credit click'
 'cri enough' 'csh11 send' 'cup final' 'custom select' 'da stock'
 'date sunday' 'dont miss' 'dont think' 'dont want' 'eg england'
 'eh rememb' 'england 87077' 'england macedonia' 'enough today'
 'entitl updat' 'entri questionstd' 'entri wkli' 'even brother' 'fa 87121'
 'fa cup' 'feel way' 'final tkt' 'fine way

In [17]:
# Densify the sparse count matrix and label each column with its bigram,
# giving one row per sampled message.
df = pd.DataFrame(
    x_counts_sample.toarray(),
    columns=ngram_vect_sample.get_feature_names_out(),
)
df.head()

Unnamed: 0,09061701461 claim,100 20000,100000 prize,11 month,12 hour,150pday 6day,16 tsandc,20000 pound,2005 text,21st may,...,win fa,winner valu,wkli comp,wonder bless,wont take,word claim,word thank,wwwdbuknet lccltd,xxxmobilemovieclub use,ye naughti
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,1,0,1,0,0,0
1,0,0,0,0,0,0,0,0,1,1,...,1,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# Show the first five sampled messages for comparison with the count rows.
data_sample.head(5)

Unnamed: 0,labels,body_text,clean_text
0,ham,I've been searching for the right words to tha...,ive search right word thank breather promis wo...
1,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
2,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though
3,ham,Even my brother is not like to speak with me. ...,even brother like speak treat like aid patent
4,ham,I HAVE A DATE ON SUNDAY WITH WILL!!,date sunday
