In [81]:
import pandas as pd
import re
import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from collections import Counter

## USING NLTK

### load dataset

In [82]:
data = pd.read_csv('SMS_DATA.csv', encoding='Windows-1252', index_col='S. No.')

In [83]:
data.head()

Unnamed: 0_level_0,Message_body,Label
S. No.,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Rofl. Its true to its name,Non-Spam
2,The guy did some bitching but I acted like i'd...,Non-Spam
3,"Pity, * was in mood for that. So...any other s...",Non-Spam
4,Will ü b going to esplanade fr home?,Non-Spam
5,This is the 2nd time we have tried 2 contact u...,Spam


### check null values

In [84]:
data.isnull().sum()

Message_body    0
Label           0
dtype: int64

### Convert to lowercase

In [85]:
def to_lower(input):
    return input.lower()

data['Message_body']= data['Message_body'].apply(to_lower)

as nltk stopwords are in lowercae, i convert my data into lowercase as well 

In [86]:
data.head()

Unnamed: 0_level_0,Message_body,Label
S. No.,Unnamed: 1_level_1,Unnamed: 2_level_1
1,rofl. its true to its name,Non-Spam
2,the guy did some bitching but i acted like i'd...,Non-Spam
3,"pity, * was in mood for that. so...any other s...",Non-Spam
4,will ü b going to esplanade fr home?,Non-Spam
5,this is the 2nd time we have tried 2 contact u...,Spam


### remove Punctuation

In [87]:
def remove_punctuation(input):
    return ''.join(word for word in input if word not in string.punctuation) #if it is not a puntuation mark, include in string

data['Message_body']= data['Message_body'].apply(remove_punctuation)

In [64]:
data.head()

Unnamed: 0_level_0,Message_body,Label
S. No.,Unnamed: 1_level_1,Unnamed: 2_level_1
1,rofl its true to its name,Non-Spam
2,the guy did some bitching but i acted like id ...,Non-Spam
3,pity was in mood for that soany other suggest...,Non-Spam
4,will ü b going to esplanade fr home,Non-Spam
5,this is the 2nd time we have tried 2 contact u...,Spam


### Tokenization

In [88]:
def tokenize(input):
    return word_tokenize(input) #using nltk builtin word tokenizer 

data['tokens']= data['Message_body'].apply(tokenize)

In [66]:
data.head()

Unnamed: 0_level_0,Message_body,Label,tokens
S. No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,rofl its true to its name,Non-Spam,"[rofl, its, true, to, its, name]"
2,the guy did some bitching but i acted like id ...,Non-Spam,"[the, guy, did, some, bitching, but, i, acted,..."
3,pity was in mood for that soany other suggest...,Non-Spam,"[pity, was, in, mood, for, that, soany, other,..."
4,will ü b going to esplanade fr home,Non-Spam,"[will, ü, b, going, to, esplanade, fr, home]"
5,this is the 2nd time we have tried 2 contact u...,Spam,"[this, is, the, 2nd, time, we, have, tried, 2,..."


### Stop word removal

In [89]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nooru\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [91]:
stop_word_eng = set(stopwords.words("english"))

def remove_stopword(tokens):
    filtered = [] 
    for word in tokens:
        if word not in stop_word_eng:
            filtered.append(word) #eppend to list if not part of stop word listc 
    return filtered

data['clean_tokens'] = data['tokens'].apply(remove_stopword)
data.head()

Unnamed: 0_level_0,Message_body,Label,tokens,clean_tokens
S. No.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,rofl its true to its name,Non-Spam,"[rofl, its, true, to, its, name]","[rofl, true, name]"
2,the guy did some bitching but i acted like id ...,Non-Spam,"[the, guy, did, some, bitching, but, i, acted,...","[guy, bitching, acted, like, id, interested, b..."
3,pity was in mood for that soany other suggest...,Non-Spam,"[pity, was, in, mood, for, that, soany, other,...","[pity, mood, soany, suggestions]"
4,will ü b going to esplanade fr home,Non-Spam,"[will, ü, b, going, to, esplanade, fr, home]","[ü, b, going, esplanade, fr, home]"
5,this is the 2nd time we have tried 2 contact u...,Spam,"[this, is, the, 2nd, time, we, have, tried, 2,...","[2nd, time, tried, 2, contact, u, u, £750, pou..."


In [92]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nooru\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Stemming and Lemmatization

##### stemming

- convert a word to its base form
- not grammar aware (may reduce to a word that might be wrong/ doesnot exist)

##### Lemmatization

- reduces word to its base form
- considers context and grammar
- returns valid words 

In [93]:
stem = PorterStemmer()
wnl = WordNetLemmatizer()

data['stemmed'] = data['clean_tokens'].apply(lambda tokens: [stem.stem(w) for w in tokens])

data['lemmatized'] = data['clean_tokens'].apply(lambda tokens: [wnl.lemmatize(w) for w in tokens])


In [71]:
data['stemmed']

S. No.
1                                     [rofl, true, name]
2      [guy, bitch, act, like, id, interest, buy, som...
3                           [piti, mood, soani, suggest]
4                         [ü, b, go, esplanad, fr, home]
5      [2nd, time, tri, 2, contact, u, u, £750, pound...
                             ...                        
953    [how, favourit, person, today, r, u, workin, h...
954                                   [much, got, clean]
955             [sorri, da, gone, mad, mani, pend, work]
956                               [wat, time, ü, finish]
957                                         [glad, talk]
Name: stemmed, Length: 957, dtype: object

In [94]:
data['lemmatized']

S. No.
1                                     [rofl, true, name]
2      [guy, bitching, acted, like, id, interested, b...
3                        [pity, mood, soany, suggestion]
4                     [ü, b, going, esplanade, fr, home]
5      [2nd, time, tried, 2, contact, u, u, £750, pou...
                             ...                        
953    [hows, favourite, person, today, r, u, workin,...
954                                [much, got, cleaning]
955          [sorry, da, gone, mad, many, pending, work]
956                               [wat, time, ü, finish]
957                                      [glad, talking]
Name: lemmatized, Length: 957, dtype: object

### Word frequency distribution

In [95]:

all_words = [word for tokens in data['lemmatized'] for word in tokens]

word_freq = Counter(all_words)

print(word_freq.most_common(10))


[('u', 210), ('call', 115), ('im', 95), ('2', 83), ('get', 69), ('ur', 66), ('4', 61), ('go', 56), ('free', 47), ('time', 42)]


In [96]:

all_words = [word for tokens in data['clean_tokens'] for word in tokens]

word_freq = Counter(all_words)
print(word_freq.most_common(10))


[('u', 193), ('call', 111), ('im', 95), ('2', 83), ('get', 67), ('ur', 66), ('4', 61), ('go', 54), ('free', 47), ('ok', 41)]


In [97]:
word_freq

Counter({'u': 193,
         'call': 111,
         'im': 95,
         '2': 83,
         'get': 67,
         'ur': 66,
         '4': 61,
         'go': 54,
         'free': 47,
         'ok': 41,
         'dont': 39,
         'ltgt': 39,
         'like': 38,
         'got': 38,
         'know': 38,
         'time': 37,
         'want': 35,
         'day': 35,
         'see': 34,
         'good': 34,
         'oh': 34,
         'come': 32,
         'stop': 32,
         'later': 31,
         'ü': 30,
         'home': 30,
         'one': 30,
         'ill': 29,
         'back': 29,
         'tell': 29,
         'mobile': 29,
         'need': 28,
         'send': 28,
         'going': 27,
         'pls': 27,
         'r': 26,
         'week': 25,
         'lor': 25,
         'love': 25,
         'give': 24,
         'txt': 24,
         'text': 23,
         'well': 23,
         'still': 23,
         'phone': 23,
         'today': 23,
         'claim': 22,
         'great': 22,
         'hey':

## USING SPACY

In [76]:
import spacy

### load data

In [77]:
nlp = spacy.load("en_core_web_sm")

spacy.load, Loads a Pretrained Language Model which is a small English model provided by spaCy.

It includes:
- Tokenizer = splits a big sentence into smaller pieces
- Part-of-speech tagger = identifies noun, adjective, verb etc
- Lemmatizer = reduces words to base form
- Named Entity Recognizer = identifies real life nouns
- Dependency Parser = identifies word relations and parts of sentences

The nlp object becomes your NLP processing pipeline.When you pass text to it it tokenizes, applies POS tagging, lemmatizaton and NER to text


In [108]:
df = pd.read_csv('SMS_DATA.csv', encoding='Windows-1252', index_col='S. No.')

### process text using spacy

In [110]:
def process_text(text):
    text_clean = text.lower().translate(str.maketrans('', '', string.punctuation)) #conevrt to lowercase and remove Punctuations
    processed = nlp(text_clean)
    lemmas = [token.lemma_ for token in processed if not token.is_stop and not token.is_punct and not token.is_space]
    return lemmas

df['lemmas'] = df['Message_body'].apply(process_text)


In [111]:
df['lemmas']

S. No.
1                                           [rofl, true]
2      [guy, bitching, act, like, d, interested, buy,...
3                        [pity, mood, soany, suggestion]
4                        [ü, b, go, esplanade, fr, home]
5      [2nd, time, try, 2, contact, u, u, win, £, 750...
                             ...                        
953    [s, favourite, person, today, r, u, workin, ha...
954                                         [get, clean]
955                    [sorry, da, go, mad, pende, work]
956                               [wat, time, ü, finish]
957                                         [glad, talk]
Name: lemmas, Length: 957, dtype: object

### Word distribution

In [112]:
all_lemmas

['rofl',
 'true',
 'guy',
 'bitching',
 'act',
 'like',
 'd',
 'interested',
 'buy',
 'week',
 'give',
 'free',
 'pity',
 'mood',
 'soany',
 'suggestion',
 'ü',
 'b',
 'go',
 'esplanade',
 'fr',
 'home',
 '2nd',
 'time',
 'try',
 '2',
 'contact',
 'u',
 'u',
 'win',
 '£',
 '750',
 'pound',
 'prize',
 '2',
 'claim',
 'easy',
 '087187272008',
 'now1',
 '10p',
 'minute',
 'btnationalrate',
 'reminder',
 'o2',
 '250',
 'pound',
 'free',
 'credit',
 'detail',
 'great',
 'offer',
 'pls',
 'reply',
 '2',
 'text',
 'valid',
 'house',
 'postcode',
 'huh',
 'y',
 'lei',
 'not',
 'wait',
 'til',
 'wednesday',
 'ard',
 '6',
 'like',
 'dat',
 'lor',
 'ok',
 'lor',
 'sony',
 'ericsson',
 'salesman',
 'ask',
 'shuhui',
 'gd',
 '2',
 'use',
 'consider',
 'dump',
 'heap',
 'mom',
 'decide',
 'come',
 'lowes',
 'bore',
 'lor',
 'juz',
 'lor',
 'not',
 'ltgt',
 'hour',
 'imma',
 'flip',
 'shit',
 'sorry',
 'ill',
 'later',
 'mean',
 'calculation',
 'ltgt',
 'unit',
 'ltgt',
 'school',
 'expensive',
 'sta

In [113]:
all_lemmas = [lemma for lemmas_list in df['lemmas'] for lemma in lemmas_list]
freq_dist = Counter(all_lemmas)

print("Top 10 frequent words:")
for word, freq in freq_dist.most_common(10):
    print(f"{word}: {freq}")

Top 10 frequent words:
u: 193
not: 115
m: 101
2: 83
ur: 66
s: 64
4: 61
£: 54
come: 49
free: 47


In [103]:
import urduhack



TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.19.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


ModuleNotFoundError: No module named 'keras.src.engine'

In [102]:
import sys
!{sys.executable} -m pip install urduhack


Collecting urduhack
  Using cached urduhack-1.1.1-py3-none-any.whl.metadata (7.2 kB)
Collecting tf2crf (from urduhack)
  Using cached tf2crf-0.1.33-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting tensorflow-datasets~=3.1 (from urduhack)
  Using cached tensorflow_datasets-3.2.1-py3-none-any.whl.metadata (4.8 kB)
Collecting Click~=7.1 (from urduhack)
  Using cached click-7.1.2-py2.py3-none-any.whl.metadata (2.9 kB)
Collecting absl-py (from tensorflow-datasets~=3.1->urduhack)
  Using cached absl_py-2.3.0-py3-none-any.whl.metadata (2.4 kB)
Collecting attrs>=18.1.0 (from tensorflow-datasets~=3.1->urduhack)
  Using cached attrs-25.3.0-py3-none-any.whl.metadata (10 kB)
Collecting dill (from tensorflow-datasets~=3.1->urduhack)
  Using cached dill-0.4.0-py3-none-any.whl.metadata (10 kB)
Collecting future (from tensorflow-datasets~=3.1->urduhack)
  Using cached future-1.0.0-py3-none-any.whl.metadata (4.0 kB)
Collecting promise (from tensorflow-datasets~=3.1->urduhack)
  Using cached promise-2.

  DEPRECATION: Building 'promise' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'promise'. Discussion can be found at https://github.com/pypa/pip/issues/6334
  You can safely remove it manually.
  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
typer 0.16.0 requires click>=8.0.0, but you have click 7.1.2 which is incompatible.
