In [1]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [42]:
!pip install nltk

import sys 
# Imported to handle natural language processing
import nltk
# Imported for machine learning classifiers
import sklearn
# Imported to store the data
import pandas
import numpy

print("To check the version:")
print("Python:",(sys.version))
print("NLTK:", (nltk.__version__))
print("Scikit-learn:",(sklearn.__version__))
print("Pandas:", (pandas.__version__))
print("Numpy:", (numpy.__version__))

To check the version:
Python: 3.10.4 (tags/v3.10.4:9d38120, Mar 23 2022, 23:13:41) [MSC v.1929 64 bit (AMD64)]
NLTK: 3.7
Scikit-learn: 1.1.0
Pandas: 1.4.2
Numpy: 1.22.3


You should consider upgrading via the 'C:\Users\Niranjana S.Avilery\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


## 1. Load the Dataset

In [43]:
import pandas as pd
import numpy as np

# loading the dataset into the dataframe
dataFrame = pd.read_table('https://raw.githubusercontent.com/niranjana1997/NLP_Text-Classification-with-NLTK-and-Scikit-learn/main/dataset/SMSSpamCollection', header=None, encoding='utf-8')

In [44]:
# printing information about the dataset
print(dataFrame.info())
print(dataFrame.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       5572 non-null   object
 1   1       5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None
      0                                                  1
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [45]:
# check class distribution
print(dataFrame[0].value_counts())

# Conclusion: There's a class imbalance

ham     4825
spam     747
Name: 0, dtype: int64


## 2. Data Preprocessing

In [46]:
# Converting class labels to numerical values - (0:ham, 1:spam)

# importing sklearn's LabelEncoding to convert labels
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
numerical_values = encoder.fit_transform(df[0])
dataFrame[0] = numerical_values
print(dataFrame)

      0                                                  1
0     0  Go until jurong point, crazy.. Available only ...
1     0                      Ok lar... Joking wif u oni...
2     1  Free entry in 2 a wkly comp to win FA Cup fina...
3     0  U dun say so early hor... U c already then say...
4     0  Nah I don't think he goes to usf, he lives aro...
...  ..                                                ...
5567  1  This is the 2nd time we have tried 2 contact u...
5568  0               Will ü b going to esplanade fr home?
5569  0  Pity, * was in mood for that. So...any other s...
5570  0  The guy did some bitching but I acted like i'd...
5571  0                         Rofl. Its true to its name

[5572 rows x 2 columns]


In [47]:
# store the Text message data in a variable
text_message = dataFrame[1]
print(text_message)

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: 1, Length: 5572, dtype: object


2.1 Regular Expressions

Some common regular expression metacharacters - copied from wikipedia

^ Matches the starting position within the string. In line-based tools, it matches the starting position of any line.

. Matches any single character (many applications exclude newlines, and exactly which characters are considered newlines is flavor-, character-encoding-, and platform-specific, but it is safe to assume that the line feed character is included). Within POSIX bracket expressions, the dot character matches a literal dot. For example, a.c matches "abc", etc., but [a.c] matches only "a", ".", or "c".

[ ] A bracket expression. Matches a single character that is contained within the brackets. For example, [abc] matches "a", "b", or "c". [a-z] specifies a range which matches any lowercase letter from "a" to "z". These forms can be mixed: [abcx-z] matches "a", "b", "c", "x", "y", or "z", as does [a-cx-z]. The - character is treated as a literal character if it is the last or the first (after the ^, if present) character within the brackets: [abc-], [-abc]. Note that backslash escapes are not allowed. The ] character can be included in a bracket expression if it is the first (after the ^) character: []abc].

[^ ] Matches a single character that is not contained within the brackets. For example, [^abc] matches any character other than "a", "b", or "c". [^a-z] matches any single character that is not a lowercase letter from "a" to "z". Likewise, literal characters and ranges can be mixed.

$ Matches the ending position of the string or the position just before a string-ending newline. In line-based tools, it matches the ending position of any line.

( ) Defines a marked subexpression. The string matched within the parentheses can be recalled later (see the next entry, \n). A marked subexpression is also called a block or capturing group. BRE mode requires ( ).

\n Matches what the nth marked subexpression matched, where n is a digit from 1 to 9. This construct is vaguely defined in the POSIX.2 standard. Some tools allow referencing more than nine capturing groups.

* Matches the preceding element zero or more times. For example, abc matches "ac", "abc", "abbbc", etc. [xyz] matches "", "x", "y", "z", "zx", "zyx", "xyzzy", and so on. (ab)* matches "", "ab", "abab", "ababab", and so on.

{m,n} Matches the preceding element at least m and not more than n times. For example, a{3,5} matches only "aaa", "aaaa", and "aaaaa". This is not found in a few older instances of regexes. BRE mode requires {m,n}.

In [48]:
# Regular Expressions

# For eg. replace Email Address with 'email', URLs with 'webaddress',
# money symbols, 10 digit numbers are replaced with a generic placeholder

# replacing email addresses with 'email' with the help of regexlib.com
processed = text_message.str.replace(r'^.+@[^\.].*\.[a-z]{2,}$','email')

# replacing URLs with 'webaddress' with the help of regexlib.com
processed = processed.str.replace(r'^http\://[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,3}(/\S*)?$', 'webaddress')

# Replace money symbols with 'moneysymbol'
processed = processed.str.replace(r'£|\$','moneysymbol')
    
# Replace 10 digit phone numbers with 'phone'
processed = processed.str.replace(r'^\(?[\d]{3}\)?[\s-]?[\d]{3}[\s-]?[\d]{4}$','phone')
    
# Replace numbers with 'number'
processed = processed.str.replace(r'\d+(\.\d+)?','number')

In [49]:
# removing punctuations
processed = processed.str.replace(r'[^\w\d\s]',' ')

In [50]:
# replacing double space between texts with single space
processed = processed.str.replace(r'\s+',' ')

# remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$','')

In [51]:
# change words to lowercase
processed = processed.str.lower()

In [52]:
# printing the dataframe after text pre-processing
print(processed)

0       go until jurong point crazy available only in ...
1                                 ok lar joking wif u oni
2       free entry in number a wkly comp to win fa cup...
3             u dun say so early hor u c already then say
4       nah i don t think he goes to usf he lives arou...
                              ...                        
5567    this is the numbernd time we have tried number...
5568                  will ü b going to esplanade fr home
5569    pity was in mood for that so any other suggest...
5570    the guy did some bitching but i acted like i d...
5571                            rofl its true to its name
Name: 1, Length: 5572, dtype: object


In [53]:
# removing stop words from the dataset

from nltk.corpus import stopwords
nltk.download('stopwords')
stops = set(stopwords.words('english'))

processed = processed.apply(lambda x: ' '.join(
    term for term in x.split() if term not in stops))

[nltk_data] Downloading package stopwords to C:\Users\Niranjana
[nltk_data]     S.Avilery\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [39]:
# remove word stems using a Porter Stemmer

porterStemmer = nltk.PorterStemmer()
processed = processed.apply(lambda x: ' '.join(
    porterStemmer.stem(term) for term in x.split()))