In [1]:
# pip install neattext

### Importing Packages 

In [2]:
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import neattext as nt


#### <a href="https://www.kaggle.com/team-ai/spam-text-message-classification?select=SPAM+text+message+20170820+-+Data.csv"> Kaggle - SMS Text Message Classification</a>

In [3]:
df = pd.read_csv("SPAM text message 20170820 - Data.csv")
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [4]:
df[df.Category == "spam"]

Unnamed: 0,Category,Message
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
5,spam,FreeMsg Hey there darling it's been 3 week's n...
8,spam,WINNER!! As a valued network customer you have...
9,spam,Had your mobile 11 months or more? U R entitle...
11,spam,"SIX chances to win CASH! From 100 to 20,000 po..."
...,...,...
5537,spam,Want explicit SEX in 30 secs? Ring 02073162414...
5540,spam,ASKED 3MOBILE IF 0870 CHATLINES INCLU IN FREE ...
5547,spam,Had your contract mobile 11 Mnths? Latest Moto...
5566,spam,REMINDER FROM O2: To get 2.50 pounds free call...


In [5]:
# dir(nt)

In [6]:
sms_message = df.iloc[5537].Message
sms_message

'Want explicit SEX in 30 secs? Ring 02073162414 now! Costs 20p/min Gsex POBOX 2667 WC1N 3XX'

In [7]:
textFrame = nt.TextFrame(sms_message)
textFrame

TextFrame(text="Want explicit SEX in 30 secs? Ring 02073162414 now! Costs 20p/min Gsex POBOX 2667 WC1N 3XX")

In [8]:
textFrame.describe()

Key      Value          
Length  : 90             
vowels  : 14             
consonants: 37             
stopwords: 1              
punctuations: 3              
special_char: 3              
tokens(whitespace): 16             
tokens(words): 17             


In [9]:
textFrame = textFrame.remove_stopwords()
textFrame.text

'Want explicit SEX 30 secs? Ring 02073162414 now! Costs 20p/min Gsex POBOX 2667 WC1N 3XX'

In [10]:
textFrame = textFrame.remove_puncts(most_common=False)
textFrame.text

'Want explicit SEX 30 secs Ring 02073162414 now Costs 20pmin Gsex POBOX 2667 WC1N 3XX'

In [11]:
textFrame.remove_puncts(most_common=False).describe()

Key      Value          
Length  : 84             
vowels  : 13             
consonants: 36             
stopwords: 1              
punctuations: 0              
special_char: 0              
tokens(whitespace): 15             
tokens(words): 15             


In [12]:
tc = nt.TextCleaner(sms_message)
tc

TextCleaner(text="Want explicit SEX in 30 secs? Ring 02073162414 now! Costs 20p/min Gsex POBOX 2667 WC1N 3XX")

In [13]:
textFrame.remove_multiple_spaces()

TextFrame(text="Want explicit SEX 30 secs Ring 02073162414 now Costs 20pmin Gsex POBOX 2667 WC1N 3XX")

In [14]:
def train(sparse):
    ham_train, ham_test, spam_train, spam_test = train_test_split(sparse, df.Category, test_size=0.2)    
    clf = LogisticRegression()
    clf.fit(ham_train, spam_train)
    return clf.score(ham_test, spam_test)

In [15]:
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [16]:
count_vectorizer = CountVectorizer()
message_spare_matrix = count_vectorizer.fit_transform(df.Message)
message_spare_matrix

<5572x8709 sparse matrix of type '<class 'numpy.int64'>'
	with 74098 stored elements in Compressed Sparse Row format>

In [17]:
count_vectorizer.vocabulary_

{'go': 3567,
 'until': 8080,
 'jurong': 4370,
 'point': 5954,
 'crazy': 2334,
 'available': 1313,
 'only': 5567,
 'in': 4110,
 'bugis': 1763,
 'great': 3651,
 'world': 8544,
 'la': 4497,
 'buffet': 1761,
 'cine': 2057,
 'there': 7690,
 'got': 3611,
 'amore': 1079,
 'wat': 8320,
 'ok': 5534,
 'lar': 4533,
 'joking': 4338,
 'wif': 8446,
 'oni': 5563,
 'free': 3369,
 'entry': 2959,
 'wkly': 8502,
 'comp': 2174,
 'to': 7802,
 'win': 8459,
 'fa': 3096,
 'cup': 2394,
 'final': 3217,
 'tkts': 7789,
 '21st': 410,
 'may': 4955,
 '2005': 401,
 'text': 7640,
 '87121': 791,
 'receive': 6336,
 'question': 6228,
 'std': 7275,
 'txt': 7982,
 'rate': 6280,
 'apply': 1166,
 '08452810075over18': 77,
 'dun': 2811,
 'say': 6675,
 'so': 7070,
 'early': 2832,
 'hor': 3948,
 'already': 1051,
 'then': 7684,
 'nah': 5266,
 'don': 2720,
 'think': 7705,
 'he': 3801,
 'goes': 3575,
 'usf': 8126,
 'lives': 4688,
 'around': 1217,
 'here': 3852,
 'though': 7725,
 'freemsg': 3376,
 'hey': 3862,
 'darling': 2451,
 'it

In [18]:
train(message_spare_matrix)

0.9829596412556054

In [19]:
# stemmer = SnowballStemmer("english")

## Cleaning  SMS Message Content by:
- Removing stop words
- Removing punctutations and special characters.
- Removing multiple spaces
- Fixing some contractions such as (id -> i would and so on)
- Return the message to the original words "stemming".

In [20]:
def clean_sms_message(message):
    textCleaner = nt.TextCleaner(message)
    textCleaner = textCleaner.remove_puncts(most_common=False)
    textCleaner = textCleaner.remove_stopwords()
    textCleaner = textCleaner.remove_special_characters()
    textCleaner = textCleaner.remove_multiple_spaces()
    textCleaner = textCleaner.fix_contractions()
    return textCleaner

In [26]:
stemmer = SnowballStemmer("english")

In [21]:
def get_stemmed_message(message):
    tokens = message.split()
    filtered_words = []
    for token in tokens:
        filtered_words.append(stemmer.stem(token))
        
    return ' '.join(filtered_words)

In [22]:
df["cleaned_message"] = df.Message.apply(clean_sms_message)
df["cleaned_message"]

0       jurong point crazy available bugis n great wor...
1                                 ok lar joking wif u oni
2       free entry 2 wkly comp win fa cup final tkts 2...
3                                     u dun early hor u c
4                           nah dont think goes usf lives
                              ...                        
5567    2nd time tried 2 contact u u 750 pound prize 2...
5568                            b going esplanade fr home
5569                          pity mood soany suggestions
5570    guy bitching acted like id interested buying w...
5571                                            rofl true
Name: cleaned_message, Length: 5572, dtype: object

In [23]:
df["cleaned_message"] = df.cleaned_message.apply(get_stemmed_message)
df["cleaned_message"]

0       jurong point crazi avail bugi n great world la...
1                                   ok lar joke wif u oni
2       free entri 2 wkli comp win fa cup final tkts 2...
3                                     u dun earli hor u c
4                             nah dont think goe usf live
                              ...                        
5567    2nd time tri 2 contact u u 750 pound prize 2 c...
5568                                b go esplanad fr home
5569                              piti mood soani suggest
5570    guy bitch act like id interest buy week gave free
5571                                            rofl true
Name: cleaned_message, Length: 5572, dtype: object

In [24]:
cleaned_message_spare_matrix = count_vectorizer.fit_transform(df.cleaned_message)
cleaned_message_spare_matrix

<5572x7975 sparse matrix of type '<class 'numpy.int64'>'
	with 42080 stored elements in Compressed Sparse Row format>

In [25]:
train(cleaned_message_spare_matrix)

0.9838565022421525