In [1]:
# Computational Linguistics example with Machine Learning

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('/home/mitu/SMSSpamCollection', 
                 sep ='\t', names=['label','text'])

In [4]:
df.shape

(5572, 2)

In [5]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
df.columns

Index(['label', 'text'], dtype='object')

In [7]:
# input data
x = df['text']

# output data
y = df['label']

In [8]:
# Total messages in each category
y.value_counts()

ham     4825
spam     747
Name: label, dtype: int64

In [9]:
sent = 'Hello friends! Do you like learning Python Programming?'

In [10]:
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string

swords = stopwords.words('english')
ps = PorterStemmer()

In [11]:
def clean_text(sent):
    text = [word for word in word_tokenize(sent) 
            if word not in string.punctuation]
    text = [ps.stem(word.lower()) for word in text 
            if word.lower() not in swords]
    return text

In [12]:
z = 'It is raining here. Do not go out.'
clean_text(z)

['rain', 'go']

In [13]:
x

0       Go until jurong point, crazy.. Available only ...
1                           Ok lar... Joking wif u oni...
2       Free entry in 2 a wkly comp to win FA Cup fina...
3       U dun say so early hor... U c already then say...
4       Nah I don't think he goes to usf, he lives aro...
                              ...                        
5567    This is the 2nd time we have tried 2 contact u...
5568                 Will ü b going to esplanade fr home?
5569    Pity, * was in mood for that. So...any other s...
5570    The guy did some bitching but I acted like i'd...
5571                           Rofl. Its true to its name
Name: text, Length: 5572, dtype: object

In [14]:
x.apply(lambda data: clean_text(data))

0       [go, jurong, point, crazi, .., avail, bugi, n,...
1                  [ok, lar, ..., joke, wif, u, oni, ...]
2       [free, entri, 2, wkli, comp, win, fa, cup, fin...
3       [u, dun, say, earli, hor, ..., u, c, alreadi, ...
4       [nah, n't, think, goe, usf, live, around, though]
                              ...                        
5567    [2nd, time, tri, 2, contact, u., u, £750, poun...
5568                       [ü, b, go, esplanad, fr, home]
5569                           [piti, mood, ..., suggest]
5570    [guy, bitch, act, like, 'd, interest, buy, som...
5571                                   [rofl, true, name]
Name: text, Length: 5572, dtype: object

In [15]:
# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# create the object
tfidf = TfidfVectorizer(analyzer=clean_text)

In [17]:
# apply the vectorization
x_new = tfidf.fit_transform(x)

In [18]:
x_new.shape

(5572, 7953)

In [19]:
y.shape

(5572,)

In [20]:
# Cross-validation
from sklearn.model_selection import train_test_split

In [21]:
x_train, x_test, y_train, y_test = train_test_split(
    x_new, y, test_size=0.25, random_state=0)

In [22]:
x_train.shape

(4179, 7953)

In [23]:
x_test.shape

(1393, 7953)

In [24]:
# Import the Machine Learning Algorithm
from sklearn.ensemble import RandomForestClassifier

In [25]:
# Create the object
model = RandomForestClassifier(random_state=0)

In [26]:
# Train the algorithm
model.fit(x_train, y_train)

RandomForestClassifier(random_state=0)

In [27]:
# Perform the predictions on unseen data
y_pred = model.predict(x_test)

In [28]:
y_pred

array(['ham', 'spam', 'ham', ..., 'spam', 'ham', 'ham'], dtype=object)

In [29]:
len(y_pred)

1393

In [30]:
y_test.value_counts()

ham     1208
spam     185
Name: label, dtype: int64

In [31]:
result = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred
})

In [32]:
result

Unnamed: 0,Actual,Predicted
4456,ham,ham
690,spam,spam
944,ham,ham
3768,ham,ham
1189,ham,ham
...,...,...
1889,ham,ham
2250,spam,spam
2915,spam,spam
1282,ham,ham


In [33]:
from sklearn.metrics import accuracy_score, classification_report

In [34]:
accuracy_score(y_test, y_pred) * 100

97.70279971284997

In [35]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ham       0.97      1.00      0.99      1208
        spam       1.00      0.83      0.91       185

    accuracy                           0.98      1393
   macro avg       0.99      0.91      0.95      1393
weighted avg       0.98      0.98      0.98      1393



In [36]:
# Prediction on new messages
# Download the file:   sample.csv    from  https://mitu.co.in/dataset

In [37]:
newdata = pd.read_csv('sample.csv', sep='\t', names=['text'])

In [38]:
newdata.columns

Index(['text'], dtype='object')

In [39]:
# Transform the new data into tfidf vectorization format
# Exceute only once
newval = tfidf.transform(newdata['text'])

In [40]:
model.predict(newval)

array(['ham', 'spam', 'ham'], dtype=object)

In [41]:
newdata

Unnamed: 0,text
0,Ok lar i double check wif da hair dresser alre...
1,"As a valued customer, I am pleased to advise y..."
2,"Today is ""song dedicated day.."" Which song wil..."


In [42]:
# Save the object of classifier and vectorizer
import joblib

In [43]:
joblib.dump(model,'classifier.model')
joblib.dump(tfidf,'tfidf.model')

['tfidf.model']