In [1]:
# Text Mining algorithms on unstructured dataset

In [2]:
import pandas as pd

In [6]:
df = pd.read_csv('SMSSpamCollection', sep='\t',
                names = ['class','body_text'])

In [7]:
df

Unnamed: 0,class,body_text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [8]:
import string

In [9]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
# Function to count the punctuation symbols

def count_punct(text):
    count = sum([1 for x in text if x in string.punctuation])
    return(round(count/(len(text)-text.count(' '))*100,2))

In [11]:
s = 'Hello, friends! How are you? Welcome to Pune.!!!'

In [12]:
count_punct(s)

17.07

In [13]:
# Add feature of punctuation percentages
df['punct%'] = df['body_text'].apply(lambda x: count_punct(x))

In [14]:
df

Unnamed: 0,class,body_text,punct%
0,ham,"Go until jurong point, crazy.. Available only ...",9.78
1,ham,Ok lar... Joking wif u oni...,25.00
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,4.69
3,ham,U dun say so early hor... U c already then say...,15.38
4,ham,"Nah I don't think he goes to usf, he lives aro...",4.08
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,6.11
5568,ham,Will ü b going to esplanade fr home?,3.45
5569,ham,"Pity, * was in mood for that. So...any other s...",14.58
5570,ham,The guy did some bitching but I acted like i'd...,1.00


In [15]:
# Add the column body length to it
df['body_len'] = df['body_text'].apply(lambda x: len(x) - x.count(" "))

In [16]:
df

Unnamed: 0,class,body_text,punct%,body_len
0,ham,"Go until jurong point, crazy.. Available only ...",9.78,92
1,ham,Ok lar... Joking wif u oni...,25.00,24
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,4.69,128
3,ham,U dun say so early hor... U c already then say...,15.38,39
4,ham,"Nah I don't think he goes to usf, he lives aro...",4.08,49
...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,6.11,131
5568,ham,Will ü b going to esplanade fr home?,3.45,29
5569,ham,"Pity, * was in mood for that. So...any other s...",14.58,48
5570,ham,The guy did some bitching but I acted like i'd...,1.00,100


In [17]:
from nltk.corpus import stopwords

In [18]:
s_words = stopwords.words('english')

In [20]:
s_words;

In [21]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [22]:
# analyzer function
def clean_text(text):
    data = [x for x in text if x not in string.punctuation]
    data = "".join(data)
    data = [ps.stem(x) for x in data.split() if x not in s_words]
    return data

In [23]:
clean_text(s)

['hello', 'friend', 'how', 'welcom', 'pune']

In [24]:
# Seperate the input and output
X = df.drop('class', axis = 1)
y = df['class']

In [25]:
X

Unnamed: 0,body_text,punct%,body_len
0,"Go until jurong point, crazy.. Available only ...",9.78,92
1,Ok lar... Joking wif u oni...,25.00,24
2,Free entry in 2 a wkly comp to win FA Cup fina...,4.69,128
3,U dun say so early hor... U c already then say...,15.38,39
4,"Nah I don't think he goes to usf, he lives aro...",4.08,49
...,...,...,...
5567,This is the 2nd time we have tried 2 contact u...,6.11,131
5568,Will ü b going to esplanade fr home?,3.45,29
5569,"Pity, * was in mood for that. So...any other s...",14.58,48
5570,The guy did some bitching but I acted like i'd...,1.00,100


In [26]:
# Import tfidf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [27]:
tfidf = TfidfVectorizer(analyzer=clean_text)

In [28]:
X_trans = tfidf.fit_transform(X['body_text'])

In [29]:
X_trans.shape

(5572, 8277)

In [30]:
X_vect = pd.concat([X[['body_len', 'punct%']]
                          .reset_index(drop=True), 
                          pd.DataFrame(X_trans.toarray())], axis=1)

In [32]:
X_vect.shape

(5572, 8279)

In [33]:
y.value_counts()

ham     4825
spam     747
Name: class, dtype: int64

In [34]:
X_vect.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Columns: 8279 entries, body_len to 8276
dtypes: float64(8278), int64(1)
memory usage: 351.9 MB


In [35]:
# Cross validation
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X_vect, y, stratify=y, random_state=0) 

In [36]:
X_train.shape

(4179, 8279)

In [37]:
from sklearn.ensemble import RandomForestClassifier

In [38]:
clf = RandomForestClassifier(random_state=0)

In [39]:
clf.fit(X_train, y_train)

RandomForestClassifier(random_state=0)

In [40]:
y_pred = clf.predict(X_test)

In [41]:
from sklearn.metrics import accuracy_score, classification_report

In [42]:
accuracy_score(y_test, y_pred)

0.9662598707824839

In [43]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ham       0.96      1.00      0.98      1206
        spam       1.00      0.75      0.86       187

    accuracy                           0.97      1393
   macro avg       0.98      0.87      0.92      1393
weighted avg       0.97      0.97      0.96      1393



In [44]:
new = pd.read_csv("sample.csv", 
                   names=['body_text'], sep='\t')

In [45]:
new

Unnamed: 0,body_text
0,Ok lar i double check wif da hair dresser alre...
1,"As a valued customer, I am pleased to advise y..."
2,"Today is ""song dedicated day.."" Which song wil..."


In [46]:
new['body_len'] = new['body_text'].apply(lambda x: len(x) - x.count(" "))
new['punct%'] = new['body_text'].apply(lambda x: count_punct(x))

In [47]:
new_vect = tfidf.transform(new['body_text'])

In [48]:
sample_vect = new
sample_vect = pd.concat([new[['body_len', 'punct%']].reset_index(drop=True), 
           pd.DataFrame(new_vect.toarray())], axis=1)

In [49]:
sample_vect.shape

(3, 8279)

In [50]:
sample_vect

Unnamed: 0,body_len,punct%,0,1,2,3,4,5,6,7,...,8267,8268,8269,8270,8271,8272,8273,8274,8275,8276
0,89,4.49,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,125,2.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,102,9.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
clf.predict(sample_vect)

array(['ham', 'spam', 'ham'], dtype=object)