In [24]:
# scikit-learn: imports used throughout this notebook

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

In [14]:
# The SMS spam collection is tab-separated, so the delimiter is passed explicitly.
df = pd.read_csv(
    'nlp_course_notes/TextFiles/smsspamcollection.tsv',
    sep='\t',
)

df.head()  # display the first 5 rows

Unnamed: 0,label,message,length,punct
0,ham,"Go until jurong point, crazy.. Available only ...",111,9
1,ham,Ok lar... Joking wif u oni...,29,6
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,155,6
3,ham,U dun say so early hor... U c already then say...,49,6
4,ham,"Nah I don't think he goes to usf, he lives aro...",61,2


In [19]:
# Number of missing values per column; all zeros means no data is missing.
is_empty = df.isnull().sum()

# Number of rows carrying each label.
label_count = df['label'].value_counts()

# x holds the features, y holds the labels.
x = df[['length', 'punct']]
y = df['label']

# Random 70:30 train/test split; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# Logistic regression is a CLASSIFICATION model; `solver` selects the algorithm
# used to optimise the model parameters. Build the model and train it on the
# training data in one step (fit returns the fitted estimator).
lr_model = LogisticRegression(solver='lbfgs').fit(x_train, y_train)

# Use the trained model to predict labels for the held-out test data.
predictions = lr_model.predict(x_test)

# Confusion matrix wrapped in a DataFrame so rows/columns are captioned.
cm = pd.DataFrame(
    metrics.confusion_matrix(y_test, predictions),
    index=['actual ham', 'actual spam'],
    columns=['predicted ham', 'predicted spam'],
)
print(cm)

# Per-class precision, recall and f1-score.
print(metrics.classification_report(y_test, predictions))

# General recipe for these models: import model, build model, fit model, predict with model.

             predicted ham  predicted spam
actual ham            1404              44
actual spam            219               5
              precision    recall  f1-score   support

         ham       0.87      0.97      0.91      1448
        spam       0.10      0.02      0.04       224

   micro avg       0.84      0.84      0.84      1672
   macro avg       0.48      0.50      0.48      1672
weighted avg       0.76      0.84      0.80      1672



In [30]:
# FEATURE EXTRACTION (TRANSFORMING TEXT INTO VECTORS)

# Reload the tab-separated dataset so this cell does not depend on earlier cells.
df = pd.read_csv('nlp_course_notes/TextFiles/smsspamcollection.tsv', sep='\t')

X = df['message']
y = df['label']

# Randomly split the data in a 70:30 ratio (seeded so the split is repeatable).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- Step-by-step version -------------------------------------------------
# First fit the vectorizer to the training data (build the vocabulary and count
# the words), then transform the text into count vectors.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

# Convert word counts to word frequencies weighted by inverse document frequency.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# --- Combined version -----------------------------------------------------
# TfidfVectorizer combines the count-vectorizer and tf-idf steps; this
# deliberately overwrites X_train_tfidf from the step-by-step version above.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)

# After vectorizing the training data, build and train the model.
# random_state is fixed so the solver's internal shuffling is repeatable,
# matching the seeded train/test split above.
clf = LinearSVC(random_state=42)  # linear support vector classifier
clf.fit(X_train_tfidf, y_train)

# --- Pipeline version -----------------------------------------------------
# A pipeline chains the steps: first the word vectorization, then the model
# classification. This way the training and test data do not have to be
# vectorized separately, and the pipeline object behaves like any other
# scikit-learn model.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC(random_state=42)),
])

# Send the training data through the pipeline.
text_clf.fit(X_train, y_train)

predictions = text_clf.predict(X_test)

# Confusion matrix wrapped in a DataFrame so rows/columns are captioned.
cm = pd.DataFrame(
    metrics.confusion_matrix(y_test, predictions),
    index=['actual ham', 'actual spam'],
    columns=['predicted ham', 'predicted spam'],
)
# The results are much better now that the message text is used as the feature.
print(cm)

# Predict a new message as SPAM or HAM.
test1 = text_clf.predict(["Hi, how are you doing today?"])
test2 = text_clf.predict(["Congratulations! You've been selected as a winner for the new iPhone 13!"])

print(test1)
print(test2)

             predicted ham  predicted spam
actual ham            1445               3
actual spam             10             214
['ham']
['ham']


In [None]:
# DATA FILTERING / FORMATTING

df.dropna(inplace=True)  # removes rows with missing data

# Index labels of rows whose review text consists only of whitespace.
blanks = []

# Each row is unpacked as (index, label, review text); any further columns are
# collected into `_rest` so frames with extra columns do not break the unpacking.
for i, lb, rv, *_rest in df.itertuples():
    # Guard against non-string values, which have no .isspace() method.
    if isinstance(rv, str) and rv.isspace():
        blanks.append(i)

# Drop the whitespace-only rows by index label. This must be `drop`, not
# `dropna`: dropna's first argument is the axis, not a list of rows to remove.
df.drop(index=blanks, inplace=True)