In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import re

In [None]:
# get data files
# Download the training and validation TSV files into the working directory.
# NOTE(review): re-running this cell presumably makes wget keep numbered copies
# (train-data.tsv.1, ...) while the paths below still point at the first
# download — consider `wget -nc` if that matters.
!wget https://cdn.freecodecamp.org/project-data/sms/train-data.tsv
!wget https://cdn.freecodecamp.org/project-data/sms/valid-data.tsv

# Local file names of the two downloads above.
train_file_path = "train-data.tsv"
test_file_path = "valid-data.tsv"

In [None]:
# The SMS files are read as headerless TSVs (column names are assigned by hand
# elsewhere in this notebook). Without header=None pandas would consume the
# first message of each file as column labels and silently drop it.
# NOTE(review): confirm with `!head -n 1 train-data.tsv` if the data source changes.
train_df = pd.read_csv(train_file_path, delimiter='\t', header=None)
test_df = pd.read_csv(test_file_path, delimiter='\t', header=None)

In [None]:
# Column names shared by both frames: the message label, then the message text.
headers = ["Type","SMS"]

In [None]:
# Give the training frame the shared column names.
train_df.columns = headers

In [None]:
# Same renaming for the validation frame.
test_df.columns = headers

In [None]:
# Display the validation frame.
test_df

In [None]:
# Stack the training and validation rows into one frame with a fresh 0..n-1 index.
# NOTE(review): this merges the provided validation file into the working data;
# a new random hold-out is drawn from the combined frame later in the notebook.
df = pd.concat([train_df,test_df],ignore_index = True)

In [None]:
# (rows, columns) of the combined frame.
df.shape

In [None]:
df

Data preprocessing

In [None]:
# Encode the string labels as integers in a new 'Target' column.
# LabelEncoder numbers classes in sorted order, hence ham -> 0 and spam -> 1
# (assuming those are the only two labels present — TODO confirm).
label_encoder = LabelEncoder()
df['Target'] = label_encoder.fit_transform(df['Type']) # 0=ham # 1 = spam


In [None]:
df

In [None]:
# The string label is redundant now that 'Target' exists.
# NOTE: re-running only this cell raises KeyError because 'Type' is already gone.
df = df.drop("Type",axis=1)

In [None]:
df

In [None]:
# Normalise a single SMS message before vectorisation
def clean_text(text):
    """Return a normalised copy of ``text``.

    Steps, in order: lower-case the message, turn every run of non-word
    characters or underscores into one space, delete all digit runs, trim the
    ends, and collapse any remaining whitespace runs to a single space.

    Parameters
    ----------
    text : str
        Raw message.

    Returns
    -------
    str
        The cleaned message (possibly empty).
    """
    lowered = text.lower()
    without_punct = re.sub(r'[\W_]+', ' ', lowered)
    # Digits are deleted (not replaced by a space), so "a1b" becomes "ab".
    without_digits = re.sub(r'\d+', '', without_punct)
    # Removing a standalone number can leave a double space; squeeze it here.
    return re.sub(r'\s+', ' ', without_digits.strip())


In [None]:
# Replace every raw message with its cleaned form produced by clean_text.
df["SMS"] = df["SMS"].apply(clean_text)

In [None]:
df

In [None]:
#text vectorisation
vectorizer = TfidfVectorizer()
X_train_tf = vectorizer.fit_transform(df["SMS"])

In [None]:
X_train_tf.shape

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X_train_tf,df["Target"],test_size=0.2,random_state=42)

In [None]:
#train the model
# Random forests draw random bootstrap samples and feature subsets; fixing the
# seed (same value as the train/test split) makes the accuracy reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train,y_train)

In [None]:
# (n_samples, n_features) of the training matrix.
X_train.shape

In [None]:
# Random-forest predictions for the held-out rows.
y_pred = model.predict(X_test)

In [None]:
# Fraction of held-out messages the random forest classified correctly.
accuracy_score(y_test,y_pred)

Logistic Regression

In [None]:
# Second baseline: logistic regression with scikit-learn's default settings.
model2 = LogisticRegression()
model2.fit(X_train,y_train)

In [None]:
# Logistic-regression predictions for the held-out rows.
y_pred2= model2.predict(X_test)

In [None]:
# Held-out accuracy of the logistic-regression baseline.
accuracy_score2 = accuracy_score(y_test,y_pred2)


In [None]:
accuracy_score2

In [None]:
# Grid-search setup for the logistic-regression model
# Candidates: both penalties, four C values log-spaced over [0.1, 10],
# the liblinear solver, and two iteration caps.
param_grid = [
    dict(
        penalty=['l1', 'l2'],
        C=np.logspace(-1, 1, 4),
        solver=['liblinear'],
        max_iter=[100, 200],
    )
]

# 3-fold cross-validated search over the grid, using all available cores.
model_gd = GridSearchCV(
    estimator=model2,
    param_grid=param_grid,
    cv=3,
    verbose=True,
    n_jobs=-1,
)

In [None]:
# Run the cross-validated search on the training split.
model_gd.fit(X_train,y_train)

In [None]:
# Mean cross-validated score of the best parameter combination found.
model_gd.best_score_

In [None]:
#prediction function

def predict_sms(text):
    """Classify one SMS message as ``"spam"`` or ``"ham"``.

    The message is passed through the same ``clean_text`` normalisation that
    was applied to the training corpus before it is vectorised, so training
    and prediction see identically pre-processed text.

    Parameters
    ----------
    text : str or sequence of str
        Either the raw message itself, or a one-element list/tuple holding it
        (the form used by the existing callers in this notebook).

    Returns
    -------
    str
        ``"spam"`` if the tuned model predicts class 1, otherwise ``"ham"``.

    Raises
    ------
    ValueError
        If ``text`` does not contain exactly one message.
    """
    messages = [text] if isinstance(text, str) else list(text)
    if len(messages) != 1:
        raise ValueError(
            f"predict_sms expects exactly one message, got {len(messages)}"
        )

    text_tf = vectorizer.transform([clean_text(msg) for msg in messages])
    # predict returns a one-element array; take the scalar instead of relying
    # on the truthiness of an array comparison.
    result = model_gd.predict(text_tf)[0]

    return "spam" if result == 1 else "ham"


In [None]:
# Quick manual check with one hand-written message, wrapped in a list as in the
# other calls in this notebook.
predict_sms(["log into this website www jkjdjk com there is a surprise for you "])

In [None]:
# Run this cell to test your function and model. Do not modify contents.
def test_predictions():
  # Fixed messages and, below, their expected labels in the same order.
  test_messages = ["how are you doing today",
                   "sale today! to stop texts call 98912460324",
                   "i dont want to go. can we try it a different day? available sat",
                   "our new mobile video service is live. just install on your phone to start watching.",
                   "you have won £1000 cash! call to claim your prize.",
                   "i'll bring it tomorrow. don't forget the milk.",
                   "wow, is your arm alright. that happened to me one time too"
                  ]

  test_answers = ["ham", "spam", "ham", "spam", "spam", "ham", "ham"]
  passed = True
  prediction = []

  # Each message is classified on its own, wrapped in a one-element list.
  for msg in test_messages:
    prediction.append(predict_sms([msg]))
  print(prediction)

  # All seven predictions must match; a single mismatch fails the challenge.
  if prediction != test_answers:
    passed = False

  if passed:
    print("You passed the challenge. Great job!")
  else:
    print("You haven't passed yet. Keep trying.")

test_predictions()
