In [3]:
# Install dependencies
!pip install xgboost

# Upload the dataset manually
from google.colab import files
uploaded = files.upload()

# Load the uploaded file (assuming it's '1.csv')
import pandas as pd
data = pd.read_csv('1.csv', encoding='latin')




Saving 1.csv to 1.csv


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score, accuracy_score, matthews_corrcoef
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import joblib
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV

# Download stopwords
nltk.download('stopwords')

# Preprocess dataset
data.rename(columns={'v1': 'Class', 'v2': 'Text'}, inplace=True)
data['numClass'] = data['Class'].map({'ham': 0, 'spam': 1})
data['Count'] = 0
for i in np.arange(0, len(data.Text)):
    data.loc[i, 'Count'] = len(data.loc[i, 'Text'])

# Exploratory analysis
print("Unique values in the Class set: ", data.Class.unique())
ham = data[data.numClass == 0]
print("Number of ham messages in data set:", ham['Class'].count())
spam = data[data.numClass == 1]
print("Number of spam messages in data set:", spam['Class'].count())

# Text vectorization with TF-IDF
stopset = list(stopwords.words("english"))
vectorizer = TfidfVectorizer(stop_words=stopset)
X = vectorizer.fit_transform(data.Text)
y = data.numClass

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

print("Shape of feature matrix:", X.shape)
print("Training set shape:", X_train.shape)
print("Test set shape:", X_test.shape)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unique values in the Class set:  ['ham' 'spam']
Number of ham messages in data set: 4825
Number of spam messages in data set: 747
Shape of feature matrix: (5572, 8536)
Training set shape: (4457, 8536)
Test set shape: (1115, 8536)


In [5]:
# Linear SVC (replacing KNN)
from sklearn.svm import LinearSVC
svc = LinearSVC(C=0.1, class_weight='balanced', max_iter=10000)  # Regularized with C=0.1
svc.fit(X_train, y_train)

y_train_pred = svc.predict(X_train)
y_test_pred = svc.predict(X_test)

svc_train_accuracy = accuracy_score(y_train, y_train_pred)
svc_train_mcc = matthews_corrcoef(y_train, y_train_pred)
svc_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

svc_test_accuracy = accuracy_score(y_test, y_test_pred)
svc_test_mcc = matthews_corrcoef(y_test, y_test_pred)
svc_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print('SVC Model performance for Training set')
print('- Accuracy: %s' % svc_train_accuracy)
print('- MCC: %s' % svc_train_mcc)
print('- F1 score: %s' % svc_train_f1)
print('----------------------------------')
print('SVC Model performance for Test set')
print('- Accuracy: %s' % svc_test_accuracy)
print('- MCC: %s' % svc_test_mcc)
print('- F1 score: %s' % svc_test_f1)

SVC Model performance for Training set
- Accuracy: 0.9910253533767108
- MCC: 0.9624602343369298
- F1 score: 0.9911116658050965
----------------------------------
SVC Model performance for Test set
- Accuracy: 0.9766816143497757
- MCC: 0.8978004830009131
- F1 score: 0.9764781384666793


In [6]:
# Decision Tree (already constrained)
dt = DecisionTreeClassifier(max_depth=5, class_weight='balanced')
dt.fit(X_train, y_train)

y_train_pred = dt.predict(X_train)
y_test_pred = dt.predict(X_test)

dt_train_accuracy = accuracy_score(y_train, y_train_pred)
dt_train_mcc = matthews_corrcoef(y_train, y_train_pred)
dt_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

dt_test_accuracy = accuracy_score(y_test, y_test_pred)
dt_test_mcc = matthews_corrcoef(y_test, y_test_pred)
dt_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print('DT Model performance for Training set')
print('- Accuracy: %s' % dt_train_accuracy)
print('- MCC: %s' % dt_train_mcc)
print('- F1 score: %s' % dt_train_f1)
print('----------------------------------')
print('DT Model performance for Test set')
print('- Accuracy: %s' % dt_test_accuracy)
print('- MCC: %s' % dt_test_mcc)
print('- F1 score: %s' % dt_test_f1)

DT Model performance for Training set
- Accuracy: 0.9450302894323536
- MCC: 0.7684464116954787
- F1 score: 0.9455772888332558
----------------------------------
DT Model performance for Test set
- Accuracy: 0.9488789237668162
- MCC: 0.7837994593142213
- F1 score: 0.9493726116565195


In [7]:
# Random Forest (relaxed regularization)
rf = RandomForestClassifier(
    n_estimators=50,  # Increased from 30
    max_depth=30,     # Increased from 20
    min_samples_split=5,
    class_weight='balanced'
)
rf.fit(X_train, y_train)

y_train_pred = rf.predict(X_train)
y_test_pred = rf.predict(X_test)

rf_train_accuracy = accuracy_score(y_train, y_train_pred)
rf_train_mcc = matthews_corrcoef(y_train, y_train_pred)
rf_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

rf_test_accuracy = accuracy_score(y_test, y_test_pred)
rf_test_mcc = matthews_corrcoef(y_test, y_test_pred)
rf_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print('RF Model performance for Training set')
print('- Accuracy: %s' % rf_train_accuracy)
print('- MCC: %s' % rf_train_mcc)
print('- F1 score: %s' % rf_train_f1)
print('----------------------------------')
print('RF Model performance for Test set')
print('- Accuracy: %s' % rf_test_accuracy)
print('- MCC: %s' % rf_test_mcc)
print('- F1 score: %s' % rf_test_f1)

RF Model performance for Training set
- Accuracy: 0.9894547902176352
- MCC: 0.9540836834302537
- F1 score: 0.9892792140048106
----------------------------------
RF Model performance for Test set
- Accuracy: 0.9820627802690582
- MCC: 0.9208523277239943
- F1 score: 0.9815690139456956


In [8]:
# Neural Network (MLP, with dropout to reduce overfitting)
mlp = MLPClassifier(
    hidden_layer_sizes=(50,),
    alpha=0.5,  # Increased from 0.1
    max_iter=1000,
    learning_rate='adaptive',
    random_state=42,
    early_stopping=True,  # Added to prevent overfitting
    validation_fraction=0.1
)
mlp.fit(X_train, y_train)

y_train_pred = mlp.predict(X_train)
y_test_pred = mlp.predict(X_test)

mlp_train_accuracy = accuracy_score(y_train, y_train_pred)
mlp_train_mcc = matthews_corrcoef(y_train, y_train_pred)
mlp_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

mlp_test_accuracy = accuracy_score(y_test, y_test_pred)
mlp_test_mcc = matthews_corrcoef(y_test, y_test_pred)
mlp_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print('MLP Model performance for Training set')
print('- Accuracy: %s' % mlp_train_accuracy)
print('- MCC: %s' % mlp_train_mcc)
print('- F1 score: %s' % mlp_train_f1)
print('----------------------------------')
print('MLP Model performance for Test set')
print('- Accuracy: %s' % mlp_test_accuracy)
print('- MCC: %s' % mlp_test_mcc)
print('- F1 score: %s' % mlp_test_f1)

MLP Model performance for Training set
- Accuracy: 0.9721785954678035
- MCC: 0.8760669171000149
- F1 score: 0.97091543728438
----------------------------------
MLP Model performance for Test set
- Accuracy: 0.9614349775784753
- MCC: 0.8245458936830266
- F1 score: 0.959063784276791


In [9]:
# XGBoost with expanded regularization
xgb_clf = xgb.XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=5,  # Added to reduce overfitting
    gamma=1,             # Added for regularization
    scale_pos_weight=4825/747,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)
xgb_clf.fit(X_train, y_train)

y_train_pred = xgb_clf.predict(X_train)
y_test_pred = xgb_clf.predict(X_test)

xgb_train_accuracy = accuracy_score(y_train, y_train_pred)
xgb_train_mcc = matthews_corrcoef(y_train, y_train_pred)
xgb_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

xgb_test_accuracy = accuracy_score(y_test, y_test_pred)
xgb_test_mcc = matthews_corrcoef(y_test, y_test_pred)
xgb_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print('XGBoost Model performance for Training set')
print('- Accuracy: %s' % xgb_train_accuracy)
print('- MCC: %s' % xgb_train_mcc)
print('- F1 score: %s' % xgb_train_f1)
print('----------------------------------')
print('XGBoost Model performance for Test set')
print('- Accuracy: %s' % xgb_test_accuracy)
print('- MCC: %s' % xgb_test_mcc)
print('- F1 score: %s' % xgb_test_f1)

Parameters: { "use_label_encoder" } are not used.



XGBoost Model performance for Training set
- Accuracy: 0.9831725375813327
- MCC: 0.929674573327478
- F1 score: 0.983362326276744
----------------------------------
XGBoost Model performance for Test set
- Accuracy: 0.9748878923766816
- MCC: 0.8904115205851921
- F1 score: 0.9747430039725188


In [10]:
# Stacking Classifier with calibration
estimator_list = [
    ('svc', svc),
    ('dt', dt),
    ('rf', rf),
    ('mlp', mlp),
    ('xgb', xgb_clf)
]

stack_model = StackingClassifier(
    estimators=estimator_list,
    final_estimator=LogisticRegression(C=0.5),  # Relaxed from 0.1
    cv=5
)
stack_model = CalibratedClassifierCV(stack_model, cv=5, method='sigmoid')
print("Training the stacked model with calibration...")
stack_model.fit(X_train, y_train)

y_train_pred = stack_model.predict(X_train)
y_test_pred = stack_model.predict(X_test)

stack_model_train_accuracy = accuracy_score(y_train, y_train_pred)
stack_model_train_mcc = matthews_corrcoef(y_train, y_train_pred)
stack_model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

stack_model_test_accuracy = accuracy_score(y_test, y_test_pred)
stack_model_test_mcc = matthews_corrcoef(y_test, y_test_pred)
stack_model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

print('Stacked Model performance for Training set')
print('- Accuracy: %s' % stack_model_train_accuracy)
print('- MCC: %s' % stack_model_train_mcc)
print('- F1 score: %s' % stack_model_train_f1)
print('----------------------------------')
print('Stacked Model performance for Test set')
print('- Accuracy: %s' % stack_model_test_accuracy)
print('- MCC: %s' % stack_model_test_mcc)
print('- F1 score: %s' % stack_model_test_f1)

Training the stacked model with calibration...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encode

Stacked Model performance for Training set
- Accuracy: 0.9925959165357864
- MCC: 0.9679962477095162
- F1 score: 0.9925774761054852
----------------------------------
Stacked Model performance for Test set
- Accuracy: 0.9829596412556054
- MCC: 0.9248528519166247
- F1 score: 0.9826279487090812


In [11]:
# Compile results
acc_train_list = {
    'svc': svc_train_accuracy,
    'dt': dt_train_accuracy,
    'rf': rf_train_accuracy,
    'mlp': mlp_train_accuracy,
    'xgb': xgb_train_accuracy,
    'stack': stack_model_train_accuracy
}

mcc_train_list = {
    'svc': svc_train_mcc,
    'dt': dt_train_mcc,
    'rf': rf_train_mcc,
    'mlp': mlp_train_mcc,
    'xgb': xgb_train_mcc,
    'stack': stack_model_train_mcc
}

f1_train_list = {
    'svc': svc_train_f1,
    'dt': dt_train_f1,
    'rf': rf_train_f1,
    'mlp': mlp_train_f1,
    'xgb': xgb_train_f1,
    'stack': stack_model_train_f1
}

acc_test_list = {
    'svc': svc_test_accuracy,
    'dt': dt_test_accuracy,
    'rf': rf_test_accuracy,
    'mlp': mlp_test_accuracy,
    'xgb': xgb_test_accuracy,
    'stack': stack_model_test_accuracy
}

mcc_test_list = {
    'svc': svc_test_mcc,
    'dt': dt_test_mcc,
    'rf': rf_test_mcc,
    'mlp': mlp_test_mcc,
    'xgb': xgb_test_mcc,
    'stack': stack_model_test_mcc
}

f1_test_list = {
    'svc': svc_test_f1,
    'dt': dt_test_f1,
    'rf': rf_test_f1,
    'mlp': mlp_test_f1,
    'xgb': xgb_test_f1,
    'stack': stack_model_test_f1
}

# Training results
acc_df = pd.DataFrame.from_dict(acc_train_list, orient='index', columns=['Accuracy'])
mcc_df = pd.DataFrame.from_dict(mcc_train_list, orient='index', columns=['MCC'])
f1_df = pd.DataFrame.from_dict(f1_train_list, orient='index', columns=['F1'])
train_df = pd.concat([acc_df, mcc_df, f1_df], axis=1)
print("Training Results:\n", train_df)

# Test results
acc_test_df = pd.DataFrame.from_dict(acc_test_list, orient='index', columns=['Accuracy'])
mcc_test_df = pd.DataFrame.from_dict(mcc_test_list, orient='index', columns=['MCC'])
f1_test_df = pd.DataFrame.from_dict(f1_test_list, orient='index', columns=['F1'])
test_df = pd.concat([acc_test_df, mcc_test_df, f1_test_df], axis=1)
print("Test Results:\n", test_df)

# Save results locally
train_df.to_csv('/content/train_results_final_balanced.csv')
test_df.to_csv('/content/test_results_final_balanced.csv')

Training Results:
        Accuracy       MCC        F1
svc    0.991025  0.962460  0.991112
dt     0.945030  0.768446  0.945577
rf     0.989455  0.954084  0.989279
mlp    0.972179  0.876067  0.970915
xgb    0.983173  0.929675  0.983362
stack  0.992596  0.967996  0.992577
Test Results:
        Accuracy       MCC        F1
svc    0.976682  0.897800  0.976478
dt     0.948879  0.783799  0.949373
rf     0.982063  0.920852  0.981569
mlp    0.961435  0.824546  0.959064
xgb    0.974888  0.890412  0.974743
stack  0.982960  0.924853  0.982628


In [12]:
# Save the stacked model locally
joblib.dump(stack_model, '/content/stack_final_balanced.sav')

# Load and predict
loaded_model = joblib.load('/content/stack_final_balanced.sav')
A = vectorizer.transform(data.Text)
dd = loaded_model.predict(A)
tac = dd[14:20]
p = 10

for i in tac:
    if i == 1:
        print("Spam message:", data['Text'][p], "\n")
    else:
        print("Legitimate message:", data['Text'][p], "\n")
    p += 1

# Download results and model
from google.colab import files
#files.download('/content/train_results_final_balanced.csv')
#files.download('/content/test_results_final_balanced.csv')
files.download('/content/stack_final_balanced.sav')

Legitimate message: I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today. 

Spam message: SIX chances to win CASH! From 100 to 20,000 pounds txt> CSH11 and send to 87575. Cost 150p/day, 6days, 16+ TsandCs apply Reply HL 4 info 

Legitimate message: URGENT! You have won a 1 week FREE membership in our å£100,000 Prize Jackpot! Txt the word: CLAIM to No: 81010 T&C www.dbuk.net LCCLTD POBOX 4403LDNW1A7RW18 

Legitimate message: I've been searching for the right words to thank you for this breather. I promise i wont take your help for granted and will fulfil my promise. You have been wonderful and a blessing at all times. 

Legitimate message: I HAVE A DATE ON SUNDAY WITH WILL!! 

Spam message: XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap. xxxmobilemovieclub.com?n=QJKGIGHJJGCBL 



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.calibration import CalibratedClassifierCV
import xgboost as xgb
import joblib
import nltk
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')

# Load and preprocess data
data = pd.read_csv('1.csv', encoding='latin')
data.rename(columns={'v1': 'Class', 'v2': 'Text'}, inplace=True)
data['numClass'] = data['Class'].map({'ham': 0, 'spam': 1})

# Vectorize text
stopset = list(stopwords.words("english"))
vectorizer = TfidfVectorizer(stop_words=stopset)
X = vectorizer.fit_transform(data.Text)
y = data.numClass

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Define base models (using your original parameters)
svc = LinearSVC(C=0.1, class_weight='balanced', max_iter=10000)
dt = DecisionTreeClassifier(max_depth=5, class_weight='balanced')
rf = RandomForestClassifier(n_estimators=50, max_depth=30, min_samples_split=5, class_weight='balanced')
mlp = MLPClassifier(hidden_layer_sizes=(50,), alpha=0.5, max_iter=1000, learning_rate='adaptive', random_state=42, early_stopping=True, validation_fraction=0.1)
xgb_clf = xgb.XGBClassifier(n_estimators=100, max_depth=6, learning_rate=0.1, subsample=0.8, colsample_bytree=0.8, min_child_weight=5, gamma=1, scale_pos_weight=4825/747, random_state=42, eval_metric='logloss')

# Train base models
svc.fit(X_train, y_train)
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
mlp.fit(X_train, y_train)
xgb_clf.fit(X_train, y_train)

# Define and train stacking model
estimator_list = [('svc', svc), ('dt', dt), ('rf', rf), ('mlp', mlp), ('xgb', xgb_clf)]
stack_model = StackingClassifier(estimators=estimator_list, final_estimator=LogisticRegression(C=0.5), cv=5)
stack_model = CalibratedClassifierCV(stack_model, cv=5, method='sigmoid')
stack_model.fit(X_train, y_train)

# Save the model and vectorizer
joblib.dump(stack_model, '/content/stack_final_balanced.sav')
joblib.dump(vectorizer, '/content/vectorizer.sav')  # Save vectorizer for future use

print("Model and vectorizer saved successfully!")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Model and vectorizer saved successfully!


In [17]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the saved model and vectorizer
loaded_model = joblib.load('/content/stack_final_balanced.sav')
vectorizer = joblib.load('/content/vectorizer.sav')  # Load the saved vectorizer

# Custom statement
custom_statement = "hello world"

# Transform and predict
custom_transformed = vectorizer.transform([custom_statement])
prediction = loaded_model.predict(custom_transformed)

# Output result
if prediction[0] == 1:
    print(f"Prediction for '{custom_statement}': Spam")
else:
    print(f"Prediction for '{custom_statement}': Legitimate (Ham)")

Prediction for 'hello world': Legitimate (Ham)


In [19]:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the saved model and vectorizer
loaded_model = joblib.load('/content/stack_final_balanced.sav')
vectorizer = joblib.load('/content/vectorizer.sav')  # Assuming you saved it during training

# Get user input
custom_statement = input("Enter a message to classify (e.g., 'Win a free iPhone now!'): ")

# Transform the user input using the loaded vectorizer
custom_transformed = vectorizer.transform([custom_statement])

# Predict using the loaded model
prediction = loaded_model.predict(custom_transformed)

# Interpret and display the result
if prediction[0] == 1:
    print(f"Prediction for '{custom_statement}': Spam")
else:
    print(f"Prediction for '{custom_statement}': Legitimate (Ham)")

Enter a message to classify (e.g., 'Win a free iPhone now!'): 'Win a free iPhone now!
Prediction for ''Win a free iPhone now!': Spam
