# **Assignment 2**

---



# **Imported Libraries**

---



In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import joblib
import string
import math
import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, cross_validate, validation_curve, learning_curve
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn import metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.base import BaseEstimator, TransformerMixin

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer

%matplotlib inline
import matplotlib.pyplot as plt
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets_json')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package tagsets_json to /root/nltk_data...
[nltk_data]   Unzipping help/tagsets_json.zip.


True

# **Task 1 : Data Loading and Preprocessing Pipeline**

---



In [2]:
import pandas as pd
df_train = pd.read_excel('AI_vs_huam_train_dataset.xlsx')
df_test = pd.read_excel('AI_vs_huam_train_dataset.xlsx')

In [3]:
df_train.head()

Unnamed: 0,essay,label
0,International sports events require the most w...,0
1,Globalisation has become a significant aspect ...,0
2,There is an ever-increasing number of bullying...,0
3,"It is commonly believed, that companies should...",0
4,Despite knowing about the adverse effects of c...,0


In [4]:
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

def text_process(essay):
    """
    Cleans and processes an essay string.

    This function takes a raw essay string and performs several text cleaning steps:
    1. Checks if the input is a string; returns an empty string otherwise.
    2. Defines a list of common English stopwords and additional custom words to remove.
    3. Removes all punctuation from the essay.
    4. Removes pure numerical values and mixed alphanumeric tokens.
    5. Removes URLs and emails.
    6. Removes the defined stopwords from the essay.
    7. Applies lemmatization to normalize words.
    8. Joins the remaining words back into a single cleaned string.

    Args:
        essay (str): The input essay string to be processed.

    Returns:
        str: The cleaned essay string with punctuation and stopwords removed,
             or an empty string if the input was not a string.
    """

    if not isinstance(essay, str):
        return ""

    STOPWORDS = stopwords.words('english') + ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure']
    lemmatizer = WordNetLemmatizer()

    nopunc = ''.join([char for char in essay if char not in string.punctuation])


    cleaned_text = re.sub(r'\S*\d\S*', '', nopunc)
    cleaned_text = re.sub(r'http\S+|www.\S+', '', cleaned_text)
    cleaned_text = re.sub(r'\S*@\S*\s?', '', cleaned_text)

    return ' '.join([lemmatizer.lemmatize(word.lower()) for word in cleaned_text.split() if word.lower() not in STOPWORDS])

In [5]:
df_train['clean_essays'] = df_train['essay'].apply(text_process)
df_test['clean_essays'] = df_test['essay'].apply(text_process)

binary_mapping = {0: 0, 1: 1}

df_train['label_num'] = df_train['label'].map(binary_mapping)
df_test['label_num'] = df_test['label'].map(binary_mapping)

In [6]:
train_vocab = set(" ".join(df_train['clean_essays']).split())
test_vocab = set(" ".join(df_test['clean_essays']).split())

In [7]:
X = df_train['clean_essays']
y = df_train['label']



In [8]:
print("Sample train words :", sorted(list(train_vocab))[:20], "...\n")

Sample train words : ['aa', 'aaaas', 'aalready', 'abandon', 'abandond', 'abandoned', 'abandoning', 'abandonned', 'abati', 'abati‚äôs', 'abbbandon', 'abbility', 'abbreviated', 'abbreviation', 'abceventhough', 'abd', 'abdicate', 'abdul', 'abdulazez', 'abdulaziz'] ...



In [9]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# **Task 2 : Feature Extraction with Pipeline Integration**

---



In [10]:
vectorizer = TfidfVectorizer(
    max_features=5000,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.95,
    lowercase=False
)

X_train_tfidf = vectorizer.fit_transform(X_train)
print(f"TF-IDF matrix shape: {X_train_tfidf.shape}")

configs = {
    'n-gram_1': {'max_features': 1000, 'ngram_range': (1, 1)},
    'n-gram_2': {'max_features': 2000, 'ngram_range': (1, 2)},
    'n-gram_3': {'max_features': 3000, 'ngram_range': (1, 3)}
}

for name, config in configs.items():
    test_vec = TfidfVectorizer(lowercase=False, **config)
    test_matrix = test_vec.fit_transform(X_train)
    print(f"{name}: Shape {test_matrix.shape}")


TF-IDF matrix shape: (2982, 5000)
n-gram_1: Shape (2982, 1000)
n-gram_2: Shape (2982, 2000)
n-gram_3: Shape (2982, 3000)


# **Task 3 : Advanced Model Development with GridSearchCV**

---



In [11]:
svm_param_grid = {
    'vectorizer__max_features': [1000],
    'vectorizer__ngram_range': [(1,1), (1,2)],
    'classifier__C': [1, 10],
    'classifier__kernel': ['linear'],
    'classifier__gamma': ['scale']
}

In [12]:
svm_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(lowercase=False)),
    ('classifier', SVC(random_state=42, probability=True))
])

In [13]:
svm_grid_search = GridSearchCV(
    estimator=svm_pipeline,
    param_grid=svm_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

In [14]:
svm_grid_search.fit(X_train, y_train)
print(f"SVM Best parameters: {svm_grid_search.best_params_}")
print(f"SVM Best CV score: {svm_grid_search.best_score_:.4f}")

svm_val_score = svm_grid_search.score(X_val, y_val)
print(f"SVM Validation accuracy: {svm_val_score:.4f}")

Fitting 5 folds for each of 4 candidates, totalling 20 fits
SVM Best parameters: {'classifier__C': 1, 'classifier__gamma': 'scale', 'classifier__kernel': 'linear', 'vectorizer__max_features': 1000, 'vectorizer__ngram_range': (1, 1)}
SVM Best CV score: 0.9745
SVM Validation accuracy: 0.9745


In [15]:
dt_param_grid = {
    'vectorizer__max_features': [1000],
    'vectorizer__ngram_range': [(1,1), (1,2)],
    'classifier__criterion': ['gini'],
    'classifier__max_depth': [10, None],
    'classifier__min_samples_split': [2],
    'classifier__min_samples_leaf': [1]
}


In [16]:
dt_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(lowercase=False)),
    ('classifier', DecisionTreeClassifier(random_state=42))
])

In [17]:
dt_grid_search = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=dt_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1
)

In [18]:
dt_grid_search.fit(X_train, y_train)
print(f"DT Best parameters: {dt_grid_search.best_params_}")
print(f"DT Best CV score: {dt_grid_search.best_score_:.4f}")

dt_val_score = dt_grid_search.score(X_val, y_val)
print(f"DT Validation accuracy: {dt_val_score:.4f}")

Fitting 5 folds for each of 4 candidates, totalling 20 fits
DT Best parameters: {'classifier__criterion': 'gini', 'classifier__max_depth': 10, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'vectorizer__max_features': 1000, 'vectorizer__ngram_range': (1, 2)}
DT Best CV score: 0.9222
DT Validation accuracy: 0.9062


# **Task 4 : Cross-Validation Implementation**

---





In [19]:
best_svm = svm_grid_search.best_estimator_
best_dt = dt_grid_search.best_estimator_

models = {
    'SVM (Optimized)': best_svm,
    'Decision Tree (Optimized)': best_dt
}

cv_results = {}
for name, model in models.items():
    cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    cv_results[name] = cv_scores
    print(f"\n{name} CV Scores: {cv_scores}")
    print(f"Mean: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

stratified_results = {}
for name, model in models.items():
    stratified_scores = cross_val_score(model, X_train, y_train, cv=skf, scoring='accuracy')
    stratified_results[name] = stratified_scores

    regular_mean = cv_results[name].mean()
    stratified_mean = stratified_scores.mean()
    print(f"\n{name} Stratified CV: {stratified_mean:.4f}")
    print(f"Regular vs Stratified difference: {abs(stratified_mean - regular_mean):.4f}")


SVM (Optimized) CV Scores: [0.97822446 0.98157454 0.98154362 0.97483221 0.95637584]
Mean: 0.9745 (+/- 0.0188)

Decision Tree (Optimized) CV Scores: [0.92629816 0.92629816 0.91778523 0.91778523 0.92281879]
Mean: 0.9222 (+/- 0.0076)

SVM (Optimized) Stratified CV: 0.9765
Regular vs Stratified difference: 0.0020

Decision Tree (Optimized) Stratified CV: 0.9101
Regular vs Stratified difference: 0.0121


# **Task 5 : ML Pipeline Implementation**

---



In [20]:
svm_pipeline = Pipeline([('vectorizer', TfidfVectorizer()), ('classifier', SVC())])

dt_pipeline = Pipeline([('vectorizer', TfidfVectorizer()), ('classifier', DecisionTreeClassifier())])

svm_pipeline.fit(X_train, y_train)
dt_pipeline.fit(X_train, y_train)

svm_basic_score = svm_pipeline.score(X_val, y_val)
dt_basic_score = dt_pipeline.score(X_val, y_val)

print(f"SVM Pipeline - Basic: {svm_basic_score:.4f}, Optimized: {svm_val_score:.4f}")
print(f"DT Pipeline - Basic: {dt_basic_score:.4f}, Optimized: {dt_val_score:.4f}")

SVM Pipeline - Basic: 0.9491, Optimized: 0.9745
DT Pipeline - Basic: 0.9196, Optimized: 0.9062


# **Task 6 : Models Evaluation and Analysis**

---



In [21]:
best_models = {
    'SVM': best_svm,
    'Decision Tree': best_dt
}

for name, model in best_models.items():
    print(f"\n{name} Performance:")
    y_pred = model.predict(X_val)

    print(f"Accuracy: {accuracy_score(y_val, y_pred):.4f}")
    print(f"Precision (macro): {precision_score(y_val, y_pred, average='macro'):.4f}")
    print(f"Recall (macro): {recall_score(y_val, y_pred, average='macro'):.4f}")
    print(f"F1-score (macro): {f1_score(y_val, y_pred, average='macro'):.4f}")
    print(f"F1-score (weighted): {f1_score(y_val, y_pred, average='weighted'):.4f}")

    if hasattr(model, 'predict_proba'):
        y_prob = model.predict_proba(X_val)[:, 1]
        print(f"ROC-AUC: {roc_auc_score(y_val, y_prob):.4f}")

    print("Confusion Matrix:")
    print(confusion_matrix(y_val, y_pred))

comparison_data = []
for name, model in best_models.items():
    y_pred = model.predict(X_val)

    comparison_data.append({
        'Model': name,
        'Accuracy': accuracy_score(y_val, y_pred),
        'Precision': precision_score(y_val, y_pred, average='weighted'),
        'Recall': recall_score(y_val, y_pred, average='weighted'),
        'F1-Score': f1_score(y_val, y_pred, average='weighted')
    })

comparison_df = pd.DataFrame(comparison_data)
print(f"\nModel Comparison:")
print(comparison_df.round(4))

best_model_name = comparison_df.loc[comparison_df['Accuracy'].idxmax(), 'Model']
print(f"Best performing model: {best_model_name}")

if 'Decision Tree' in best_models:
    dt_model = best_models['Decision Tree']
    if hasattr(dt_model.named_steps['classifier'], 'feature_importances_'):
        print(f"\nDecision Tree Feature Importance (Top 10):")
        feature_names = dt_model.named_steps['vectorizer'].get_feature_names_out()
        importances = dt_model.named_steps['classifier'].feature_importances_

        top_indices = importances.argsort()[-10:][::-1]
        for i, idx in enumerate(top_indices, 1):
            print(f"{i:2d}. {feature_names[idx]:<15}: {importances[idx]:.4f}")

best_model = best_models[best_model_name]
y_pred = best_model.predict(X_val)

misclassified_mask = y_val != y_pred
print(f"\nError Analysis:")
print(f"Total misclassified: {sum(misclassified_mask)}")
print(f"Error rate: {sum(misclassified_mask) / len(y_val):.4f}")

try:
    df_test = pd.read_csv('Final_test_data.csv')
    df_test['clean_text'] = df_test['essay'].apply(text_process)
    test_predictions = best_model.predict(df_test['clean_text'])

    results_df = pd.DataFrame({
        'essay_id': range(len(df_test)),
        'predicted_label': test_predictions
    })

    results_df.to_csv('Natalia_DelRio_Assignment2_R11859714.csv', index=False)

    from google.colab import files
    files.download('Natalia_DelRio_Assignment2_R11859714.csv')

except Exception as e:
    print(f"Error: {e}")


SVM Performance:
Accuracy: 0.9745
Precision (macro): 0.9746
Recall (macro): 0.9745
F1-score (macro): 0.9745
F1-score (weighted): 0.9745
ROC-AUC: 0.9977
Confusion Matrix:
[[366   7]
 [ 12 361]]

Decision Tree Performance:
Accuracy: 0.9062
Precision (macro): 0.9062
Recall (macro): 0.9062
F1-score (macro): 0.9062
F1-score (weighted): 0.9062
ROC-AUC: 0.9010
Confusion Matrix:
[[336  37]
 [ 33 340]]

Model Comparison:
           Model  Accuracy  Precision  Recall  F1-Score
0            SVM    0.9745     0.9746  0.9745    0.9745
1  Decision Tree    0.9062     0.9062  0.9062    0.9062
Best performing model: SVM

Decision Tree Feature Importance (Top 10):
 1. äôs            : 0.5097
 2. äôt            : 0.1109
 3. additionally   : 0.0761
 4. often          : 0.0474
 5. believe        : 0.0204
 6. sticking       : 0.0153
 7. risk           : 0.0130
 8. order          : 0.0124
 9. ultimately     : 0.0083
10. it äôs         : 0.0077

Error Analysis:
Total misclassified: 19
Error rate: 0.0255


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [22]:
# Save the best performing model to a pickle file
joblib.dump(best_model, 'best_model.pkl')

print("Best model saved to best_model.pkl")

Best model saved to best_model.pkl


In [23]:
# Save the trained Decision Tree classifier to a pickle file
joblib.dump(best_dt, 'decision_tree_model.pkl')

print("Decision Tree model saved to decision_tree_model.pkl")

Decision Tree model saved to decision_tree_model.pkl


In [24]:
# Save the trained SVM classifier to a pickle file
joblib.dump(best_svm, 'svm_model.pkl')

print("SVM model saved to svm_model.pkl")

SVM model saved to svm_model.pkl


In [25]:
from sklearn.ensemble import AdaBoostClassifier

adaboost_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(lowercase=False, max_features=1000, ngram_range=(1,2))),
    ('classifier', AdaBoostClassifier(random_state=42))
])

In [26]:
adaboost_pipeline.fit(X_train, y_train)

In [27]:
y_pred_adaboost = adaboost_pipeline.predict(X_val)
accuracy_adaboost = accuracy_score(y_val, y_pred_adaboost)
print(f"AdaBoost Validation accuracy: {accuracy_adaboost:.4f}")

AdaBoost Validation accuracy: 0.9437


In [28]:
joblib.dump(adaboost_pipeline, 'adaboost_model.pkl')
print("AdaBoost model saved to adaboost_model.pkl")

AdaBoost model saved to adaboost_model.pkl


In [29]:
joblib.dump(adaboost_pipeline, 'adaboost_model.pkl')
print("AdaBoost model saved to adaboost_model.pkl")

AdaBoost model saved to adaboost_model.pkl


In [30]:
# Extract and save the TfidfVectorizer from the best SVM pipeline
tfidf_vectorizer = best_svm.named_steps['vectorizer']
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

print("TF-IDF vectorizer saved to tfidf_vectorizer.pkl")

TF-IDF vectorizer saved to tfidf_vectorizer.pkl


In [31]:
import os
import joblib

# Create the 'models' directory if it doesn't exist
if not os.path.exists('models'):
    os.makedirs('models')
    print("Created 'models' directory")
else:
    print("'models' directory already exists")

# Save the trained SVM model
joblib.dump(best_svm, 'models/svm_model.pkl')
print("SVM model saved as 'models/svm_model.pkl'")

# Save the trained Decision Tree model
joblib.dump(best_dt, 'models/decision_tree_model.pkl')
print("Decision Tree model saved as 'models/decision_tree_model.pkl'")

# Save the trained AdaBoost model
joblib.dump(adaboost_pipeline, 'models/adaboost_model.pkl')
print("AdaBoost model saved as 'models/adaboost_model.pkl'")

# Save the TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, 'models/tfidf_vectorizer.pkl')
print("TF-IDF vectorizer saved as 'models/tfidf_vectorizer.pkl'")

Created 'models' directory
SVM model saved as 'models/svm_model.pkl'
Decision Tree model saved as 'models/decision_tree_model.pkl'
AdaBoost model saved as 'models/adaboost_model.pkl'
TF-IDF vectorizer saved as 'models/tfidf_vectorizer.pkl'
