## **1. Importing Necessary Libraries**


In [1]:
# Import necessary libraries for data manipulation, visualization, and machine learning
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import joblib
import string
import math
import time

# Sklearn libraries for machine learning and text processing
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import svm, datasets
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold, cross_validate, validation_curve, learning_curve
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn import metrics


# NLTK libraries for text processing (lemmatization, stemming, stopwords, POS tagging)
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

# Set up visualization
%matplotlib inline
import matplotlib.pyplot as plt
sns.set_style("whitegrid")
plt.style.use("fivethirtyeight")

# Download necessary NLTK resources for text processing
nltk.download('wordnet')  # WordNet for lemmatization
nltk.download('omw-1.4')  # Open Multilingual Wordnet
nltk.download('punkt')  # Tokenizer
nltk.download('punkt_tab')
nltk.download('stopwords')  # Stopwords for text cleaning
nltk.download('averaged_perceptron_tagger')  # POS tagger for part-of-speech tagging
nltk.download('averaged_perceptron_tagger_eng')  # Additional tagger
nltk.download('tagsets_json')  # Tagset resource

print("\n All libraries imported successfully!")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data] Downloading package tagsets_json to /root/nltk_data...



 All libraries imported successfully!


[nltk_data]   Unzipping help/tagsets_json.zip.


## **2. Data Overview and Importing**


# Import Data

In [2]:
import pandas as pd
ds_train = pd.read_excel('AI_vs_huam_train_dataset.xlsx')
ds_test = pd.read_csv('Final_test_data.csv')

# 💾 Reading a text-based dataset into pandas

In [3]:
print("Dataset Information:")
print(f"Training data shape: {ds_train.shape}")
print(f"Test data shape: {ds_test.shape}")
print(f"Columns: {ds_train.columns.tolist()}")

# Show first few rows
print("\nFirst 5 rows of training data:")
print(ds_train.head())

# Check sentiment distribution
print("\nSentiment distribution:")
print(ds_train['label'].value_counts())

Dataset Information:
Training data shape: (3728, 2)
Test data shape: (869, 2)
Columns: ['essay', 'label']

First 5 rows of training data:
                                               essay  label
0  International sports events require the most w...      0
1  Globalisation has become a significant aspect ...      0
2  There is an ever-increasing number of bullying...      0
3  It is commonly believed, that companies should...      0
4  Despite knowing about the adverse effects of c...      0

Sentiment distribution:
label
0    1864
1    1864
Name: count, dtype: int64


## **3. Text Pre-processing**



In [4]:
#  Advanced Text Preprocessing with Lemmatization

class text_processing:

  def __init__(self):
    self.lemmatizer = WordNetLemmatizer()
    self.STOPWORDS = set(stopwords.words('english'))
    self.STOPWORDS.update(['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure'])



  def text_cleaning_and_normalization(self, essay):

    # Handle non-string values
    if not isinstance(essay, str):
        return ""
    #convert to lowercase
    essay = essay.lower()
    #remove numerical value and alphanumeric tokens
    essay = re.sub(r'\w*\d\w*', '', essay) #using re pattern
    #remove punnc
    nopunc = ''.join([char for char in essay if char not in string.punctuation])
    essay = ' '.join([word for word in nopunc.split() if word.lower() not in self.STOPWORDS])
    return essay
  #Lemmatization
  def lemmatization(self, essay):
    tokens = nltk.word_tokenize(essay)
    lemmetized_tokens = []
    for token in tokens:
      if token not in self.STOPWORDS:
        lemmetized_tokens.append(self.lemmatizer.lemmatize(token))
    return ' '.join(lemmetized_tokens)
  #preprocess
  def preprocess(self, essay):
    cleaned_essay = self.text_cleaning_and_normalization(essay)
    lemmatized_text = self.lemmatization(cleaned_essay)
    return lemmatized_text

print("Text preprocessing class created!")

Text preprocessing class created!


In [5]:
# Apply Text Preprocessing to the "text" column

# Initialize the preprocessor
preprocessor = text_processing()

print("Applying advanced text preprocessing with lemmatization...")

# Apply preprocessing to training data
ds_train['clean_essay'] = ds_train['essay'].apply(preprocessor.preprocess)

# Apply preprocessing to test data
ds_test['clean_essay'] = ds_test['essay'].apply(preprocessor.preprocess)

print("Text preprocessing completed!")

Applying advanced text preprocessing with lemmatization...
Text preprocessing completed!


In [6]:
# Show examples of preprocessing
ds_train.head()

Unnamed: 0,essay,label,clean_essay
0,International sports events require the most w...,0,international sport event require welltrained ...
1,Globalisation has become a significant aspect ...,0,globalisation become significant aspect world ...
2,There is an ever-increasing number of bullying...,0,everincreasing number bullying activity numero...
3,"It is commonly believed, that companies should...",0,commonly believed company dress code policy em...
4,Despite knowing about the adverse effects of c...,0,despite knowing adverse effect climate change ...


In [7]:
# Define features and target
X = ds_train['clean_essay']
y = ds_train['label']

print(f"Features (X): {len(X)} input text samples")
print(f"Target (y): {len(y)} labels")

Features (X): 3728 input text samples
Target (y): 3728 labels


In [8]:
# Split Data for Training and Validation Sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)    # stratify=y Maintain label distribution

In [9]:
print("Data split completed:")
print(f"Training samples: {len(X_train)}")
print(f"Validation samples: {len(X_val)}")
print(f"Training label distribution: {np.bincount(y_train)}")
print(f"Validation label distribution: {np.bincount(y_val)}")

Data split completed:
Training samples: 2982
Validation samples: 746
Training label distribution: [1491 1491]
Validation label distribution: [373 373]


## **4. Representing Text as Numerical Data using TF-IDF vectorization**



In [10]:
# Transform Text to Numbers Using TF-IDF
# Create TF-IDF vectorizer with optimal parameters found through experimentation
tfidf_vectorizer = TfidfVectorizer(
    max_features=3728,        #limit vocab size to 3728
    binary=True,
    token_pattern=r'\b\w+\b', #pattern (includes words with 1 char + more)
    ngram_range=(1, 1), #minimum 1 max words 1
    max_df=0.9,         #Ignore terms appearing in more than 90% of documents
    min_df=2            #Ignore terms appearing in less than 2 documents
)

print("TF-IDF Vectorizer created with optimal parameters:")
print(f"- Max features: 3728")
print(f"- N-gram range: (1, 1) - unigrams only")
print(f"- Min document frequency: 2")
print(f"- Max document frequency: 0.95")

TF-IDF Vectorizer created with optimal parameters:
- Max features: 3728
- N-gram range: (1, 1) - unigrams only
- Min document frequency: 2
- Max document frequency: 0.95


In [11]:
# Fit TF-IDF on training data and transform both sets
print("Converting text to numerical features using TF-IDF...")

X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_val_tfidf = tfidf_vectorizer.transform(X_val)

print("TF-IDF transformation completed!")
print(f"Training matrix shape: {X_train_tfidf.shape}")
print(f"Validation matrix shape: {X_val_tfidf.shape}")

# Show some feature names
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"Total features created: {len(feature_names)}")
print(f"Sample features: {list(feature_names[:15])}")

Converting text to numerical features using TF-IDF...
TF-IDF transformation completed!
Training matrix shape: (2982, 3728)
Validation matrix shape: (746, 3728)
Total features created: 3728
Sample features: ['abandon', 'ability', 'able', 'abroad', 'absence', 'absolute', 'absolutely', 'absorb', 'abundance', 'abuse', 'academic', 'academically', 'accept', 'acceptable', 'accepted']


# **5. Machine Learning Model Training


In [12]:
# Train Support Vector Machines (SVM)

print("Training Support Vector Machines (SVM)...")

# Create and train SVM with best parameters
svm_model = SVC(
    C=100,
    kernel='linear',
    gamma=0.1,
    random_state=42,
    probability=True
)

# lr_model = LogisticRegression(
#     C=1.0,                    # Regularization strength (found to be optimal)
#     solver='liblinear',       # Optimization algorithm
#     random_state=42,          # For reproducible results
#     max_iter=1000            # Maximum iterations
# )

# Train the model
start_time = time.time()
svm_model.fit(X_train_tfidf, y_train)
svm_train_time = time.time() - start_time

# Make predictions on validation set
svm_predictions = svm_model.predict(X_val_tfidf)
svm_accuracy = metrics.accuracy_score(y_val, svm_predictions)

print("Support Vector Machine (SVM) Results:")
print(f"Training time: {svm_train_time:.3f} seconds")
print(f"Validation accuracy: {svm_accuracy:.4f}")

Training Support Vector Machines (SVM)...
Support Vector Machine (SVM) Results:
Training time: 20.545 seconds
Validation accuracy: 0.9799


AdaBoostClassifier

In [13]:
from sklearn.ensemble import AdaBoostClassifier

#Train AdaBoostClassifier

print("Training AdaBoostClassifier...")

# Create and train AdaBoostClassifier with best parameters
ada_model = AdaBoostClassifier(
    n_estimators=50,
    learning_rate=1.0,
    random_state=42
)

#Train the model
start_time = time.time()
ada_model.fit(X_train_tfidf, y_train)
ada_train_time = time.time() - start_time

#Make predictions on validation set
ada_predictions = ada_model.predict(X_val_tfidf)
ada_accuracy = metrics.accuracy_score(y_val, ada_predictions)


print("AdaBoostClassifier Results:")
print(f"Training time: {ada_train_time:.3f} seconds")
print(f"Validation accuracy: {ada_accuracy:.4f}")

Training AdaBoostClassifier...
AdaBoostClassifier Results:
Training time: 2.760 seconds
Validation accuracy: 0.9397


In [14]:
# Train Decision Tree

print("Training Decision Tree model...")

# Create and Decision Tree with best parameters
tree_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=None,
    random_state=42
)
# nb_model = MultinomialNB(
#     alpha=1.0,               # Smoothing parameter (found to be optimal)
#     fit_prior=True           # Learn class prior probabilities
# )

# Train the model
start_time = time.time()
tree_model.fit(X_train_tfidf, y_train)
tree_train_time = time.time() - start_time

# Make predictions on validation set
tree_predictions = tree_model.predict(X_val_tfidf)
tree_accuracy = metrics.accuracy_score(y_val, tree_predictions)

print("Decision Tree Results:")
print(f"Training time: {tree_train_time:.3f} seconds")
print(f"Validation accuracy: {tree_accuracy:.4f}")

Training Decision Tree model...
Decision Tree Results:
Training time: 0.583 seconds
Validation accuracy: 0.9169


Compare Model Performance

In [15]:
#Compare Model Performance

print("Model Performance Comparison:")
print("=" * 40)
print(f"Support Vector Machine (SVM):      {svm_accuracy:.4f}")
print(f"Decision Tree:  {tree_accuracy:.4f}")
print(f"AdaBoostClassifier: {ada_accuracy:.4f}")

# Determine the best model based on accuracy
accuracies = {
    "SVM": svm_accuracy,
    "AdaBoostClassifier": ada_accuracy,
    "Decision Tree": tree_accuracy
}

# Find the model with the highest accuracy
best_model_name = max(accuracies, key=accuracies.get)
best_accuracy = accuracies[best_model_name]

# Map model names to actual model objects
model_map = {
    "SVM": svm_model,
    "AdaBoostClassifier": ada_model,
    "Decision Tree": tree_model
}
best_model = model_map[best_model_name]

# Output
print(f"\nBest Model: {best_model_name}")
print(f"Best Accuracy: {best_accuracy:.4f}")

Model Performance Comparison:
Support Vector Machine (SVM):      0.9799
Decision Tree:  0.9169
AdaBoostClassifier: 0.9397

Best Model: SVM
Best Accuracy: 0.9799


Evaluate Best Model with Cross-Validation

In [16]:
#Evaluate Best Model with Cross-Validation

print(f"Performing 5-fold cross-validation on {best_model_name}...")

# Perform cross-validation to get more reliable performance estimate
cv_scores = cross_val_score(
    best_model,
    X_train_tfidf,
    y_train,
    cv=5,              # 5-fold cross-validation
    scoring='accuracy'
)

print("Cross-Validation Results:")
print(f"CV Scores: {cv_scores}")
print(f"Mean CV Score: {cv_scores.mean():.4f}")
print(f"Standard Deviation: {cv_scores.std():.4f}")
print(f"95% Confidence Interval: {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

Performing 5-fold cross-validation on SVM...
Cross-Validation Results:
CV Scores: [0.98659966 0.98492462 0.98489933 0.98154362 0.97315436]
Mean CV Score: 0.9822
Standard Deviation: 0.0048
95% Confidence Interval: 0.9822 (+/- 0.0096)


Analyze Feature Importance (for SVM)

In [17]:
# Analyze Feature Importance (for SVM)

if best_model_name == "SVM":
    print("Analyzing feature importance for SVM...")

    # Get feature coefficients
    feature_coefficients = best_model.coef_.toarray().flatten() #converting the matrix to a dense format NumPy array before calling .argsort()
    feature_names = tfidf_vectorizer.get_feature_names_out()

    # Find most important AI features (indicate positive sentiment)
    top_AI_indices = feature_coefficients.argsort()[-10:][::-1]
    print("Top 10 features for AI:")
    for i, idx in enumerate(top_AI_indices, 1):
        print(f"  {i:2d}. {feature_names[idx]:<15} (coefficient: {feature_coefficients[idx]:+.4f})")

    # Find most important negative features (indicate negative sentiment)
    top_Human_indices = feature_coefficients.argsort()[:10]
    print("\nTop 10 features for Human:")
    for i, idx in enumerate(top_Human_indices, 1):
        print(f"  {i:2d}. {feature_names[idx]:<15} (coefficient: {feature_coefficients[idx]:+.4f})")

Analyzing feature importance for SVM...
Top 10 features for AI:
   1. äôs             (coefficient: +4.0939)
   2. additionally    (coefficient: +3.1959)
   3. often           (coefficient: +3.1352)
   4. äôt             (coefficient: +2.9403)
   5. it              (coefficient: +2.8576)
   6. believe         (coefficient: +2.5727)
   7. truly           (coefficient: +2.3580)
   8. summary         (coefficient: +2.1881)
   9. sticking        (coefficient: +2.0883)
  10. willing         (coefficient: +2.0531)

Top 10 features for Human:
   1. order           (coefficient: -2.4630)
   2. conclude        (coefficient: -2.0516)
   3. thus            (coefficient: -1.8747)
   4. nowadays        (coefficient: -1.8571)
   5. addition        (coefficient: -1.7752)
   6. argued          (coefficient: -1.7173)
   7. get             (coefficient: -1.5407)
   8. therefore       (coefficient: -1.4721)
   9. hence           (coefficient: -1.4654)
  10. indeed          (coefficient: -1.3491)


## **6. Model saving for Streamlit deployment**

1. For Support Vector Machine

In [18]:
# Create ML Pipeline for Deployment

print("Creating ML Pipeline for deployment...")

# Create a complete pipeline that includes preprocessing and prediction
if best_model_name == "SVM":
    final_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=3728,
            binary=True,
            token_pattern=r'\b\w+\b',
            ngram_range=(1, 1),
            max_df=0.9,
            min_df=2
        )),
        ('classifier', svm.SVC(C=100, kernel='rbf', gamma=0.1, probability=True
        ))
    ])
elif best_model_name == "AdaBoostClassifier":
    final_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=3728,
            binary=True,
            token_pattern=r'\b\w+\b',
            ngram_range=(1,1),
            max_df=0.9,
            min_df=2
        )),
        ('classifier', AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=42))
    ])
else:
    final_pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(
            max_features=3728,
            binary=True,
            token_pattern=r'\b\w+\b',
            ngram_range=(1, 1),
            min_df=0.9,
            max_df=2,
            stop_words='english'
        )),
        ('classifier', DecisionTreeClassifier(max_features=2000, binary=True, ngram_range=(1,2))), ('classifier', DecisionTreeClassifier(criterion='entropy', max_depth=None))
    ])

print("Pipeline created with steps:")
for name, step in final_pipeline.steps:
    print(f"  {name}: {step.__class__.__name__}")

Creating ML Pipeline for deployment...
Pipeline created with steps:
  tfidf: TfidfVectorizer
  classifier: SVC


In [19]:
#Train Final Pipeline on All Training Data

print("Training final pipeline on all available training data...")

# Train pipeline on the complete training set
final_pipeline.fit(X, y)

# Test pipeline performance on validation set
pipeline_predictions = final_pipeline.predict(X_val)
pipeline_accuracy = metrics.accuracy_score(y_val, pipeline_predictions)

print("Final Pipeline Results:")
print(f"Validation accuracy: {pipeline_accuracy:.4f}")

# Show detailed classification report
print("\nDetailed Classification Report:")
target_names = ['Human', 'AI']
classification_report = metrics.classification_report(y_val, pipeline_predictions, target_names=target_names)
print(classification_report)

Training final pipeline on all available training data...
Final Pipeline Results:
Validation accuracy: 1.0000

Detailed Classification Report:
              precision    recall  f1-score   support

       Human       1.00      1.00      1.00       373
          AI       1.00      1.00      1.00       373

    accuracy                           1.00       746
   macro avg       1.00      1.00      1.00       746
weighted avg       1.00      1.00      1.00       746



In [20]:
#Save Models for Streamlit Deployment

print("Saving models for Streamlit deployment...")

# Save the complete pipeline (includes TF-IDF + classifier)
pipeline_filename = 'AI_vs_Human_Analyzer_pipeline.pkl'
joblib.dump(final_pipeline, pipeline_filename)

# Also save individual components for flexibility
tfidf_filename = 'tfidf_vectorizer.pkl'
model_filename = f'{best_model_name.lower().replace(" ", "_")}_model.pkl'

joblib.dump(tfidf_vectorizer, tfidf_filename)
joblib.dump(best_model, model_filename)

print("Models saved successfully!")
print(f"Complete pipeline: {pipeline_filename}")
print(f"TF-IDF vectorizer: {tfidf_filename}")
print(f"Best model: {model_filename}")

Saving models for Streamlit deployment...
Models saved successfully!
Complete pipeline: AI_vs_Human_Analyzer_pipeline.pkl
TF-IDF vectorizer: tfidf_vectorizer.pkl
Best model: svm_model.pkl


For AdaBoost

In [21]:
#Create and train the AdaBoost classifier
print("Creating AdaBoost classifier...")
ada_classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.0, random_state=42)

print("Training AdaBoost on existing TF-IDF features...")
ada_classifier.fit(X_train_tfidf, y_train)

#Test clasifier performance
ada_predictions = ada_classifier.predict(X_val_tfidf)
ada_accuracy = metrics.accuracy_score(y_val, ada_predictions)

print(f"\nAdaBoost Classifier Results:")
print(f"Validation accuracy: {ada_accuracy:.4f}")

#Show detailed classification report
print("\nDetailed Classification Report - AdaBoost:")
target_names = ['Human', 'AI']
ada_classification_report = metrics.classification_report(y_val, ada_predictions, target_names=target_names)
print(ada_classification_report)

#Save The AdaBoost Classifier
print("Saving AdaBoost Classifier")
print("="*60)

#Save the AdaBoost trained classifier
ada_model_filename = 'ada_model.pkl'
print("Saving AdaBoost classifier...")
joblib.dump(ada_classifier, ada_model_filename)

print(f"\nAdaBoost classifier saved as: {ada_model_filename}")

Creating AdaBoost classifier...
Training AdaBoost on existing TF-IDF features...

AdaBoost Classifier Results:
Validation accuracy: 0.9397

Detailed Classification Report - AdaBoost:
              precision    recall  f1-score   support

       Human       0.92      0.96      0.94       373
          AI       0.96      0.92      0.94       373

    accuracy                           0.94       746
   macro avg       0.94      0.94      0.94       746
weighted avg       0.94      0.94      0.94       746

Saving AdaBoost Classifier
Saving AdaBoost classifier...

AdaBoost classifier saved as: ada_model.pkl


**2. For Decision Tree classifier**

In [22]:
# Create and train the Decision Tree classifier
print("Creating Decision Tree classifier...")
tree_classifier = DecisionTreeClassifier(criterion='entropy', max_depth=None)

print("Training Decision Tree on existing TF-IDF features...")
tree_classifier.fit(X_train_tfidf, y_train)

# Test classifier performance
tree_predictions = tree_classifier.predict(X_val_tfidf)
tree_accuracy = metrics.accuracy_score(y_val, tree_predictions)

print(f"\nDecision Tree Results:")
print(f"Validation accuracy: {tree_accuracy:.4f}")

# Show detailed classification report
print("\nDetailed Classification Report - Decision Tree:")
target_names = ['Human', 'AI']
tree_classification_report = metrics.classification_report(y_val, tree_predictions, target_names=target_names)
print(tree_classification_report)

# Save The Decision Tree Classifier
print("Saving Decision Tree Classifier")
print("="*60)

# Save the Decision Tree trained classifier
tree_model_filename = 'tree_model.pkl'
print("Saving Decision Tree classifier...")
joblib.dump(tree_classifier, tree_model_filename)

print(f"\nDecision Tree classifier saved as: {tree_model_filename}")

Creating Decision Tree classifier...
Training Decision Tree on existing TF-IDF features...

Decision Tree Results:
Validation accuracy: 0.9169

Detailed Classification Report - Decision Tree:
              precision    recall  f1-score   support

       Human       0.92      0.91      0.92       373
          AI       0.91      0.92      0.92       373

    accuracy                           0.92       746
   macro avg       0.92      0.92      0.92       746
weighted avg       0.92      0.92      0.92       746

Saving Decision Tree Classifier
Saving Decision Tree classifier...

Decision Tree classifier saved as: tree_model.pkl


In [23]:
#Performance evaluation and comparisonsections that include accuracy metrics, confusion matrices, ROC curves, anddetailed analysis comparing the strengths and weaknesses of each model on your specific dataset.
