In [48]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer

# Import various classifiers
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier

### Reading the DataFrame

In [49]:
# Load the cleaned dataset; the path is relative to the notebook's working directory.
clean_avg = pd.read_csv('aidata_clean_avg.csv')
# Preview the first rows as a quick sanity check of the loaded columns.
clean_avg.head()

Unnamed: 0,Question,Human,AI,Question_length,Human_length,AI_length,Question_special_count,Human_special_count,AI_special_count,avg_special_char_diff
0,1) Write short note about Transition Probabili...,Transition Probability Matrix P is used in Mar...,A transition probability matrix is a mathemati...,56,388,629,2,46,9,29.333333
1,2) Write short note about Bayes' Rule,P (B|A) = P(A|B)P(B) / P(A)\n = P(A...,Bayes' Rule is a fundamental theorem in probab...,37,131,792,2,39,38,24.666667
2,3) What is the meaning of outcome in probability?,The outcome of probability is the possible res...,"In probability, an outcome refers to a specifi...",49,133,610,2,3,16,9.333333
3,4) How do we transform a process to a Markov c...,The state of the system at time t+1 depends on...,Transforming a process into a Markov chain inv...,51,203,1357,2,58,22,37.333333
4,5) Write short note about Continuous probabili...,A probability distribution in which the random...,A continuous probability distribution is a typ...,63,170,977,2,14,16,9.333333


### Preprocessing

In [34]:
def preprocess_text(text):
    """Normalize one response string before it is vectorized.

    The text is lower-cased, tokenized with NLTK's ``word_tokenize`` and the
    tokens are re-joined with single spaces. Punctuation and stop words are
    kept: neither is filtered out here.

    Args:
        text: The raw response. ``None`` and NaN are treated as an empty
            response; any other non-string value is converted with ``str()``.

    Returns:
        str: The lower-cased tokens separated by single spaces, or ``""``
        for a missing value.
    """
    if not isinstance(text, str):
        # A missing cell arrives as None or as a float NaN (NaN is the only
        # float that is not equal to itself). Neither has a .lower() method,
        # so map both to an empty document instead of raising.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)
    # No stop-word set is loaded here: stop words are not filtered, and
    # reading the NLTK corpus for every row would be wasted work.
    tokens = word_tokenize(text.lower())
    return " ".join(tokens)
def generic_preprocessing(df):
    """Reshape the paired answer table into one labelled row per response.

    Args:
        df: DataFrame holding at least the 'Question', 'Human' and 'AI'
            columns.

    Returns:
        pandas.DataFrame: The human answers (``Is_AI == 0``) followed by the
        AI answers (``Is_AI == 1``) under a fresh 0..n-1 index. The answer
        text is in 'Response', its normalized form in 'processed_text', and
        every other input column is carried over unchanged.

    Raises:
        KeyError: If 'Question', 'Human' or 'AI' is missing from ``df``.
    """
    def _one_side(answer_col, other_col, label):
        # Keep a single answer column, expose it as 'Response' and tag the source.
        side = df.drop(columns=['Question', other_col]).rename(columns={answer_col: 'Response'})
        side['Is_AI'] = label
        return side

    data = pd.concat(
        [_one_side('Human', 'AI', 0), _one_side('AI', 'Human', 1)],
        ignore_index=True,
    )
    # Normalize every response with the shared text preprocessing step.
    data['processed_text'] = data['Response'].apply(preprocess_text)
    return data


In [35]:
# Build the labelled, one-row-per-response dataset from the loaded table.
processed = generic_preprocessing(clean_avg)

In [36]:
# NOTE: this whole cell is a single string literal, so none of the code inside
# it runs and `vectorizer`, `X_text` and `X_vectorized` are not defined by it.
# NOTE(review): the "Shape of vectorized text data" output recorded for this
# cell presumably comes from a run made before the code was quoted out - confirm.
'''from sklearn.feature_extraction.text import CountVectorizer

# Create an instance of CountVectorizer
vectorizer = CountVectorizer()

# Assume processed['processed_text'] is a pandas Series containing your text data
X_text = processed['processed_text']

# Fit the vectorizer to the text and transform it into token count vectors
X_vectorized = vectorizer.fit_transform(X_text)

# X_vectorized is now a sparse matrix of shape (n_samples, n_features)
print("Shape of vectorized text data:", X_vectorized.shape)'''

Shape of vectorized text data: (3986, 5891)


### Train Test Split

In [42]:
# 80% 20% split of data
# Features are the normalized response text; the target is the human/AI flag.
X, y = processed['processed_text'], processed['Is_AI']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("Training set shape:", X_train.shape, y_train.shape)
print("Testing set shape:", X_test.shape, y_test.shape)

Training set shape: (3188,) (3188,)
Testing set shape: (798,) (798,)


### Model

In [51]:

# Seed shared by every estimator that accepts `random_state`, so that a fresh
# run of this cell reproduces the same reports.
RANDOM_STATE = 42

# Candidate classifiers, keyed by the display name used in the printed reports.
# MultinomialNB and KNeighborsClassifier take no `random_state` parameter.
# NOTE: 'MLPClassifier' only spells out MLPClassifier's default
# hidden_layer_sizes=(100,), so it is the same model as 'Neural Network';
# both keys are kept so the set of printed reports is unchanged.
classifiers = {
    'Logistic Regression': LogisticRegression(solver='liblinear', random_state=RANDOM_STATE),
    'Naive Bayes': MultinomialNB(),
    'SVM': LinearSVC(random_state=RANDOM_STATE),
    'Decision Tree': DecisionTreeClassifier(random_state=RANDOM_STATE),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE),
    'Gradient Boosting': GradientBoostingClassifier(random_state=RANDOM_STATE),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Neural Network': MLPClassifier(max_iter=300, random_state=RANDOM_STATE),
    'MLPClassifier': MLPClassifier(hidden_layer_sizes=(100,), max_iter=300, random_state=RANDOM_STATE)
}

# Train and evaluate every classifier on the same bag-of-words features.
for name, clf in classifiers.items():
    # The vectorizer lives inside the pipeline, so its vocabulary is learned
    # from the training split only and merely applied to the test split.
    pipeline = Pipeline([
        ('vect', CountVectorizer()),  # Tokenize and vectorize text
        ('clf', clf)                  # classifier
    ])

    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # Predict on both splits: the gap between the two reports shows how much
    # the model overfits the training data.
    predictions = pipeline.predict(X_test)
    train_pred = pipeline.predict(X_train)

    # Generate and print the classification reports for the current model
    report = classification_report(y_test, predictions)
    train_report = classification_report(y_train, train_pred)
    print("="*40)
    print(f"{name} Classification Report:")
    print("-"*10)
    print("Test Report")
    print(report)
    print("-"*10)
    print("Train Report")
    print(train_report)


Logistic Regression Classification Report:
----------
Test Report
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       416
           1       0.86      0.86      0.86       382

    accuracy                           0.86       798
   macro avg       0.86      0.86      0.86       798
weighted avg       0.86      0.86      0.86       798

----------
Train Report
              precision    recall  f1-score   support

           0       0.97      0.99      0.98      1577
           1       0.99      0.97      0.98      1611

    accuracy                           0.98      3188
   macro avg       0.98      0.98      0.98      3188
weighted avg       0.98      0.98      0.98      3188

Naive Bayes Classification Report:
----------
Test Report
              precision    recall  f1-score   support

           0       0.92      0.73      0.81       416
           1       0.76      0.93      0.84       382

    accuracy                      



SVM Classification Report:
----------
Test Report
              precision    recall  f1-score   support

           0       0.85      0.83      0.84       416
           1       0.82      0.85      0.83       382

    accuracy                           0.84       798
   macro avg       0.84      0.84      0.84       798
weighted avg       0.84      0.84      0.84       798

----------
Train Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1577
           1       1.00      1.00      1.00      1611

    accuracy                           1.00      3188
   macro avg       1.00      1.00      1.00      3188
weighted avg       1.00      1.00      1.00      3188

Decision Tree Classification Report:
----------
Test Report
              precision    recall  f1-score   support

           0       0.77      0.73      0.75       416
           1       0.72      0.76      0.74       382

    accuracy                           0.74     

In [None]:
## Using TensorFlow & BERT (to be explored)
# NOTE: everything below is one string literal, so this cell executes nothing.
# The quoted draft builds a Keras Embedding -> GlobalAveragePooling1D -> Dense
# model; it contains no BERT code.
# NOTE(review): the draft reads `X_text` and `y_encoded`, which no live code in
# the visible notebook defines (its numbered steps start at 3, so steps 1-2
# appear to be missing) - define them before enabling it.
'''
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# --- 3. Tokenize and Pad Sequences ---
# You can adjust num_words to control the vocabulary size.
num_words = 5000
tokenizer = Tokenizer(num_words=num_words, lower=True)
tokenizer.fit_on_texts(X_text)
X_seq = tokenizer.texts_to_sequences(X_text)

# Determine the maximum sequence length (or specify a fixed value)
max_length = max(len(seq) for seq in X_seq)
X_padded = pad_sequences(X_seq, maxlen=max_length, padding='post')

# --- 4. Train-Test Split (80/20) ---
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_encoded, test_size=0.2, random_state=42)

# --- 5. Build the Keras Model ---
# The Embedding layer converts integer sequences into dense vectors.
# GlobalAveragePooling1D is used to aggregate the sequence information.
num_classes = len(np.unique(y_encoded))

model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=50, input_length=max_length))
model.add(GlobalAveragePooling1D())
model.add(Dense(10, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# --- 6. Train the Model ---
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.1)

# --- 7. Evaluate the Model ---
loss, accuracy = model.evaluate(X_test, y_test)
print("Test Accuracy (Keras Model):", accuracy)

'''