In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score

In [2]:
def train_and_evaluate(texts, labels, representation_methods):
    X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)
    
    for name, vectorizer in representation_methods.items():
        model = make_pipeline(vectorizer, MultinomialNB())
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print(f"{name} Accuracy: {accuracy:.4f}")
        
    return model

In [3]:
def predict_text(model, text):
    prediction = model.predict([text])[0]
    prediction_prob = model.predict_proba([text])[0]
    confidence = prediction_prob[prediction]
    status = "AI-written" if prediction == 0 else "Human-written"
    print(f"Prediction: {status}, Confidence: {confidence:.4f}")

In [4]:
#training
mistral_df = pd.read_csv("mistral_data.csv")
texts = mistral_df['Text']
labels = mistral_df['LABEL']

In [5]:
representation_methods = {'Bag of Words': CountVectorizer()}
bow_model = train_and_evaluate(texts, labels, representation_methods)

Bag of Words Accuracy: 0.9833


In [6]:
representation_methods = {'TF-IDF': TfidfVectorizer()}
tfidf_model = train_and_evaluate(texts, labels, representation_methods)

TF-IDF Accuracy: 0.9333


In [7]:
representation_methods = {'N-grams': CountVectorizer(ngram_range=(1, 2))}
ngram_model = train_and_evaluate(texts, labels, representation_methods)

N-grams Accuracy: 0.9833


In [8]:
#test
sample_df = pd.read_csv('Generated_Data.csv')
test_texts = sample_df['Text']
test_labels = sample_df['LABEL']

In [9]:
bow_prediction = bow_model.predict(test_texts)
np.sum(bow_prediction[bow_prediction == test_labels])

30

In [10]:
tfidf_prediction = tfidf_model.predict(test_texts)
np.sum(tfidf_prediction[tfidf_prediction == test_labels])

30

In [11]:
ngram_prediction = ngram_model.predict(test_texts)
np.sum(ngram_prediction[ngram_prediction == test_labels])

30

## SVM

In [12]:
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [13]:
def train_svm_model(texts, labels, kernel='rbf'):
    model = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('svm', SVC(kernel=kernel, probability=True))
    ])

    X_train, X_test, y_train, y_test = train_test_split(texts, labels, test_size=0.2, random_state=42)

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(classification_report(y_test, predictions))
    
    return model

In [14]:
svm_model = train_svm_model(texts, labels)

              precision    recall  f1-score   support

           0       0.99      1.00      0.99        66
           1       1.00      0.98      0.99        54

    accuracy                           0.99       120
   macro avg       0.99      0.99      0.99       120
weighted avg       0.99      0.99      0.99       120



In [15]:
svm_prediction = svm_model.predict(test_texts)
np.sum(svm_prediction[svm_prediction == test_labels])

30

## Random Forest

In [16]:
import spacy
import nltk
from textblob import TextBlob
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm 

In [17]:
nlp = spacy.load("en_core_web_sm")

def extract_features(doc):
    spacy_doc = nlp(doc)
    blob = TextBlob(doc)

    avg_sent_len = np.mean([len(sent) for sent in spacy_doc.sents])
    sentiment_polarity = blob.sentiment.polarity
    sentiment_subjectivity = blob.sentiment.subjectivity
    lexical_diversity = len(set([token.text for token in spacy_doc])) / len(spacy_doc)
    noun_phrases_count = len(blob.noun_phrases)

    return pd.Series({
        'avg_sent_len': avg_sent_len,
        'sentiment_polarity': sentiment_polarity,
        'sentiment_subjectivity': sentiment_subjectivity,
        'lexical_diversity': lexical_diversity,
        'noun_phrases_count': noun_phrases_count
    })

def prepare_dataset(texts, labels):
    features_list = []  

    for text in tqdm(texts):
        features = extract_features(str(text))
        features_list.append(features)

    feature_df = pd.DataFrame(features_list)

    return feature_df, np.array(labels)

In [18]:
x, y = prepare_dataset(texts, labels)

100%|███████████████████████████████████████████████████████████████████████| 600/600 [00:36<00:00, 16.65it/s]


In [19]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.70      0.78        66
           1       0.71      0.89      0.79        54

    accuracy                           0.78       120
   macro avg       0.80      0.79      0.78       120
weighted avg       0.80      0.78      0.78       120



In [20]:
x_test, y_test = prepare_dataset(test_texts, test_labels)
rf_prediction = rf_model.predict(x_test)
np.sum(rf_prediction[rf_prediction == y_test])

100%|█████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 17.24it/s]


29

## PCA

In [21]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

In [22]:
features = ['avg_sent_len', 'sentiment_polarity', 'sentiment_subjectivity', 'lexical_diversity', 'noun_phrases_count']
x_pca = x.loc[:, features].values
x_pca = StandardScaler().fit_transform(x_pca)
y_pca = pd.Series(y)

In [23]:
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(x_pca)
principalDf = pd.DataFrame(data = principalComponents, columns = ['PC1', 'PC2'])
finalDf = pd.concat([principalDf, y_pca], axis = 1)

In [24]:
from sklearn.neighbors import KNeighborsClassifier

x_knn = finalDf[['PC1', 'PC2']]  
y_knn = finalDf[0]  
X_train, X_test, y_train, y_test = train_test_split(x_knn, y_knn, test_size = 0.2, random_state = 42)

knn = KNeighborsClassifier(n_neighbors = 16)
knn.fit(X_train, y_train)

In [25]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_pred = knn.predict(X_test)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(f"Accuracy: {accuracy_score(y_test, y_pred) * 100:.2f}%")

[[44 22]
 [20 34]]
              precision    recall  f1-score   support

           0       0.69      0.67      0.68        66
           1       0.61      0.63      0.62        54

    accuracy                           0.65       120
   macro avg       0.65      0.65      0.65       120
weighted avg       0.65      0.65      0.65       120

Accuracy: 65.00%


In [26]:
def predict_cluster(text, label, pca, knn_model, scaler):
    x, y = prepare_dataset([text], [label])
    x_scaled = scaler.fit_transform(x)
    p = pd.DataFrame(data = pca.transform(x_scaled), columns = ['PC1', 'PC2'])
    prediction = knn_model.predict(p)
    probabilities = knn_model.predict_proba(p)

    #confidence = np.max(probabilities) * 100  
    
    return prediction, probabilities 

In [27]:
x_test, y_test = prepare_dataset(test_texts, test_labels)
principalComponents = pca.fit_transform(x_test)
principalDf = pd.DataFrame(data = principalComponents, columns = ['PC1', 'PC2'])
prediction = knn.predict(principalDf)

100%|█████████████████████████████████████████████████████████████████████████| 60/60 [00:03<00:00, 17.32it/s]


In [28]:
np.sum(prediction[prediction == y_test])

12

## Different human text

In [29]:
import re

def clean_text(text):
    return re.sub(r'\n+', ' ', text)

In [31]:
persuade_df = pd.read_csv("persuade.csv")
filtered_df = persuade_df['full_text'].iloc[700:800]
filtered_df = filtered_df.apply(clean_text)
filtered_df_label = np.ones(shape = [100])

In [32]:
filtered_df

700    In today's modern society almost every person ...
701    Today we as humans live in a society that is c...
702    Texting and driving has become a huge problem ...
703    Proper Driver Cellphone Usage The question of ...
704    Responsible Drivers The share of American adul...
                             ...                        
795    I think while driving you should not be able t...
796    Phones and driving Everyday people die in car ...
797    Texting & driving Should drivers be able to us...
798    Phones and Driving Cell phones have become ver...
799    Distracted Driving Distracted driving has beco...
Name: full_text, Length: 100, dtype: object

In [33]:
bow_prediction = bow_model.predict(filtered_df)
np.sum(bow_prediction[bow_prediction == filtered_df_label])

97

In [34]:
tfidf_prediction = tfidf_model.predict(filtered_df)
np.sum(tfidf_prediction[tfidf_prediction == filtered_df_label])

99

In [35]:
ngram_prediction = ngram_model.predict(filtered_df)
np.sum(ngram_prediction[ngram_prediction == filtered_df_label])

99

In [36]:
svm_prediction = svm_model.predict(filtered_df)
np.sum(svm_prediction[svm_prediction == filtered_df_label])

91

In [37]:
x_test, y_test = prepare_dataset(filtered_df, filtered_df_label)
rf_prediction = rf_model.predict(x_test)
np.sum(rf_prediction[rf_prediction == y_test])

100%|███████████████████████████████████████████████████████████████████████| 100/100 [00:08<00:00, 12.30it/s]


80

In [38]:
x_test, y_test = prepare_dataset(filtered_df, filtered_df_label)
principalComponents = pca.fit_transform(x_test)
principalDf = pd.DataFrame(data = principalComponents, columns = ['PC1', 'PC2'])
prediction = knn.predict(principalDf)
np.sum(prediction[prediction == y_test])

100%|███████████████████████████████████████████████████████████████████████| 100/100 [00:07<00:00, 12.51it/s]


36

## ChatGPT

In [40]:
gpt_df = pd.read_csv("chat_essay.csv")
gpt_df = gpt_df.sample(frac = 1)
gpt_df = gpt_df['Text'].iloc[0:100]
gpt_df = gpt_df.apply(clean_text)
gpt_df_label = np.zeros(shape = [100], dtype = int)

In [41]:
gpt_df

290     I believe that summer projects for students sh...
1487    Seeking multiple opinions can help someone mak...
197     Limiting car usage can provide numerous benefi...
1497    Seeking multiple opinions can help someone mak...
1247    Dear Principal, I am writing to express my opi...
                              ...                        
783     Introduction: The Face on Mars has long been a...
1142    Dear Senator, I am writing to express my suppo...
1053    Introduction: Driverless cars have been a topi...
1386    I believe that students would benefit from bei...
864     Dear Principal, I am writing to express my sup...
Name: Text, Length: 100, dtype: object

In [42]:
bow_prediction = bow_model.predict(gpt_df)
np.sum(bow_prediction == gpt_df_label)

100

In [43]:
tfidf_prediction = tfidf_model.predict(gpt_df)
np.sum(tfidf_prediction == gpt_df_label)

100

In [44]:
ngram_prediction = ngram_model.predict(gpt_df)
np.sum(ngram_prediction == gpt_df_label)

100

In [45]:
svm_prediction = svm_model.predict(gpt_df)
np.sum(svm_prediction == gpt_df_label)

100

In [46]:
x_test, y_test = prepare_dataset(gpt_df, gpt_df_label)
rf_prediction = rf_model.predict(x_test)
np.sum(rf_prediction == gpt_df_label)

100%|███████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 25.27it/s]


86

In [47]:
x_test, y_test = prepare_dataset(gpt_df, gpt_df_label)
principalComponents = pca.fit_transform(x_test)
principalDf = pd.DataFrame(data = principalComponents, columns = ['PC1', 'PC2'])
prediction = knn.predict(principalDf)
np.sum(prediction == gpt_df_label)

100%|███████████████████████████████████████████████████████████████████████| 100/100 [00:03<00:00, 25.50it/s]


60

# GPT and Human

In [48]:
filtered_human = pd.DataFrame()
persuade_df = pd.read_csv("persuade.csv")
sample_ = persuade_df['full_text'].iloc[700:800]
sample_ = sample_.apply(clean_text)
filtered_human['TEXT'] = sample_
filtered_human['LABEL'] = 1

filtered_gpt = pd.DataFrame()
gpt_df = pd.read_csv("chat_essay.csv")
gpt_df = gpt_df.sample(frac = 1)
sample_ = gpt_df['Text'].iloc[0:100]
sample_ = sample_.apply(clean_text)
filtered_gpt['TEXT'] = sample_
filtered_gpt['LABEL'] = 0

gpt_human = pd.concat([filtered_human, filtered_gpt], ignore_index=True)
gpt_human = gpt_human.sample(frac = 1)

In [49]:
gpt_human

Unnamed: 0,TEXT,LABEL
52,Do you text while driving? Do you text while d...,1
68,Phones & Driving Using cell phones while drivi...,1
84,Everyday fatalities happen from the use cell p...,1
140,Introduction: The Face on Mars has long been a...,0
98,Phones and Driving Cell phones have become ver...,1
...,...,...
94,Phones & Driving Essay I feel like drivers sho...,1
184,The use of the Facial Action Coding System (FA...,0
59,Should or Shouldn't drivers use devices while ...,1
23,Everyday people die in car accidents because t...,1


In [50]:
bow_prediction = bow_model.predict(gpt_human['TEXT'])
np.sum(bow_prediction == gpt_human['LABEL'])

197

In [51]:
tfidf_prediction = tfidf_model.predict(gpt_human['TEXT'])
np.sum(tfidf_prediction == gpt_human['LABEL'])

195

In [52]:
ngram_prediction = ngram_model.predict(gpt_human['TEXT'])
np.sum(ngram_prediction == gpt_human['LABEL'])

199

## Paraphrased by AI

In [53]:
filtered_human['TEXT'].iloc[2]

'Texting and driving has become a huge problem throughout the world teens and adults alike. Most people when think it is no big deal and that they can do it as long as they get caught. They think well I am a good driver I won\'t crash until it actually happens. First things first, people adults and teens should not under any condition be texting and driving. There is no excuse or any solid reason you should be doing it. We all think it isn\'t like most people do it. People think its rare and not a big deal. Suprise suprise according to the article,"Texting and Driving Accident Statistics" the editor states,"In fact, at any given time throughout the day, approximately 660,000 drivers are attempting to use their phones while behind the wheel of an automobile". This is a very startling fact. By the author saying this it means that at any given time 660,000 or more people are putting the lives of the people in the car and those around them in great danger. By the same token, texting and dr

In [54]:
txt = "Texting while driving has emerged as a significant issue globally, affecting both teenagers and adults alike. Many individuals perceive it as inconsequential and believe they can engage in it without consequences until they are caught. They often hold the misguided belief that their driving skills are sufficient to prevent accidents, until a collision occurs. Firstly, it is imperative that both adults and teens refrain from texting while driving under any circumstances. There is simply no excuse or valid reason to engage in such behavior. Despite common misconceptions that texting while driving is uncommon or harmless, statistics reveal a shocking reality. According to the article 'Texting and Driving Accident Statistics,' approximately 660,000 drivers attempt to use their phones while driving at any given time, posing a grave danger to themselves and others. Furthermore, texting while driving claims numerous lives worldwide on a daily basis. Despite the assumption that responsible and cautious drivers are immune to accidents while texting, the reality is grim. The National Safety Council reports that cell phone use while driving contributes to 1.6 million crashes annually, underscoring the severity of the issue. Alarmingly, texting while driving is six times more likely to cause an accident than driving under the influence of alcohol, highlighting the urgent need to address this problem. Efforts to mitigate texting while driving include technological features in some vehicles, such as hands-free calling and text messaging capabilities, as well as systems that prevent the vehicle from starting if a cell phone is in use. However, there are also simple steps individuals can take to avoid texting while driving, such as keeping the phone out of reach and making a commitment to refrain from using it while behind the wheel. It is imperative to tackle this issue head-on, as allowing it to persist will only result in more lives lost. In conclusion, the evidence presented here clearly indicates the detrimental impact of texting while driving. We must educate both young people and adults about the risks associated with this behavior. Before succumbing to the temptation to text and drive, individuals should consider whether sending a message is worth risking someone's life."

In [55]:
bow_model.predict([filtered_human['TEXT'].iloc[2]])

array([1])

In [56]:
bow_model.predict([txt])

array([0])

In [57]:
tfidf_model.predict([txt])

array([0])

In [58]:
ngram_model.predict([txt])

array([0])

## Competition

In [61]:
comp_df = pd.read_csv("train_essays_comp.csv")
comp_df_human = comp_df.query('generated == 0')

In [62]:
comp_df_human['generated'] = 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comp_df_human['generated'] = 1


In [63]:
comp_df_human

Unnamed: 0,id,prompt_id,text,generated
0,0059830c,0,Cars. Cars have been around since they became ...,1
1,005db917,0,Transportation is a large necessity in most co...,1
2,008f63e3,0,"""America's love affair with it's vehicles seem...",1
3,00940276,0,How often do you ride in a car? Do you drive a...,1
4,00c39458,0,Cars are a wonderful thing. They are perhaps o...,1
...,...,...,...,...
1373,fe6ff9a5,1,There has been a fuss about the Elector Colleg...,1
1374,ff669174,0,Limiting car usage has many advantages. Such a...,1
1375,ffa247e0,0,There's a new trend that has been developing f...,1
1376,ffc237e9,0,As we all know cars are a big part of our soci...,1


In [64]:
bow_prediction = bow_model.predict(comp_df_human['text'])
np.sum(bow_prediction == comp_df_human['generated'])

1358

## DGAIT

In [65]:
# 0 for human 
dgait_df = pd.read_csv("daigt_train.csv")
dgait_df.head()

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False


In [71]:
dgait_df_human = dgait_df.query('label == 0')
dgait_df_ai = dgait_df.query('label == 1')

In [73]:
dgait_df_human

Unnamed: 0,text,label,prompt_name,source,RDizzl3_seven
0,Phones\n\nModern humans today are always on th...,0,Phones and driving,persuade_corpus,False
1,This essay will explain if drivers should or s...,0,Phones and driving,persuade_corpus,False
2,Driving while the use of cellular devices\n\nT...,0,Phones and driving,persuade_corpus,False
3,Phones & Driving\n\nDrivers should not be able...,0,Phones and driving,persuade_corpus,False
4,Cell Phone Operation While Driving\n\nThe abil...,0,Phones and driving,persuade_corpus,False
...,...,...,...,...,...
43479,There has been a fuss about the Elector Colleg...,0,Does the electoral college work?,train_essays,True
43480,Limiting car usage has many advantages. Such a...,0,Car-free cities,train_essays,True
43481,There's a new trend that has been developing f...,0,Car-free cities,train_essays,True
43482,As we all know cars are a big part of our soci...,0,Car-free cities,train_essays,True


In [95]:
len(dgait_df_ai)

17497

In [99]:
len(dgait_df_ai.unique())

17497

In [74]:
dgait_df_human = dgait_df_human['text']
dgait_df_ai = dgait_df_ai['text']

In [75]:
dgait_df_human = dgait_df_human.apply(clean_text)
dgait_df_ai = dgait_df_ai.apply(clean_text)

In [79]:
# gpt_df_label = np.zeros(shape = [100], dtype = int)
dgait_df_human_label  = np.ones(shape = [len(dgait_df_human)], dtype = int)
dgait_df_ai_label  = np.zeros(shape = [len(dgait_df_ai)], dtype = int)

In [82]:
bow_prediction = bow_model.predict(dgait_df_human)
np.sum(bow_prediction == dgait_df_human_label)

27086

In [83]:
len(dgait_df_human)

27371

In [84]:
len(dgait_df_ai)

17497

In [96]:
bow_prediction = bow_model.predict(dgait_df_ai)
np.sum(bow_prediction == dgait_df_ai_label)

11672