In [31]:
import pandas as pd
import warnings 
warnings.filterwarnings("ignore")

In [2]:
data=pd.read_csv("AI Generated Essays Dataset.csv")

In [3]:
data.head()

Unnamed: 0,text,generated
0,"Machine learning, a subset of artificial intel...",1
1,"A decision tree, a prominent machine learning ...",1
2,"Education, a cornerstone of societal progress,...",1
3,"Computers, the backbone of modern technology, ...",1
4,"Chess, a timeless game of strategy and intelle...",1


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   text       1460 non-null   object
 1   generated  1460 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 22.9+ KB


In [5]:
data.isnull().sum()

text         0
generated    0
dtype: int64

In [6]:
data["generated"].value_counts()

generated
0    1375
1      85
Name: count, dtype: int64

In [7]:
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

In [8]:
le=WordNetLemmatizer()

In [9]:
def data_preprocess(data):
    clean_data=re.sub("[^a-zA-Z]"," ",data)
    lower_data=clean_data.lower()
    tokenize=lower_data.split()
    lemitize_data=[le.lemmatize(word) for word in tokenize if word not in stopwords.words("english")]
    final_data=" ".join(lemitize_data)
    return final_data

In [10]:
data["text"][1]

'A decision tree, a prominent machine learning algorithm, structures decision-making in a tree-like model. Nodes represent decisions, branches signify outcomes, and leaves denote final decisions or predictions. The algorithm evaluates input features at each internal node, guiding the path to subsequent nodes until a leaf node is reached, yielding the final decision. This method accommodates both categorical and numerical data, offering interpretability through a visual representation. However, decision trees may suffer from overfitting, addressed through techniques like pruning and ensemble methods. Despite challenges, decision trees stand as versatile tools, finding applications in classification and regression tasks with a balance of simplicity and effectiveness.'

In [11]:
data["text"]=data["text"].apply(data_preprocess)

In [12]:
data["text"][1]

'decision tree prominent machine learning algorithm structure decision making tree like model node represent decision branch signify outcome leaf denote final decision prediction algorithm evaluates input feature internal node guiding path subsequent node leaf node reached yielding final decision method accommodates categorical numerical data offering interpretability visual representation however decision tree may suffer overfitting addressed technique like pruning ensemble method despite challenge decision tree stand versatile tool finding application classification regression task balance simplicity effectiveness'

In [13]:
x=data.iloc[:,:1]
y=data.iloc[:,1:]

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
train_x,test_x,train_y,test_y=train_test_split(x,y,test_size=0.2,random_state=42)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=1000,ngram_range=(1,2))

In [17]:
bow_train_x=cv.fit_transform(train_x["text"]).toarray()
bow_test_x=cv.fit_transform(test_x["text"]).toarray()

In [18]:
bow_train_x

array([[0, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [19]:
from imblearn.over_sampling import SMOTE
sampling=SMOTE()
bow_train_x,train_y=sampling.fit_resample(bow_train_x,train_y)

In [20]:
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB

In [21]:
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score,roc_auc_score

In [22]:
def get_metrics(actual,predicted):
    acc=accuracy_score(actual,predicted)
    cf=confusion_matrix(actual,predicted)
    pr=precision_score(actual,predicted)
    rc=recall_score(actual,predicted)
    f1=f1_score(actual,predicted)
    roc=roc_auc_score(actual,predicted)
    return acc,cf,pr,rc,f1,roc

In [28]:
models={
    "BernoulliNB":BernoulliNB(),
    "MultinomialNB":MultinomialNB(),
    "GaussianNB":GaussianNB()
}

In [32]:
for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(bow_train_x,train_y)

    train_predict=model.predict(bow_train_x)
    test_predict=model.predict(bow_test_x)

    train_acc,train_cf,train_pr,train_rc,train_f1,train_roc=get_metrics(train_y,train_predict)
    test_acc,test_cf,test_pr,test_rc,test_f1,test_roc=get_metrics(test_y,test_predict)

    print(list(models.keys())[i])
    print("---------------------------------------")
    print("training data:")
    print("accuracy score:",train_acc)
    print("confusion metrics:\n",train_cf)
    print("precision score:",train_pr)
    print("recall score:",train_rc)
    print("f1-score:",train_f1)
    print("roc auc score:",train_roc)
    print("---------------------------------------")
    print("testing data:")
    print("accuracy score:",test_acc)
    print("confusion metrics:\n",test_cf)
    print("precision score:",test_pr)
    print("recall score:",test_rc)
    print("f1-score:",test_f1)
    print("roc auc score:",test_roc)
    print("====================================================================")
    print("\n")

BernoulliNB
---------------------------------------
training data:
accuracy score: 0.99909338168631
confusion metrics:
 [[1103    0]
 [   2 1101]]
precision score: 1.0
recall score: 0.9981867633726201
f1-score: 0.9990925589836661
roc auc score: 0.9990933816863101
---------------------------------------
testing data:
accuracy score: 0.9897260273972602
confusion metrics:
 [[272   0]
 [  3  17]]
precision score: 1.0
recall score: 0.85
f1-score: 0.918918918918919
roc auc score: 0.925


MultinomialNB
---------------------------------------
training data:
accuracy score: 0.9138712601994561
confusion metrics:
 [[1103    0]
 [ 190  913]]
precision score: 1.0
recall score: 0.827742520398912
f1-score: 0.9057539682539683
roc auc score: 0.9138712601994561
---------------------------------------
testing data:
accuracy score: 0.934931506849315
confusion metrics:
 [[272   0]
 [ 19   1]]
precision score: 1.0
recall score: 0.05
f1-score: 0.09523809523809523
roc auc score: 0.525


GaussianNB
-----------