In [55]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.pipeline import Pipeline

import string

import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords


In [2]:
data = pd.read_csv('AI_Human.csv')
print(data.head())
print(data.tail())

                                                text  generated
0  Cars. Cars have been around since they became ...        0.0
1  Transportation is a large necessity in most co...        0.0
2  "America's love affair with it's vehicles seem...        0.0
3  How often do you ride in a car? Do you drive a...        0.0
4  Cars are a wonderful thing. They are perhaps o...        0.0
                                                     text  generated
487230  Tie Face on Mars is really just a big misunder...        0.0
487231  The whole purpose of democracy is to create a ...        0.0
487232  I firmly believe that governments worldwide sh...        1.0
487233  I DFN't agree with this decision because a LFT...        0.0
487234  Richard Non, Jimmy Carter, and Bob Dole and ot...        0.0


In [3]:
data = data[:10000]
data.tail()

Unnamed: 0,text,generated
9995,This argument of the face on Mars been going o...,0.0
9996,"What happen on Mars. In May 24, 2001. One of o...",0.0
9997,"The face on Mars wasn't created by aliens, in ...",0.0
9998,"40 years ago, on mars, NASA caught a picture o...",0.0
9999,The landform on Mars is very similar to the me...,0.0


In [4]:
print((data['generated']).count())
print((data['generated'] == 0.0).sum())
print((data['generated'] == 1.0).sum())
print(data['text'][1])

10000
8964
1036
Transportation is a large necessity in most countries worldwide. With no doubt, cars, buses, and other means of transportation make going from place to place easier and faster. However there's always a negative pollution. Although mobile transportation are a huge part of daily lives, we are endangering the Earth with harmful greenhouse gases, which could be suppressed.

A small suburb community in Germany called Vauban, has started a "carfree" lifestyle. In this city, markets and stores are placed nearby homes, instead of being located by farend highways. Although Vauban is not completely carfree, 70% of Vauban families do not own cars Even a large 57% of families stated to have sold their cars to move to Vauban. Some families have even said to be less stressed depending on car transportation. Cars are responsible for about 12% of greenhouse gases, and can even be up to 50% in some carintensive areas in the United States.

Another insight to reduced car zones brings Par

In [36]:
# Data Preprocessing
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# def preprocessText(text):
#     text = ' '.join(text)
#     text = text.lower()
#     text = [word for word in text if word not in string.punctuation]
#     text = ''.join(text)
#     tokens = [token for token in text if token not in stop_words]
#     tokens = word_tokenize(text)
#     print(tokens)
#     # lemmatizedTokens = [lemmatizer.lemmatize(word) for word in text if word not in stop_words]
#     preprocessedText = ' '.join(tokens)
#     return preprocessedText

def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

In [6]:
print(stop_words)
print(string.punctuation)

{'haven', 'can', 'them', 'himself', 'so', 'other', 'than', 'where', 'hers', "hasn't", 'needn', 'itself', "shan't", "wouldn't", 'your', 'myself', 'very', 'themselves', 'an', 'below', 'ain', 'for', 'will', 'before', 'too', 'here', 'herself', 'being', 'theirs', 'yourself', 'did', 'same', 'was', 'wasn', 'been', 'ours', 'both', "aren't", 'when', 'after', "it's", 'that', 'about', 'but', 'do', 'shouldn', 'once', 'didn', 'further', "you'll", "that'll", 'it', 'the', 'and', 'down', 'each', 'which', 'only', 'don', 'won', 'his', 'these', 'have', 'at', 'should', 'there', 'has', 'off', 'm', 'yourselves', 'on', 'any', 'mightn', "wasn't", 'i', 'just', 'my', "you'd", 'this', 'couldn', 'again', 'not', 'out', 's', 'her', 'against', 'how', "you're", 'ma', 'until', 'yours', 'of', 'we', 'all', "she's", 'now', 'a', 'its', 'him', 'few', 'their', 've', 'then', 'most', 'am', 'nor', 'above', 'll', 'those', 'you', 'through', "haven't", 'd', 'she', 'why', 'aren', 'to', "should've", 'had', 'as', 'were', "won't", 'r

In [38]:
data['Preprocessed Text'] = data['text'].apply(preprocess_text)

In [39]:
data['Preprocessed Text'][0]

'cars cars around since became famous 1900s henry ford created built first modelt cars played major role every day lives since people starting question limiting car usage would good thing limiting use cars might good thing like matter article german suburb life goes without cars elizabeth rosenthal states automobiles linchpin suburbs middle class families either shanghai chicago tend make homes experts say huge impediment current efforts reduce greenhouse gas emissions tailpipe passenger cars responsible 12 percent greenhouse gas emissions europeand 50 percent carintensive areas united states cars main reason greenhouse gas emissions lot people driving around time getting need go article paris bans driving due smog robert duffer says paris days nearrecord pollution enforced partial driving ban clear air global city also says monday motorist evennumbered license plates ordered leave cars home fined 22euro fine 31 order would applied oddnumbered plates following day cars reason polluting

In [40]:
def pos_features(text):
    tokens = word_tokenize(text)
    pos_tags = [tag for word, tag in pos_tag(tokens)]
    return pos_tags

data['pos_features'] = data['Preprocessed Text'].apply(pos_features)

In [41]:
vectorizer = CountVectorizer()
X_word_freq = vectorizer.fit_transform(data['Preprocessed Text'])
word_freq_feature_names = vectorizer.get_feature_names_out()

In [42]:
def extract_ngrams(text, n):
    tokens = word_tokenize(text)
    ngrams_list = list(ngrams(tokens, n))
    return [' '.join(gram) for gram in ngrams_list]

data['ngrams'] = data['Preprocessed Text'].apply(lambda x: extract_ngrams(x, 2))

In [49]:
X_pos = nltk.pos_tag_sents(data['Preprocessed Text'].apply(word_tokenize))
# X_combined = []
# for i in range(len(data)):
#     combined_features = list(data['pos_features'][i]) + list(data['ngrams'][i])
#     X_combined.append(combined_features)

X_combined = data[['pos_features', 'ngrams']]

In [120]:
print(X_combined)

                                           pos_features  \
0     [NNS, NNS, IN, IN, VBD, JJ, CD, NN, NN, VBD, V...   
1     [NN, JJ, NN, NNS, VBP, NN, NNS, VBZ, VBZ, NN, ...   
2     [RB, VB, NN, NNS, VBZ, VBG, VBZ, NN, NN, NN, N...   
3     [RB, JJ, NN, NN, CD, NN, NN, NN, NN, NN, RB, V...   
4     [NNS, JJ, NN, RB, CD, NNS, JJS, NNS, NNS, NNS,...   
...                                                 ...   
9995  [NN, NN, VBZ, VBG, IN, CD, JJ, NNS, RB, JJS, V...   
9996  [JJ, NNS, MD, CD, CD, NN, NN, VBD, JJ, NNS, JJ...   
9997  [NN, NNS, VBP, VBN, NNS, NN, NN, JJ, NN, NN, N...   
9998  [CD, NNS, RB, NNS, RB, VBD, NN, JJ, NN, NN, VB...   
9999  [NN, NNS, JJ, NNS, NNS, RB, VBP, JJS, NN, NNS,...   

                                                 ngrams  
0     [cars cars, cars around, around since, since b...  
1     [transportation large, large necessity, necess...  
2     [americas love, love affair, affair vehicles, ...  
3     [often ride, ride car, car drive, drive one, o...  
4

In [64]:
X = data['Preprocessed Text']
y = data['generated']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [75]:
print(len(X_train))
print(len(y_train))

8000
8000


In [66]:
pipeline = Pipeline([
    ('count_vectorizer', CountVectorizer()), 
    ('tfidf_transformer', TfidfTransformer()),  
    ('naive_bayes', MultinomialNB())])

pipeline.fit(X_train, y_train)

In [68]:
y_pred= pipeline.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         0.0       0.97      1.00      0.99      1795
         1.0       1.00      0.76      0.86       205

    accuracy                           0.98      2000
   macro avg       0.99      0.88      0.93      2000
weighted avg       0.98      0.98      0.97      2000



In [70]:
X_pos_str = [[' '.join(tag) for tag in sentence] for sentence in X_pos]

X_combined_text = [' '.join(pos_features) + ' ' + ' '.join(ngrams) for pos_features, ngrams in zip(X_pos_str, data['ngrams'])]

In [84]:
X_combined_vectorized = vectorizer.fit_transform(X_combined)

In [92]:
# Using Feature Extraction
X_flat = [' '.join(row['pos_features']) + ' ' + ' '.join(row['ngrams']) for index, row in X_combined.iterrows()]
X_flat_vectorized = vectorizer.fit_transform(X_flat)

In [93]:
X_train, X_test, y_train, y_test = train_test_split(X_flat_vectorized, y, test_size=0.2, random_state=42)
model = MultinomialNB()

model.fit(X_train, y_train)
accuracy = model.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.983


In [100]:
print("Shape of combinedFeaturesVectorizer:", combinedFeaturesVectorizer.shape)


Shape of combinedFeaturesVectorizer: (1, 171)


In [107]:
testText = data['Preprocessed Text'][145]
# posTags = []
# ngramsList = []
# posTags = pos_features(testText)
# ngramsList = extract_ngrams(testText,2)
# combinedFeatures = posTags + ngramsList
# print(combinedFeatures)
# combinedFeaturesVectorizer = vectorizer.transform([' '.join(combinedFeatures)])
# predictedLabel = model.predict(combinedFeaturesVectorizer)
# print(f'Predicted Label:{predictedLabel[0]}')

pos_tags = [tag for _, tag in pos_tag(word_tokenize(testText))]

tokens = word_tokenize(testText)
bi_grams = list(ngrams(tokens, 2))  
tri_grams = list(ngrams(tokens, 3))  

bi_grams_str = [' '.join(gram) for gram in bi_grams]
tri_grams_str = [' '.join(gram) for gram in tri_grams]

combined_features = pos_tags + bi_grams_str + tri_grams_str
test_sample_vectorized = vectorizer.transform([' '.join(combined_features)])
prediction = model.predict(test_sample_vectorized)
print("Predicted label:", prediction)

ValueError: X has 171 features, but MultinomialNB is expecting 50656 features as input.

In [108]:
print("Shape of test_sample_vectorized:", test_sample_vectorized.shape)

Shape of test_sample_vectorized: (1, 171)


In [86]:
print("Shape of X_combined:", len(X_combined))
print("Shape of y:", len(y))

# Print the contents of X_combined and y
print("Contents of X_combined:", X_combined)
print("Contents of y:", y)

Shape of X_combined: 10000
Shape of y: 10000
Contents of X_combined:                                            pos_features  \
0     [NNS, NNS, IN, IN, VBD, JJ, CD, NN, NN, VBD, V...   
1     [NN, JJ, NN, NNS, VBP, NN, NNS, VBZ, VBZ, NN, ...   
2     [RB, VB, NN, NNS, VBZ, VBG, VBZ, NN, NN, NN, N...   
3     [RB, JJ, NN, NN, CD, NN, NN, NN, NN, NN, RB, V...   
4     [NNS, JJ, NN, RB, CD, NNS, JJS, NNS, NNS, NNS,...   
...                                                 ...   
9995  [NN, NN, VBZ, VBG, IN, CD, JJ, NNS, RB, JJS, V...   
9996  [JJ, NNS, MD, CD, CD, NN, NN, VBD, JJ, NNS, JJ...   
9997  [NN, NNS, VBP, VBN, NNS, NN, NN, JJ, NN, NN, N...   
9998  [CD, NNS, RB, NNS, RB, VBD, NN, JJ, NN, NN, VB...   
9999  [NN, NNS, JJ, NNS, NNS, RB, VBP, JJS, NN, NNS,...   

                                                 ngrams  
0     [cars cars, cars around, around since, since b...  
1     [transportation large, large necessity, necess...  
2     [americas love, love affair, affair vehicl