In [1]:
import os
# Show the kernel's current working directory; any relative path used in
# the notebook is resolved against this location.
print(os.getcwd())

/home/vivek/code/nghia95/fake-data-detector/notebooks


Initial Data Load and Check 

In [2]:
from pathlib import Path

import pandas as pd

# Locate the dataset relative to the notebook's working directory (the
# repository's `notebooks/` folder) instead of an absolute path inside one
# user's home directory, so the notebook runs on any checkout of the repo.
DATA_DIR = Path('..') / 'data'
file_path = DATA_DIR / '1k_sampled_dataset.csv'

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Work on a copy so the frame as loaded from disk stays untouched and can be
# copied again later in the notebook.
df_copy = df.copy()


In [3]:
# Report the number of missing values in each column of the working copy.
print(df_copy.isnull().sum())


text           0
source         0
prompt_id      0
text_length    0
word_count     0
dtype: int64


In [4]:
# Display the working copy for a visual sanity check of columns and values.
df_copy


Unnamed: 0,text,source,prompt_id,text_length,word_count
0,The Philosophy and Ethics of Transhumanism\n\n...,GPT-3.5,1920,2558,394
1,Crime-tracking app Citizen is launching its ow...,Flan-T5-XXL,0,378,62
2,The court in Novorossiysk gave two of the danc...,GLM-130B,0,621,109
3,"then drops the drumsticks, poses, then walks o...",GPT-J,0,513,90
4,On tally went to the beach. She found a sand d...,GPT-J,0,4984,846
...,...,...,...,...,...
995,Please write a response to ONE of the prompts ...,Human,0,4259,790
996,Snap's next-gen Spectacles will be able to lay...,Human,0,437,68
997,Visual Place Recognition (VPR) is the ability ...,Flan-T5-XL,0,382,58
998,Addressing Ex-Felon Disenfranchisement Essay\n...,Human,0,32647,4997


Import libraries

In [5]:
import matplotlib.pyplot as plt
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

Clean the Text

In [6]:
# Text cleaning function
# The translation table and the digit pattern never change, so they are
# built once here rather than on every call.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
_DIGIT_RUN = re.compile(r'\d+')


def clean_text(text):
    """Normalise a raw document for bag-of-words style vectorisation.

    Steps, in order: strip ASCII punctuation (``string.punctuation``), drop
    runs of digits, lowercase, and collapse any whitespace to single spaces.

    Args:
        text: The raw document. ``None`` and float NaN (how pandas represents
            a missing cell) yield an empty string; any other non-string value
            is converted with ``str()`` before cleaning.

    Returns:
        The cleaned text as a ``str`` with no leading/trailing whitespace.
    """
    # Missing values would otherwise fail on ``.translate``; NaN is the only
    # float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove punctuation
    text = text.translate(_PUNCT_TABLE)
    # Remove numbers
    text = _DIGIT_RUN.sub('', text)
    # Lowercase, then split/join to squeeze out extra spaces, tabs and newlines
    return ' '.join(text.lower().split())

In [7]:
# Run every raw document through clean_text and keep the result alongside
# the original text in a new 'cleaned_text' column.
df_copy['cleaned_text'] = df_copy['text'].map(clean_text)

In [8]:
# Spot-check one cleaned document (the row labelled 5).
df_copy.loc[5, 'cleaned_text']

'african american religious communities in the us essay the african american religious societies are recognized for their contribution of making the current united states according to the history of america the african american religion has lived like a refuge in a very harsh white society however no matter how harshly they were treated they remained as a cultural identity and a show of resistance in a white occupied society wuthnow the social condition of black people is openly portrayed through the religion that they practice according to the research done by several scholars it is evident that african american religion revives the african religion in a more diversified manner the practice of african american religion has led to a mixture of religious cultural practices of african and european american elements the african american religious people have a history of handling the needs of their fellow christians it is evident of how these people suffered in the hands of the whites the

Create the Human / AI target label

In [9]:
# Count how many rows come from each distinct value of the 'source' column
# (one entry per source, most frequent first).
total_counts = df_copy['source'].value_counts()

# Display the result
print(total_counts)

source
Human                   559
GPT-3.5                 138
Flan-T5-XXL              31
GPT-J                    29
Flan-T5-Large            28
GLM-130B                 26
Flan-T5-Base             26
Flan-T5-Small            26
GPT-4                    24
Claude-Instant-v1        24
Bloom-7B                 23
Flan-T5-XL               18
GPT-NeoX                 16
Falcon-180B              15
Claude-v1                10
Goliath-120B              3
Dolphin-Mixtral-8x7B      2
Gemini-Pro                1
Cohere-Command            1
Name: count, dtype: int64


In [10]:
# Binary target: rows whose source is exactly 'Human' keep that label, and
# every other source (all the generator names) is collapsed into 'AI'.
df_copy['category'] = df_copy['source'].where(df_copy['source'] == 'Human', 'AI')

# Check the result
df_copy[['source', 'category']]

Unnamed: 0,source,category
0,GPT-3.5,AI
1,Flan-T5-XXL,AI
2,GLM-130B,AI
3,GPT-J,AI
4,GPT-J,AI
...,...,...
995,Human,Human
996,Human,Human
997,Flan-T5-XL,AI
998,Human,Human


In [11]:
# Two feature groups are kept separate because they are transformed
# differently later on: the cleaned text is vectorised, while the two
# numeric columns are scaled.
X_text = df_copy['cleaned_text']
X_length = df_copy[['text_length', 'word_count']]  # numeric features taken straight from the dataset

In [12]:
# Split dataset into train and test sets (80% / 20%, fixed seed).
# Passing the text, the numeric features and the target in one call keeps
# the three aligned: each is split on the same rows.
y = df_copy['category']  # target: 'Human' vs 'AI', derived from 'source'
X_train_text, X_test_text, X_train_length, X_test_length, y_train, y_test = train_test_split(X_text, X_length, y, test_size=0.2, random_state=42)

# Scaling and TF-IDF conversion

In [13]:
# Convert text data to TF-IDF features.
# The vocabulary (capped at 7000 terms) is learned from the training texts
# only; the test texts are merely transformed with it.
vectorizer = TfidfVectorizer(max_features=7000)
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

# Scale the length features.
# As above, the scaler is fitted on the training rows and then applied
# unchanged to the test rows.
scaler = StandardScaler()
X_train_length_scaled = scaler.fit_transform(X_train_length)
X_test_length_scaled = scaler.transform(X_test_length)

In [14]:
# Combine TF-IDF features with the scaled length features.
# The two blocks are stacked column-wise, so each row holds the TF-IDF
# columns followed by the two scaled numeric columns.
X_train_combined = hstack([X_train_tfidf, X_train_length_scaled])
X_test_combined = hstack([X_test_tfidf, X_test_length_scaled])

Model

In [15]:
# Model building: Logistic Regression
# Baseline classifier trained on the combined TF-IDF + length features;
# max_iter is raised to 1000 iterations.
model = LogisticRegression(max_iter=1000)
model.fit(X_train_combined, y_train)

Predict

In [16]:
# Predict on the test set with the logistic-regression `model` fitted above
y_pred = model.predict(X_test_combined)

# Evaluate the model: per-class precision, recall, F1 and support
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          AI       0.83      0.88      0.85        97
       Human       0.88      0.83      0.85       103

    accuracy                           0.85       200
   macro avg       0.85      0.85      0.85       200
weighted avg       0.85      0.85      0.85       200



SVM Model

In [17]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Linear-kernel support vector classifier trained on the combined
# TF-IDF + scaled length features.
model_svm = SVC(kernel='linear', random_state=42).fit(X_train_combined, y_train)

# Score the held-out rows and print per-class precision / recall / F1.
y_pred_svm = model_svm.predict(X_test_combined)
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

          AI       0.84      0.95      0.89        97
       Human       0.94      0.83      0.88       103

    accuracy                           0.89       200
   macro avg       0.89      0.89      0.88       200
weighted avg       0.89      0.89      0.88       200



K-Nearest Neighbors (KNN)

In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# 5-nearest-neighbours classifier trained on the combined
# TF-IDF + scaled length features.
model_knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_combined, y_train)

# Score the held-out rows and print per-class precision / recall / F1.
y_pred_knn = model_knn.predict(X_test_combined)
print(classification_report(y_test, y_pred_knn))


              precision    recall  f1-score   support

          AI       0.84      0.72      0.78        97
       Human       0.77      0.87      0.82       103

    accuracy                           0.80       200
   macro avg       0.81      0.80      0.80       200
weighted avg       0.81      0.80      0.80       200



In [19]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Multinomial Naive Bayes. Unlike the other models in this notebook it is
# trained on the TF-IDF matrix alone, without the scaled length columns
# (presumably because standardised values can be negative, which this
# estimator is not meant for -- confirm before changing the inputs).
model_nb = MultinomialNB().fit(X_train_tfidf, y_train)

# Score the held-out rows (TF-IDF features only, to match training).
y_pred_nb = model_nb.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_nb))

              precision    recall  f1-score   support

          AI       1.00      0.13      0.24        97
       Human       0.55      1.00      0.71       103

    accuracy                           0.58       200
   macro avg       0.78      0.57      0.47       200
weighted avg       0.77      0.58      0.48       200



In [20]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Multi-layer perceptron with a single hidden layer of 100 units, trained on
# the combined TF-IDF + scaled length features with a fixed seed.
model_mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=1000,
    random_state=42,
).fit(X_train_combined, y_train)

# Score the held-out rows and print per-class precision / recall / F1.
y_pred_mlp = model_mlp.predict(X_test_combined)
print(classification_report(y_test, y_pred_mlp))

              precision    recall  f1-score   support

          AI       0.88      0.91      0.89        97
       Human       0.91      0.88      0.90       103

    accuracy                           0.90       200
   macro avg       0.90      0.90      0.89       200
weighted avg       0.90      0.90      0.90       200



In [21]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Gradient-boosted trees with default hyper-parameters and a fixed seed,
# trained on the combined TF-IDF + scaled length features.
model_gb = GradientBoostingClassifier(random_state=42).fit(X_train_combined, y_train)

# Score the held-out rows and print per-class precision / recall / F1.
y_pred_gb = model_gb.predict(X_test_combined)
print(classification_report(y_test, y_pred_gb))

              precision    recall  f1-score   support

          AI       0.89      0.94      0.91        97
       Human       0.94      0.89      0.92       103

    accuracy                           0.92       200
   macro avg       0.92      0.92      0.91       200
weighted avg       0.92      0.92      0.92       200



Word2Vec embeddings

In [22]:
# Start the Word2Vec experiment from a fresh copy of the frame as loaded from
# disk, without the columns added to df_copy in the TF-IDF section.
df_copy2 = df.copy()

In [23]:
# Same binary target as in the TF-IDF section: 'Human' stays 'Human', every
# other source value becomes 'AI'.
df_copy2['category'] = df_copy2['source'].where(df_copy2['source'] == 'Human', 'AI')

# Check the result
df_copy2[['source', 'category']]

Unnamed: 0,source,category
0,GPT-3.5,AI
1,Flan-T5-XXL,AI
2,GLM-130B,AI
3,GPT-J,AI
4,GPT-J,AI
...,...,...
995,Human,Human
996,Human,Human
997,Flan-T5-XL,AI
998,Human,Human


In [24]:
import spacy
import pandas as pd
from gensim.models import Word2Vec
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')


# Ensure that the text column is of string type before tokenizing
df_copy2['text'] = df_copy2['text'].astype(str)

# Step 1: Tokenize each text in df_copy2['text'] using spaCy.
# Only the token strings are used below, so run just the tokenizer instead of
# calling nlp(text), which would also run every other pipeline component on
# each document for no benefit.
tokenized_texts = [
    [token.text.lower() for token in doc]
    for doc in nlp.tokenizer.pipe(df_copy2['text'])
]

# Step 2: Train the Word2Vec model on the tokenized text.
# NOTE: this rebinds the global name `model`, which earlier in the notebook
# holds the fitted LogisticRegression; re-running the earlier prediction cell
# after this one would pick up the Word2Vec object instead.
model = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)

# Step 3: Convert text to Word2Vec embeddings (average the word vectors for each document)
def get_average_word2vec(tokens_list, model, vector_size=None):
    """Return the mean word vector of one tokenized document.

    Args:
        tokens_list: Iterable of token strings for a single document.
        model: Trained embedding model whose ``wv`` attribute supports
            ``token in model.wv`` and ``model.wv[token]``.
        vector_size: Length of the zero vector returned when none of the
            tokens is known to the model. Defaults to ``model.wv.vector_size``
            so the fallback always matches the model's dimensionality.

    Returns:
        A 1-D numpy array: the element-wise mean of the vectors of all known
        tokens, or all zeros when no token is in the vocabulary.
    """
    keyed_vectors = model.wv
    if vector_size is None:
        # Take the size from the model rather than a hard-coded constant; a
        # mismatched fallback would make the stacked document matrix ragged.
        vector_size = keyed_vectors.vector_size

    # Get vectors for each known token; unknown tokens are skipped.
    word_vecs = [keyed_vectors[token] for token in tokens_list if token in keyed_vectors]
    if not word_vecs:
        return np.zeros(vector_size)  # no known words: fall back to a zero vector
    return np.mean(word_vecs, axis=0)  # average over tokens, one value per dimension

# Get the embeddings for each text (document): one averaged vector per row.
# Note that the Word2Vec model above was trained on every document before
# the split below, so the embeddings have already seen the test texts.
X = np.array([get_average_word2vec(tokens, model) for tokens in tokenized_texts])

# Step 4: Split data into train and test sets (80% / 20%, fixed seed).
# NOTE: y_train / y_test are rebound here, replacing the split created for
# the TF-IDF models earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, df_copy2['category'], test_size=0.2, random_state=42)

# Step 5: Train a supervised learning model (Logistic Regression)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Step 6: Make predictions and evaluate the model
# NOTE: y_pred is rebound as well; it previously held the TF-IDF
# logistic-regression predictions.
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f'Accuracy: {accuracy * 100:.2f}%')


Accuracy: 82.00%
