In [25]:
import os
# Debug aid: show the kernel's current working directory.
print(os.getcwd())

/home/vivek/code/nghia95/fake-data-detector/notebooks


Initial Data Load and Check 

In [1]:
from pathlib import Path

import pandas as pd

# Locate the dataset relative to the notebook's working directory (the
# `notebooks/` folder, a sibling of `data/`) instead of a machine-specific
# absolute path, so the notebook runs from any checkout of the repository.
DATA_DIR = Path("..") / "data"
file_path = DATA_DIR / "10k_sampled_dataset.csv"

# Load the CSV file into a DataFrame; `df` is kept untouched as the raw frame.
df = pd.read_csv(file_path)

# Working copy used by the TF-IDF experiments below.
df_copy = df.copy()


In [2]:
# Missing-value count per column (isna is the canonical name for isnull).
print(df_copy.isna().sum())


text           0
source         0
prompt_id      0
text_length    0
word_count     0
dtype: int64


In [3]:
# Preview the working frame (the notebook display truncates it to the first and last rows).
df_copy


Unnamed: 0,text,source,prompt_id,text_length,word_count
0,"In 1466, perhaps 40,000 people died of the pla...",Flan-T5-XXL,0,336,63
1,Amazon's yet-unnamed Lord of the Rings origina...,GLM-130B,0,2251,393
2,@Holt \n***Hi. I've just rewritten the essay....,Human,0,1994,337
3,Pleasantly surprised! Had a few options for ve...,GLM-130B,0,4182,797
4,Shakespeare’s Othello: A Tragic Hero Research ...,Human,0,12845,2143
...,...,...,...,...,...
9995,The Meaning of Civilization According to Willi...,Human,0,11346,1793
9996,Alicia was outside watering flowers. Alicia ac...,Flan-T5-XL,0,191,34
9997,Sandy was decided to take a hike in the mounta...,Flan-T5-Base,0,181,35
9998,Tommy wanted to buy a new computer. After some...,GPT-3.5,0,478,87


Import libraries

In [4]:
import matplotlib.pyplot as plt
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

Clean the Text

In [5]:
# Text cleaning function
# Built once instead of on every call: the table deletes every character in
# string.punctuation, and the pattern matches runs of digits.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_DIGIT_RUN = re.compile(r'\d+')


def clean_text(text):
    """Normalise one raw document for bag-of-words style features.

    Steps, in order: delete ASCII punctuation, delete digit runs, lowercase,
    and collapse all whitespace runs to single spaces.

    Only the characters in ``string.punctuation`` are removed; typographic
    punctuation such as ``’`` is kept. Punctuation is deleted rather than
    replaced by a space, so ``"well-known"`` becomes ``"wellknown"``.

    Args:
        text: The raw document. Missing values (``None`` or float NaN, as
            pandas produces for empty CSV cells) yield an empty string; any
            other non-string value is converted with ``str()`` first.

    Returns:
        The cleaned, lowercased, single-spaced string.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if not isinstance(text, str):
        text = str(text)

    # Remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove numbers
    text = _DIGIT_RUN.sub('', text)
    # Lowercase, then split/join to drop leading, trailing and repeated whitespace
    return ' '.join(text.lower().split())

In [6]:
# Run clean_text over every document and store the result in a new column
df_copy['cleaned_text'] = df_copy['text'].map(clean_text)

In [7]:
# Spot-check the cleaned text of the row labelled 5
df_copy.loc[5, 'cleaned_text']

'people resourcing and reward essay table of contents introduction people resourcing people strategy employee rewards conclusion reference list introduction the business world is getting more competitive each day as the competitive nature of the business environment increases there is a need for business organizations to develop strategies that will ensure that they remain competitive and do not lose popularity with the consumers one of the most important aspects that will ensure that the business organizations are able to remain competitive and meet the customer expectations is the quality of the products and services that are offered by the organization in today’s business environment customers want value for their money they are willing to pay more for the products and services that they are getting but they want to get quality products and services for the money that they pay recognizant of the need for offering quality products and services organizations are investing even more in

Create the Human / AI label

In [8]:
# Count how many rows come from each distinct value of the 'source' column
total_counts = df_copy['source'].value_counts()

# Display the result
print(total_counts)

source
Human                       5577
GPT-3.5                     1488
Flan-T5-Base                 280
Flan-T5-Small                265
GLM-130B                     263
Bloom-7B                     257
Flan-T5-XXL                  254
GPT-4                        253
Flan-T5-Large                250
Flan-T5-XL                   236
GPT-J                        212
Claude-Instant-v1            205
GPT-NeoX                     174
Falcon-180B                  126
Claude-v1                     81
Gemini-Pro                    25
Goliath-120B                  20
Dolphin-Mixtral-8x7B          13
Cohere-Command                13
Dolphin-2.5-Mixtral-8x7B       8
Name: count, dtype: int64


In [9]:
# Binary target: keep 'Human' as-is and collapse every other source into 'AI'
df_copy['category'] = df_copy['source'].where(df_copy['source'] == 'Human', 'AI')

# Show the source next to its derived label
df_copy[['source', 'category']]

Unnamed: 0,source,category
0,Flan-T5-XXL,AI
1,GLM-130B,AI
2,Human,Human
3,GLM-130B,AI
4,Human,Human
...,...,...
9995,Human,Human
9996,Flan-T5-XL,AI
9997,Flan-T5-Base,AI
9998,GPT-3.5,AI


In [10]:
# Model inputs: the cleaned text plus two numeric length features
X_text = df_copy['cleaned_text']
X_length = df_copy.loc[:, ['text_length', 'word_count']]

In [11]:
# Target is the binary 'category' label (Human vs AI)
y = df_copy['category']

# 80/20 split with a fixed seed; text, length features and labels are split
# in one call so that their rows stay aligned
(
    X_train_text, X_test_text,
    X_train_length, X_test_length,
    y_train, y_test,
) = train_test_split(X_text, X_length, y, test_size=0.2, random_state=42)

# Scaling and TF-IDF conversion

In [12]:
# Standardise the numeric length features; the scaler is fitted on the
# training split only and then reused unchanged on the test split
scaler = StandardScaler()
X_train_length_scaled = scaler.fit_transform(X_train_length)
X_test_length_scaled = scaler.transform(X_test_length)

# TF-IDF representation of the cleaned text, capped at 7000 features;
# the vocabulary is likewise learned from the training split only
vectorizer = TfidfVectorizer(max_features=7000)
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

In [13]:
# Append the scaled length columns to the right of the TF-IDF matrix
X_train_combined = hstack((X_train_tfidf, X_train_length_scaled))
X_test_combined = hstack((X_test_tfidf, X_test_length_scaled))

Model

In [14]:
# Model building: Logistic Regression
# max_iter is raised to 1000 (scikit-learn's default is 100) — presumably to
# let the solver converge on the combined feature matrix.
model = LogisticRegression(max_iter=1000)
# Fit on the combined TF-IDF + scaled length features of the training split.
model.fit(X_train_combined, y_train)

Predict

In [15]:
# Label the held-out split with the fitted logistic-regression model
y_pred = model.predict(X_test_combined)

# Per-class precision / recall / F1 on the test split
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

          AI       0.90      0.92      0.91       884
       Human       0.93      0.92      0.92      1116

    accuracy                           0.92      2000
   macro avg       0.91      0.92      0.92      2000
weighted avg       0.92      0.92      0.92      2000



SVM Model

In [16]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Linear-kernel SVM trained on the combined features (seed fixed at 42);
# fit() returns the estimator itself, so construction and training are chained
model_svm = SVC(kernel='linear', random_state=42).fit(X_train_combined, y_train)

# Label the held-out split
y_pred_svm = model_svm.predict(X_test_combined)

# Per-class precision / recall / F1
print(classification_report(y_true=y_test, y_pred=y_pred_svm))

              precision    recall  f1-score   support

          AI       0.90      0.94      0.92       884
       Human       0.95      0.92      0.93      1116

    accuracy                           0.93      2000
   macro avg       0.93      0.93      0.93      2000
weighted avg       0.93      0.93      0.93      2000



K-Nearest Neighbors (KNN)

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# 5-nearest-neighbours classifier on the combined features;
# fit() returns the estimator itself, so construction and training are chained
model_knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_combined, y_train)

# Label the held-out split
y_pred_knn = model_knn.predict(X_test_combined)

# Per-class precision / recall / F1
print(classification_report(y_true=y_test, y_pred=y_pred_knn))


              precision    recall  f1-score   support

          AI       0.85      0.82      0.84       884
       Human       0.86      0.88      0.87      1116

    accuracy                           0.86      2000
   macro avg       0.86      0.85      0.85      2000
weighted avg       0.86      0.86      0.86      2000



In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Multinomial Naive Bayes on the TF-IDF matrix only. Unlike the other models
# it does not receive the scaled length columns — presumably because
# standardised values can be negative, which MultinomialNB does not accept.
model_nb = MultinomialNB().fit(X_train_tfidf, y_train)

# Label the held-out split (TF-IDF features only, matching training)
y_pred_nb = model_nb.predict(X_test_tfidf)

# Per-class precision / recall / F1
print(classification_report(y_true=y_test, y_pred=y_pred_nb))

              precision    recall  f1-score   support

          AI       0.84      0.77      0.80       884
       Human       0.83      0.89      0.86      1116

    accuracy                           0.83      2000
   macro avg       0.84      0.83      0.83      2000
weighted avg       0.83      0.83      0.83      2000



In [19]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Multi-layer perceptron with a single hidden layer of 100 units, at most
# 1000 training iterations, seed fixed at 42
model_mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=1000,
    random_state=42,
).fit(X_train_combined, y_train)

# Label the held-out split
y_pred_mlp = model_mlp.predict(X_test_combined)

# Per-class precision / recall / F1
print(classification_report(y_true=y_test, y_pred=y_pred_mlp))

              precision    recall  f1-score   support

          AI       0.90      0.90      0.90       884
       Human       0.92      0.92      0.92      1116

    accuracy                           0.91      2000
   macro avg       0.91      0.91      0.91      2000
weighted avg       0.91      0.91      0.91      2000



In [20]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Gradient-boosted trees with library-default hyper-parameters, seed fixed at 42
model_gb = GradientBoostingClassifier(random_state=42).fit(X_train_combined, y_train)

# Label the held-out split
y_pred_gb = model_gb.predict(X_test_combined)

# Per-class precision / recall / F1
print(classification_report(y_true=y_test, y_pred=y_pred_gb))

              precision    recall  f1-score   support

          AI       0.90      0.95      0.92       884
       Human       0.96      0.92      0.94      1116

    accuracy                           0.93      2000
   macro avg       0.93      0.93      0.93      2000
weighted avg       0.93      0.93      0.93      2000



Word2Vec

In [21]:
# Fresh, independent copy of the raw frame for the Word2Vec experiment
df_copy2 = df.copy(deep=True)

In [22]:
# Binary target: keep 'Human' as-is and collapse every other source into 'AI'
df_copy2['category'] = df_copy2['source'].where(df_copy2['source'] == 'Human', 'AI')

# Show the source next to its derived label
df_copy2[['source', 'category']]

Unnamed: 0,source,category
0,Flan-T5-XXL,AI
1,GLM-130B,AI
2,Human,Human
3,GLM-130B,AI
4,Human,Human
...,...,...
9995,Human,Human
9996,Flan-T5-XL,AI
9997,Flan-T5-Base,AI
9998,GPT-3.5,AI


Train the Word2Vec + Logistic Regression model and save it

In [23]:
import os
import numpy as np
import pandas as pd
import spacy
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib

# Load spaCy's English model
nlp = spacy.load('en_core_web_sm')

# df_copy2 must provide a 'text' column (documents) and a 'category' column (labels).
# Coerce to str so that a non-string cell cannot break tokenization.
df_copy2['text'] = df_copy2['text'].astype(str)

# Step 1: Tokenize each text in df_copy2['text'] using spaCy
# Only the token texts are used, so run the tokenizer alone rather than the
# full pipeline: nlp(text) would also run every loaded pipeline component on
# each document, which is wasted work here.
# NOTE(review): assumes no pipeline component re-tokenizes, so the tokens
# match those of nlp(text) — expected for the stock en_core_web_sm; confirm.
tokenized_texts = [
    [token.text.lower() for token in doc]
    for doc in nlp.tokenizer.pipe(df_copy2['text'])
]

# Step 2: Train the Word2Vec model on the tokenized text
# min_count=1 keeps every token, including ones seen only once.
# seed pins the random initialisation; gensim documents that fully
# deterministic training additionally requires workers=1 and a fixed
# PYTHONHASHSEED, so with workers=4 small run-to-run differences remain.
# NOTE(review): the embeddings are trained on all documents, including those
# that later land in the test split.
word2vec_model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    seed=42,
)

# Step 3: Convert text to Word2Vec embeddings (average the word vectors for each document)
def get_average_word2vec(tokens_list, model, vector_size=None):
    """Return the mean word vector of a tokenized document.

    Args:
        tokens_list: Iterable of token strings for one document.
        model: Object exposing ``model.wv`` that supports ``token in model.wv``
            and ``model.wv[token]`` (e.g. a trained gensim Word2Vec model).
        vector_size: Length of the zero vector returned when no token is
            known to the model. Defaults to ``model.wv.vector_size`` so the
            fallback always matches the model's dimensionality; an explicit
            value is honoured as before.

    Returns:
        1-D numpy array: the element-wise mean of the vectors of all known
        tokens, or an all-zero vector when none of the tokens is known.
    """
    keyed_vectors = model.wv
    # Tokens missing from the vocabulary are skipped.
    word_vecs = [keyed_vectors[token] for token in tokens_list if token in keyed_vectors]
    if word_vecs:
        return np.mean(word_vecs, axis=0)  # Take the mean of the vectors

    # No known token: fall back to a zero vector of the model's own size
    # instead of a hard-coded 100, which would give rows of the wrong length
    # for a model trained with a different vector_size.
    if vector_size is None:
        vector_size = keyed_vectors.vector_size
    return np.zeros(vector_size)

# One averaged Word2Vec vector per tokenized document, stacked into a matrix
X = np.array([
    get_average_word2vec(doc_tokens, word2vec_model)
    for doc_tokens in tokenized_texts
])

# Step 4: 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df_copy2['category'],
    test_size=0.2,
    random_state=42,
)

# Step 5: Fit the logistic-regression classifier on the training embeddings
# (fit() returns the estimator itself, so construction and training are chained)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Step 6: Score the held-out split
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Persist both artefacts under ./export (created on demand)
word2vec_model_path = 'export/word2vec_model'
logreg_model_path = 'export/logistic_regression_model.pkl'
os.makedirs("export", exist_ok=True)
word2vec_model.save(word2vec_model_path)
joblib.dump(clf, logreg_model_path)

print(f"Models saved to {word2vec_model_path} and {logreg_model_path}")


Accuracy: 87.90%
Models saved to export/word2vec_model and export/logistic_regression_model.pkl


Prepare the text for prediction

In [33]:


# Sample input to classify (a one-element list; only the first entry is used)
test_text = ["This is not AI, is it? Come on, I am a human being. You cannot call me AI. I am a human. See, I am typing. Can you?"]

# Same preprocessing as for training: spaCy tokens, lower-cased
tokenized_test_text = [tok.text.lower() for tok in nlp(test_text[0])]

# Average the Word2Vec vectors of the tokens into one document embedding
test_vector = get_average_word2vec(tokenized_test_text, word2vec_model)

# The classifier expects a 2-D (n_samples, n_features) array: add a leading axis
test_vector_reshaped = test_vector.reshape(1, -1)

# Classify with the fitted logistic-regression model
predicted_category = clf.predict(test_vector_reshaped)

print(f"Predicted category: {predicted_category[0]}")

Predicted category: AI
