In [10]:
import os
# Show the kernel's working directory; relative paths used later in this notebook resolve against it
print(os.getcwd())

/home/vivek/code/nghia95/fake-data-detector/notebooks


Initial Data Load and Check 

In [11]:
import pandas as pd

# Locate the dataset relative to the notebook's working directory instead of a
# machine-specific absolute path, so the notebook runs on any checkout.
# Assumes the kernel's working directory is <repo>/notebooks (as printed by the
# first cell) and that the data lives in the sibling <repo>/data folder.
file_path = os.path.join('..', 'data', '5k_sampled_dataset.csv')

# Load the CSV file into a DataFrame
df = pd.read_csv(file_path)

# Work on a copy so the raw frame `df` stays untouched and can be copied again later
df_copy = df.copy()


In [12]:
# Count missing values per column before any processing is done
null_counts = df_copy.isnull().sum()
print(null_counts)


text           0
source         0
prompt_id      0
text_length    0
word_count     0
dtype: int64


In [13]:
# Display the loaded frame as the cell output for a quick visual check
df_copy


Unnamed: 0,text,source,prompt_id,text_length,word_count
0,... invention ~~for~~ of the last centuries. ~...,Human,0,1524,229
1,Self-Defense Techniques in Martial Arts Essay\...,Human,0,3564,597
2,The impact of robotics on the future of work i...,GPT-3.5,2874,3858,615
3,Taller individuals will likely need a longer t...,Flan-T5-XXL,0,462,80
4,"The Face on Mars, captured by NASA's Viking 1 ...",Falcon-180B,7,2078,347
...,...,...,...,...,...
4995,Karen Springen’s “Why We Tuned Out” Essay\n\nI...,Human,0,3693,596
4996,Second characteristic being intellectual. Inte...,Human,0,1270,224
4997,**Adult youths are often called up for working...,Human,0,2064,371
4998,Case of Westpac Bank & St. George Bank Merger ...,Human,0,4359,713


Import libraries

In [14]:
import matplotlib.pyplot as plt
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack

Clean the Text

In [15]:
# Translation table that deletes every character in string.punctuation.
# Built once here instead of on every call to clean_text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text):
    """Normalize one document before vectorization.

    Steps, in order: delete ASCII punctuation, delete digit runs, lowercase,
    and collapse every run of whitespace into a single space (leading and
    trailing whitespace is dropped).

    Args:
        text: The raw document. ``None`` and float NaN (how pandas represents
            a missing cell) are treated as an empty document; any other
            non-string value is converted with ``str()`` first.

    Returns:
        The cleaned string; ``''`` for missing input.
    """
    # Missing cells would otherwise fail with AttributeError on .translate()
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Remove punctuation.
    # NOTE(review): string.punctuation is ASCII-only, so typographic characters
    # such as curly quotes are left in place — confirm that is intended.
    text = text.translate(_PUNCTUATION_TABLE)
    # Remove numbers (optional)
    text = re.sub(r'\d+', '', text)
    # Convert to lowercase
    text = text.lower()
    # Remove extra spaces
    return ' '.join(text.split())

In [16]:
# Run every raw document through clean_text and keep the result next to the original text
cleaned_series = df_copy['text'].apply(clean_text)
df_copy['cleaned_text'] = cleaned_series

In [17]:
# Spot-check one cleaned document (the row labelled 5)
df_copy.loc[5, 'cleaned_text']

'honestly nothing special people still eat there dont ask me food is bland theye skimp on hummus man dunk the stuff on my plate its good for me i want a little but they gave you as much or more than most restaurants and that includes of fast food places just sad was this helpfulhelpfulundecided'

Create the Human vs. AI target label

In [18]:
# Count how many rows come from each distinct value of the 'source' column
total_counts = df_copy['source'].value_counts()

# Display the result
print(total_counts)

source
Human                       2784
GPT-3.5                      751
Flan-T5-XXL                  145
Flan-T5-XL                   143
Flan-T5-Base                 136
GPT-J                        128
GPT-4                        120
Bloom-7B                     118
GLM-130B                     117
Flan-T5-Large                109
Claude-Instant-v1            108
GPT-NeoX                     107
Flan-T5-Small                105
Falcon-180B                   60
Claude-v1                     37
Goliath-120B                   9
Gemini-Pro                     9
Cohere-Command                 6
Dolphin-2.5-Mixtral-8x7B       4
Dolphin-Mixtral-8x7B           4
Name: count, dtype: int64


In [19]:
# Collapse the individual source names into a binary label:
# 'Human' stays 'Human', every other source becomes 'AI'
df_copy['category'] = ['Human' if origin == 'Human' else 'AI' for origin in df_copy['source']]

# Show source next to the derived label as a sanity check
df_copy[['source', 'category']]

Unnamed: 0,source,category
0,Human,Human
1,Human,Human
2,GPT-3.5,AI
3,Flan-T5-XXL,AI
4,Falcon-180B,AI
...,...,...
4995,Human,Human
4996,Human,Human
4997,Human,Human
4998,Human,Human


In [20]:
# Model inputs: the cleaned text, plus two numeric length columns as extra features
numeric_feature_columns = ['text_length', 'word_count']
X_text = df_copy['cleaned_text']
X_length = df_copy[numeric_feature_columns]

In [21]:
# Target: the binary 'category' column (Human vs. AI)
y = df_copy['category']

# Hold out 20% of the rows for testing with a fixed seed.
# Splitting text, length features and labels in one call keeps them row-aligned.
(
    X_train_text, X_test_text,
    X_train_length, X_test_length,
    y_train, y_test,
) = train_test_split(X_text, X_length, y, test_size=0.2, random_state=42)

Scaling and TF-IDF conversion

In [22]:
# Standardize the numeric length features; the scaler is fitted on the
# training rows only and then re-used unchanged for the test rows
scaler = StandardScaler()
X_train_length_scaled = scaler.fit_transform(X_train_length)
X_test_length_scaled = scaler.transform(X_test_length)

# Turn the cleaned text into TF-IDF vectors (vocabulary capped at 7000 terms),
# again learning the vocabulary from the training rows only
vectorizer = TfidfVectorizer(max_features=7000)
X_train_tfidf = vectorizer.fit_transform(X_train_text)
X_test_tfidf = vectorizer.transform(X_test_text)

In [23]:
# Combine TF-IDF features with the scaled length features, column-wise.
# hstack returns COO format by default; asking for CSR up front gives the
# estimators a row-sliceable matrix and avoids a format conversion inside
# every later fit/predict call. The stored values are unchanged.
X_train_combined = hstack([X_train_tfidf, X_train_length_scaled], format='csr')
X_test_combined = hstack([X_test_tfidf, X_test_length_scaled], format='csr')

Model

In [24]:
# Model building: Logistic Regression on the combined TF-IDF + length features
# max_iter=1000 sets the solver's iteration cap explicitly
model = LogisticRegression(max_iter=1000)
# fit() is the cell's last expression, so its return value is shown as the cell output
model.fit(X_train_combined, y_train)

Predict

In [25]:
# Label the held-out rows with the fitted logistic regression
y_pred = model.predict(X_test_combined)

# Per-class precision / recall / F1 on the test split
logreg_report = classification_report(y_test, y_pred)
print(logreg_report)

              precision    recall  f1-score   support

          AI       0.86      0.91      0.89       440
       Human       0.93      0.88      0.90       560

    accuracy                           0.90      1000
   macro avg       0.89      0.90      0.90      1000
weighted avg       0.90      0.90      0.90      1000



SVM Model

In [26]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Linear-kernel support vector classifier, trained on the combined feature matrix
model_svm = SVC(kernel='linear', random_state=42).fit(X_train_combined, y_train)

# Score the held-out rows and report per-class metrics
y_pred_svm = model_svm.predict(X_test_combined)
print(classification_report(y_test, y_pred_svm))

              precision    recall  f1-score   support

          AI       0.87      0.93      0.90       440
       Human       0.94      0.89      0.92       560

    accuracy                           0.91      1000
   macro avg       0.91      0.91      0.91      1000
weighted avg       0.91      0.91      0.91      1000



K-Nearest Neighbors (KNN)

In [27]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# 5-nearest-neighbours classifier, trained on the combined feature matrix
model_knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_combined, y_train)

# Score the held-out rows and report per-class metrics
y_pred_knn = model_knn.predict(X_test_combined)
print(classification_report(y_test, y_pred_knn))


              precision    recall  f1-score   support

          AI       0.80      0.82      0.81       440
       Human       0.86      0.84      0.85       560

    accuracy                           0.83      1000
   macro avg       0.83      0.83      0.83      1000
weighted avg       0.83      0.83      0.83      1000



In [28]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Multinomial Naive Bayes, trained on the TF-IDF matrix alone rather than the
# combined features — presumably because the standardized length columns can be
# negative, which MultinomialNB does not accept (TODO confirm intent)
model_nb = MultinomialNB().fit(X_train_tfidf, y_train)

# Score the held-out rows (TF-IDF only, matching training) and report per-class metrics
y_pred_nb = model_nb.predict(X_test_tfidf)
print(classification_report(y_test, y_pred_nb))

              precision    recall  f1-score   support

          AI       0.87      0.72      0.79       440
       Human       0.80      0.92      0.86       560

    accuracy                           0.83      1000
   macro avg       0.84      0.82      0.82      1000
weighted avg       0.83      0.83      0.83      1000



In [29]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Multi-layer perceptron with a single hidden layer of 100 units, seeded for repeatability
model_mlp = MLPClassifier(
    hidden_layer_sizes=(100,),
    max_iter=1000,
    random_state=42,
).fit(X_train_combined, y_train)

# Score the held-out rows and report per-class metrics
y_pred_mlp = model_mlp.predict(X_test_combined)
print(classification_report(y_test, y_pred_mlp))

              precision    recall  f1-score   support

          AI       0.89      0.87      0.88       440
       Human       0.90      0.92      0.91       560

    accuracy                           0.90      1000
   macro avg       0.90      0.90      0.90      1000
weighted avg       0.90      0.90      0.90      1000



In [30]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Gradient-boosted trees with library defaults, seeded for repeatability
model_gb = GradientBoostingClassifier(random_state=42).fit(X_train_combined, y_train)

# Score the held-out rows and report per-class metrics
y_pred_gb = model_gb.predict(X_test_combined)
print(classification_report(y_test, y_pred_gb))

              precision    recall  f1-score   support

          AI       0.88      0.95      0.91       440
       Human       0.96      0.90      0.93       560

    accuracy                           0.92      1000
   macro avg       0.92      0.92      0.92      1000
weighted avg       0.92      0.92      0.92      1000



Word2Vec

In [31]:
# Start again from the raw frame: a fresh copy without the columns that were added to df_copy
df_copy2 = df.copy()

In [32]:
# Derive the binary target from 'source': 'Human' is kept, any other source is labelled 'AI'
def _human_or_ai(source_name):
    """Return 'Human' for the 'Human' source and 'AI' for every other source."""
    if source_name == 'Human':
        return 'Human'
    return 'AI'

df_copy2['category'] = df_copy2['source'].apply(_human_or_ai)

# Show source next to the derived label as a sanity check
df_copy2[['source', 'category']]

Unnamed: 0,source,category
0,Human,Human
1,Human,Human
2,GPT-3.5,AI
3,Flan-T5-XXL,AI
4,Falcon-180B,AI
...,...,...
4995,Human,Human
4996,Human,Human
4997,Human,Human
4998,Human,Human


Train the Word2Vec + Logistic Regression model and save it

In [34]:
import os
import numpy as np
import pandas as pd
import spacy
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib

# Load spaCy's English model (only its tokenizer is used below)
nlp = spacy.load('en_core_web_sm')

# df_copy2 is expected to have a 'text' column and a 'category' column.
# Force the text column to string type so the tokenizer always receives str.
df_copy2['text'] = df_copy2['text'].astype(str)

# Step 1: Tokenize each text in df_copy2['text'] using spaCy.
# Only token.text is read, so nlp.make_doc (tokenizer only) is used instead of
# nlp(text): it skips the remaining pipeline components, whose annotations are
# never consumed here and which dominate the run time.
tokenized_texts = [
    [token.text.lower() for token in nlp.make_doc(text)]
    for text in df_copy2['text']
]

# Step 2: Train the Word2Vec model on the tokenized text.
# A fixed seed plus a single worker thread makes training repeatable; with
# several workers, thread scheduling changes the update order between runs.
# Per the gensim docs, repeatability across interpreter launches additionally
# needs PYTHONHASHSEED to be set.
word2vec_model = Word2Vec(
    sentences=tokenized_texts,
    vector_size=100,
    window=5,
    min_count=1,  # keep every token, so no document token is out of vocabulary
    workers=1,
    seed=42,
)

# Step 3: Convert text to Word2Vec embeddings (average the word vectors for each document)
def get_average_word2vec(tokens_list, model, vector_size=None):
    """Embed one document as the mean of its tokens' word vectors.

    Args:
        tokens_list: Iterable of tokens for a single document.
        model: Object exposing word vectors as ``model.wv`` (supports
            ``token in model.wv`` and ``model.wv[token]``), e.g. a trained
            gensim Word2Vec model.
        vector_size: Length of the zero vector returned when none of the
            tokens is known to the model. Defaults to ``model.wv.vector_size``
            so the fallback always matches the model's dimensionality instead
            of a hard-coded 100.

    Returns:
        A 1-D numpy array: the element-wise mean of the known tokens' vectors,
        or all zeros when no token is found in the model.
    """
    # Tokens unknown to the model are skipped rather than raising KeyError
    word_vecs = [model.wv[token] for token in tokens_list if token in model.wv]
    if word_vecs:
        return np.mean(word_vecs, axis=0)  # Take the mean of the vectors

    # No known tokens: fall back to a zero vector of the model's width
    if vector_size is None:
        vector_size = model.wv.vector_size
    return np.zeros(vector_size)

# Build the embedding matrix: one averaged Word2Vec vector per tokenized document
document_vectors = [
    get_average_word2vec(tokens, word2vec_model)
    for tokens in tokenized_texts
]
X = np.array(document_vectors)

# Step 4: Hold out 20% of the documents for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    df_copy2['category'],
    test_size=0.2,
    random_state=42,
)

# Step 5: Fit a logistic regression classifier on the document embeddings
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Step 6: Predict the held-out documents and report plain accuracy
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Create the output folder if needed (no error when it already exists)
os.makedirs("export", exist_ok=True)

# Destination files, relative to the kernel's working directory
word2vec_model_path = 'export/word2vec_model'
logreg_model_path = 'export/logistic_regression_model.pkl'

# Persist both artefacts: the embedding model via its own save(), the classifier via joblib
word2vec_model.save(word2vec_model_path)
joblib.dump(clf, logreg_model_path)

print(f"Models saved to {word2vec_model_path} and {logreg_model_path}")


Accuracy: 87.50%
Models saved to export/word2vec_model and export/logistic_regression_model.pkl
