# Preprocessing

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Creating a csv-file with two columns (sentence and label)
file_path = 'TRAINING_DATA.txt'

# Initializing lists to store sentences and labels
sentences = []
labels = []

# Reading the text file
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        # Split the line based on tab or other delimiter used between label and sentence
        label, sentence = line.strip().split('\t')  # Adjust delimiter if different
        sentences.append(sentence)
        labels.append(int(label))  # Convert label to integer

# Creating a DataFrame with two columns: 'Sentence' and 'Label'
data = pd.DataFrame({
    'Sentence': sentences,
    'Label': labels
})

# Saving the DataFrame to a CSV file
csv_path = "translated_sentences.csv"
data.to_csv(csv_path, index=False, encoding='utf-8')

print(f"CSV file created at: {csv_path}")

CSV file created at: translated_sentences.csv


In [3]:
df = pd.read_csv('translated_sentences.csv')
df.set_index('Label',drop=True)


Unnamed: 0_level_0,Sentence
Label,Unnamed: 1_level_1
1,"Cuando conocí a Janice en 2013 , una familia n..."
0,Hwang habló en Sur de este año por Southwest M...
1,Usted podría pensar Katy Perry y Robert Pattin...
1,Cualquiera que haya volado los cielos del crea...
1,"Bueno , este cantante tendrá un LARGO tiempo p..."
...,...
0,""" Ella repetía , como hemos luchado durante ve..."
1,Y despues de vender sus acciones en Polar Bear...
0,No es sólo malo para los pobres .
0,Mientras espera la quinta oleada aún más letal...


In [4]:
print(data.shape)
data.fillna("",inplace=True)

(17877, 2)


In [5]:
import ssl
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [6]:
import nltk
print(nltk.data.path)

['/root/nltk_data', '/usr/nltk_data', '/usr/share/nltk_data', '/usr/lib/nltk_data', '/usr/share/nltk_data', '/usr/local/share/nltk_data', '/usr/lib/nltk_data', '/usr/local/lib/nltk_data']


In [7]:
# Download stopwords to the appropriate directory
import nltk
nltk.download('stopwords', download_dir='nltk_data')

[nltk_data] Downloading package stopwords to nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [8]:
import re

def remove_special_characters(text):
    # Keep only alphabetic characters and whitespace
    cleaned_text = re.sub(r'[^A-Za-z\s]', '', text)
    return cleaned_text

# Example usage:
data = pd.read_csv("translated_sentences.csv")  # Assuming the CSV is created as per previous instructions
data['Cleaned_Sentence'] = data['Sentence'].apply(remove_special_characters)

# Save the cleaned version
data.to_csv("cleaned_sentences.csv", index=False, encoding='utf-8')

In [9]:
def clean_text(text):
    # Remove special characters and punctuation using a regular expression
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only letters and whitespace
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()  # Remove extra whitespace
    return cleaned_text

clean_text

In [10]:
# Download required NLTK data
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)

True

In [11]:
import nltk
nltk.data.path.append('/nltk_data')


In [12]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Ensure necessary resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')

def clean_text(text):
    # Remove special characters and punctuation using a regular expression
    cleaned_text = re.sub(r'[^a-zA-ZáéíóúÁÉÍÓÚñÑ\s]', '', text)  # Keep only letters, accented vowels, and whitespace
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip()  # Remove extra whitespace
    return cleaned_text

def preprocess_text(text):
    # Tokenize text using the Spanish language option
    tokens = word_tokenize(text, language='spanish')

    # Load Spanish stopwords
    stop_words = set(stopwords.words('spanish'))

    # Remove stopwords
    cleaned_tokens = [word for word in tokens if word.lower() not in stop_words]

    return cleaned_tokens

# Example usage
text = "Hola, ¿cómo estás? Estoy bien, gracias por preguntar."
cleaned_text = clean_text(text)  # Clean the text first
cleaned_tokens = preprocess_text(cleaned_text)  # Then preprocess the cleaned text
print(cleaned_tokens)


['Hola', 'cómo', 'bien', 'gracias', 'preguntar']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import nltk

nltk.download('wordnet')
nltk.download('omw-1.4')

lemmatizer = WordNetLemmatizer()

def lemmatize_spanish(tokens):
    lemmatized = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    return lemmatized

# Example usage
lemmatized_tokens = lemmatize_spanish(cleaned_tokens)
print(lemmatized_tokens)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


['hola', 'cómo', 'bien', 'gracias', 'preguntar']


In [17]:
!pip install textstat

Collecting textstat
  Downloading textstat-0.7.4-py3-none-any.whl.metadata (14 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.16.0-py3-none-any.whl.metadata (3.2 kB)
Downloading textstat-0.7.4-py3-none-any.whl (105 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/105.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.1/105.1 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.16.0-py3-none-any.whl (2.1 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/2.1 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m63.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, textstat
Successfully installed pyphen-0.16.0 textstat-0.7.4


In [15]:
import numpy as np
import pandas as pd
import nltk
import spacy
from textstat import textstat
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Load spacy model for POS tagging and Named Entity Recognition
nlp = spacy.load("en_core_web_sm")  # Choose appropriate model for the target language

nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def calculate_lexical_diversity(text):
    words = [word for word in word_tokenize(text) if word.isalpha() and word not in stop_words]
    return len(set(words)) / len(words) if len(words) > 0 else 0

def calculate_readability_scores(text):
    return textstat.flesch_reading_ease(text)

# Example function for extracting lexical features
def extract_lexical_features(texts):
    lexical_features = []
    for text in texts:
        lexical_diversity = calculate_lexical_diversity(text)
        readability_score = calculate_readability_scores(text)
        lexical_features.append([lexical_diversity, readability_score])

    return np.array(lexical_features)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
def calculate_pos_distribution(text):
    doc = nlp(text)
    pos_counts = Counter([token.pos_ for token in doc])
    total_tokens = sum(pos_counts.values())
    return {pos: count / total_tokens for pos, count in pos_counts.items()}

def extract_pos_features(texts):
    pos_features = []
    for text in texts:
        pos_distribution = calculate_pos_distribution(text)
        # Selecting only some POS tags for analysis (e.g., 'NOUN', 'VERB', 'ADJ', etc.)
        selected_pos = ['NOUN', 'VERB', 'ADJ', 'ADV', 'PRON']
        features = [pos_distribution.get(pos, 0) for pos in selected_pos]
        pos_features.append(features)

    return np.array(pos_features)

In [17]:
def count_named_entities(text):
    doc = nlp(text)
    named_entities = [(ent.text, ent.label_) for ent in doc.ents]
    return len(named_entities)

def extract_named_entity_features(texts):
    ner_features = [count_named_entities(text) for text in texts]
    return np.array(ner_features).reshape(-1, 1)

In [21]:
!pip install language-tool-python

Collecting language-tool-python
  Downloading language_tool_python-2.8.1-py3-none-any.whl.metadata (12 kB)
Downloading language_tool_python-2.8.1-py3-none-any.whl (35 kB)
Installing collected packages: language-tool-python
Successfully installed language-tool-python-2.8.1


In [18]:
import language_tool_python

tool = language_tool_python.LanguageTool('en-US')  # Change to the target language

def count_grammar_errors(text):
    matches = tool.check(text)
    return len(matches)

def extract_grammar_error_features(texts):
    grammar_features = [count_grammar_errors(text) for text in texts]
    return np.array(grammar_features).reshape(-1, 1)

In [19]:
from transformers import BertTokenizer, TFBertModel
from sklearn.metrics.pairwise import cosine_similarity

# Load a BERT model
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

def encode_text_bert(text):
    inputs = tokenizer(text, return_tensors="tf", truncation=True, padding="max_length", max_length=128)
    outputs = bert_model(inputs)
    return outputs.last_hidden_state[:, 0, :].numpy()  # Extract the [CLS] token representation

def calculate_semantic_similarity(text, native_corpus):
    text_vector = encode_text_bert(text)
    corpus_vectors = np.vstack([encode_text_bert(t) for t in native_corpus])
    similarities = cosine_similarity(text_vector, corpus_vectors)
    return np.mean(similarities)

# Example function for extracting semantic similarity features
def extract_semantic_features(texts, native_corpus):
    semantic_features = [calculate_semantic_similarity(text, native_corpus) for text in texts]
    return np.array(semantic_features).reshape(-1, 1)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT

In [20]:
def extract_all_features(texts, native_corpus):
    lexical_features = extract_lexical_features(texts)
    pos_features = extract_pos_features(texts)
    ner_features = extract_named_entity_features(texts)
    grammar_features = extract_grammar_error_features(texts)
    semantic_features = extract_semantic_features(texts, native_corpus)

    # Combine all features into a single feature matrix
    combined_features = np.hstack((lexical_features, pos_features, ner_features, grammar_features, semantic_features))
    return combined_features

# Feature Engineering

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer

def extract_tfidf_features(sentences):
    # TF-IDF with unigrams and bigrams
    tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)  # Adjust max_features as needed
    tfidf_features = tfidf_vectorizer.fit_transform(sentences)
    return tfidf_features

# Example usage
tfidf_features = extract_tfidf_features(data['Sentence'])

In [22]:
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

def extract_pos_tags(sentences):
    pos_counts = []
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        pos_tags = nltk.pos_tag(tokens)
        # Calculating the count of each POS type ('NN' for noun, 'VB' for verb, 'JJ' for adjective)
        pos_counts.append({pos: sum(1 for word, tag in pos_tags if tag == pos) for pos in ['NN', 'VB', 'JJ']})
    return pos_counts


pos_features = extract_pos_tags(data['Sentence'])

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
def extract_length_features(sentences):
    length_features = []
    for sentence in sentences:
        num_words = len(sentence.split())
        num_chars = len(sentence)
        length_features.append({'word_count': num_words, 'char_count': num_chars})
    return length_features

# Example usage
length_features = extract_length_features(data['Sentence'])

In [24]:
from gensim.models import KeyedVectors

def load_glove_model(file_path):
    return KeyedVectors.load_word2vec_format(file_path, binary=False)

def get_average_word_vectors(sentences, model, num_features=300):
    # Assuming model is pre-trained with 300 dimensions
    features = []
    for sentence in sentences:
        words = sentence.split()
        word_vectors = [model[word] for word in words if word in model]
        if word_vectors:
            features.append(np.mean(word_vectors, axis=0))
        else:
            features.append(np.zeros(num_features))
    return np.array(features)



In [25]:
from textstat import flesch_reading_ease

def extract_readability_features(sentences):
    readability_scores = [flesch_reading_ease(sentence) for sentence in sentences]
    return readability_scores

# Example usage
readability_features = extract_readability_features(data['Sentence'])

In [26]:
from scipy.sparse import hstack

length_df = pd.DataFrame(length_features)
pos_df = pd.DataFrame(pos_features)
readability_df = pd.DataFrame(readability_features, columns=['readability'])

# Combining with TF-IDF features using sparse matrix stacking
final_features = hstack([tfidf_features, length_df, pos_df, readability_df])


# Train-test-split

In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split


# Step 1: Extract readability and POS features
readability_features = extract_readability_features(data['Sentence'])
pos_features = extract_pos_tags(data['Sentence'])

# Step 2: Create a new DataFrame for combined features
features_df = pd.DataFrame({
    'Readability_Score': readability_features,
    'NN_Count': [feat.get('NN', 0) for feat in pos_features],
    'VB_Count': [feat.get('VB', 0) for feat in pos_features],
    'JJ_Count': [feat.get('JJ', 0) for feat in pos_features]
})

# Combine these features with the labels
final_data = pd.concat([features_df, data['Label']], axis=1)

# Step 3: Train-test split
X = final_data.drop(columns=['Label'])  # Features
y = final_data['Label']  # Target

# Split the data (80% for training and 20% for testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display shapes of the resulting datasets
print(f"Training set: X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"Test set: X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

Training set: X_train shape: (14301, 4), y_train shape: (14301,)
Test set: X_test shape: (3576, 4), y_test shape: (3576,)


# Model Training

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
import tensorflow as tf

# Load a pre-trained BERT tokenizer and model for sequence classification
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Example text to classify
text = "This is an example sentence."

# Tokenize the input text
inputs = tokenizer(text, return_tensors='tf', truncation=True, padding=True, max_length=128)

# Run the model to get predictions
outputs = model(inputs)

# Convert logits to probabilities using softmax
probabilities = tf.nn.softmax(outputs.logits, axis=-1)

# Print the predicted class and confidence scores
predicted_class = tf.argmax(probabilities, axis=-1).numpy()
confidence_scores = probabilities.numpy()

print(f"Predicted Class: {predicted_class[0]}, Confidence Scores: {confidence_scores}")

In [None]:
import textstat

In [6]:
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.data import Dataset
import textstat
import nltk

# Ensure NLTK dependencies are downloaded
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Step 1: Load and clean training data
file_path = 'translated_sentences.csv'
data = pd.read_csv(file_path)
data.fillna("", inplace=True)

def remove_special_characters(text):
    return re.sub(r'[^A-Za-z\s]', '', text)

# Clean sentences
data['Cleaned_Sentence'] = data['Sentence'].apply(remove_special_characters)

def extract_pos_tags(sentences):
    pos_counts = []
    for sentence in sentences:
        tokens = nltk.word_tokenize(sentence)
        pos_tags = nltk.pos_tag(tokens)
        pos_counts.append({pos: sum(1 for word, tag in pos_tags if tag == pos) for pos in ['NN', 'VB', 'JJ']})
    return pos_counts

def extract_readability_features(sentences):
    return [textstat.flesch_reading_ease(sentence) for sentence in sentences]

# Extract features
readability_features = extract_readability_features(data['Sentence'])
pos_features = extract_pos_tags(data['Sentence'])

# Create features DataFrame
features_df = pd.DataFrame({
    'Readability_Score': readability_features,
    'NN_Count': [feat.get('NN', 0) for feat in pos_features],
    'VB_Count': [feat.get('VB', 0) for feat in pos_features],
    'JJ_Count': [feat.get('JJ', 0) for feat in pos_features]
})

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(features_df, data['Label'], test_size=0.2, random_state=42)

# Load BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
bert_model = TFBertModel.from_pretrained(model_name)

# Create BERT embeddings in batches
def encode_text_bert_batch(texts, tokenizer, model, batch_size=8):
    texts = texts.tolist()
    dataset = Dataset.from_tensor_slices(texts).batch(batch_size)
    bert_embeddings = []

    for batch in dataset:
        batch_text = [str(text) for text in batch.numpy()]
        inputs = tokenizer(batch_text, return_tensors="tf", padding=True, truncation=True, max_length=128)
        outputs = model(inputs)
        cls_embeddings = outputs.last_hidden_state[:, 0, :].numpy()
        bert_embeddings.append(cls_embeddings)

    return np.vstack(bert_embeddings)

# Generate BERT embeddings for training and testing sets
bert_embeddings_train = encode_text_bert_batch(data.loc[X_train.index, 'Sentence'], tokenizer, bert_model)
bert_embeddings_test = encode_text_bert_batch(data.loc[X_test.index, 'Sentence'], tokenizer, bert_model)

# Combine BERT embeddings with other features
final_train_features = np.hstack([bert_embeddings_train, X_train.values])
final_test_features = np.hstack([bert_embeddings_test, X_test.values])

# Train a Logistic Regression model
model = LogisticRegression(max_iter=20000)
model.fit(final_train_features, y_train)

# Evaluate model
predicted_labels = model.predict(final_test_features)
accuracy = accuracy_score(y_test, predicted_labels)
precision = precision_score(y_test, predicted_labels)
recall = recall_score(y_test, predicted_labels)
f1 = f1_score(y_test, predicted_labels)

print(f"Accuracy: {accuracy:.6f}")
print(f"Precision: {precision:.6f}")
print(f"Recall: {recall:.6f}")
print(f"F1 Score: {f1:.6f}")

# Step 2: Process real data
real_data_file_path = 'REAL_DATA.txt'
lines = []

with open(real_data_file_path, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        columns = line.split('\t')
        if len(columns) == 2:
            lines.append(columns)
        else:
            print(f"Skipping or correcting line {line_number}: {line.strip()}")

real_data = pd.DataFrame(lines, columns=['Label', 'Sentence'])
real_data['Label'] = real_data['Label'].astype(int)
real_data['Cleaned_Sentence'] = real_data['Sentence'].apply(remove_special_characters)

# Extract features for real data
readability_features_real = extract_readability_features(real_data['Sentence'])
pos_features_real = extract_pos_tags(real_data['Sentence'])

features_real_df = pd.DataFrame({
    'Readability_Score': readability_features_real,
    'NN_Count': [feat.get('NN', 0) for feat in pos_features_real],
    'VB_Count': [feat.get('VB', 0) for feat in pos_features_real],
    'JJ_Count': [feat.get('JJ', 0) for feat in pos_features_real]
})

# Generate BERT embeddings for real data
bert_embeddings_real = encode_text_bert_batch(real_data['Sentence'], tokenizer, bert_model)

# Combine BERT embeddings with other features for real data
final_real_features = np.hstack([bert_embeddings_real, features_real_df.values])

# Make predictions for real data
predicted_real_labels = model.predict(final_real_features)

# Replace '2' in the Label column with predictions
real_data['Label'] = predicted_real_labels

# Save modified real data
real_data.to_csv('modified_REAL_DATA.txt', sep='\t', index=False, header=False)

print("Real data file updated with predictions and saved as 'modified_REAL_DATA.txt'.")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. 

Accuracy: 0.529642
Precision: 0.540787
Recall: 0.519452
F1 Score: 0.529905
Real data file updated with predictions and saved as 'modified_REAL_DATA.txt'.


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Evaluate the model
predicted_labels = model.predict(final_test_features)

# Calculate and print evaluation metrics
accuracy = accuracy_score(y_test, predicted_labels)
precision = precision_score(y_test, predicted_labels)
recall = recall_score(y_test, predicted_labels)
f1 = f1_score(y_test, predicted_labels)

print(f"Accuracy: {accuracy:.3f}")
print(f"Precision: {precision:.3f}")
print(f"Recall: {recall:.3f}")
print(f"F1 Score: {f1:.3f}")

In [None]:
# Step 1: Load Real Data
real_data_file_path = '/mnt/data/REAL_DATA.txt'  # Adjust the path as needed
real_data = pd.read_csv(real_data_file_path, sep='\t', header=None, names=['Label', 'Sentence'])

# Step 2: Clean and preprocess sentences (if needed)
real_data['Cleaned_Sentence'] = real_data['Sentence'].apply(remove_special_characters)

# Step 3: Extract features for real data (same as training data)
readability_features_real = extract_readability_features(real_data['Sentence'])
pos_features_real = extract_pos_tags(real_data['Sentence'])

# Create feature DataFrame for real data
features_real_df = pd.DataFrame({
    'Readability_Score': readability_features_real,
    'NN_Count': [feat.get('NN', 0) for feat in pos_features_real],
    'VB_Count': [feat.get('VB', 0) for feat in pos_features_real],
    'JJ_Count': [feat.get('JJ', 0) for feat in pos_features_real]
})

# Step 4: Generate BERT embeddings for real data
bert_embeddings_real = encode_text_bert_batch(real_data['Sentence'], tokenizer, bert_model, batch_size=8)

# Step 5: Combine BERT embeddings with other features for real data
final_real_features = np.hstack([bert_embeddings_real, features_real_df.values])

# Step 6: Make predictions for real data
predicted_real_labels = model.predict(final_real_features)

# Step 7: Replace '2' in the Label column with predictions
real_data['Label'] = predicted_real_labels

# Save the modified real data to a new file
real_data.to_csv('modified_REAL_DATA.txt', sep='\t', index=False, header=False)

print("Real data file updated with predictions and saved as 'modified_REAL_DATA.txt'.")