In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Specify the file path
file_path_test = "unlabelled_test_data.csv"
file_path_training_processed = "training_data_processed.csv"

# Read the CSV file
unlabelled_test_data = pd.read_csv("unlabelled_test_data.csv")
training_data = pd.read_csv("training_data_processed.csv")

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(training_data['sentence'])
y = training_data['difficulty']

In [4]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical

# Assuming the training data is loaded into a DataFrame named 'training_data'
# with columns 'sentence' and 'difficulty'

# Label encoding the 'difficulty' column
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(training_data['difficulty'])

# Converting the encoded labels to categorical format
y_categorical = to_categorical(encoded_labels)

# Using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(training_data['sentence']).toarray()

# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.20, random_state=42)

# Building the Neural Network Model
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dense(64, activation='relu'),
    Dense(y_train.shape[1], activation='softmax')  # Output layer
])

# Compiling the Model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Training the Model
history = model.fit(X_train, y_train, epochs=6, batch_size=16, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print("Accuracy:", accuracy)

del model

Epoch 1/6


2023-11-27 16:49:05.225772: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Accuracy: 0.4489583373069763


Let's add more features!

In [6]:
from sklearn.preprocessing import StandardScaler

X_tfidf = vectorizer.fit_transform(training_data['sentence']).toarray()

scaler = StandardScaler()
additional_features = scaler.fit_transform(training_data[['sentence_length', 'avg_word_length', 'type_token_ratio', 'syntactic_complexity', 'PUNCT', 'ADV', 'CCONJ', 'X', 'AUX', 'DET', 'PRON', 'NUM', 'NOUN', 'INTJ', 'ADP', 'ADJ', 'VERB', 'PROPN', 'SCONJ']])

# Standardize these features
additional_features_scaled = scaler.fit_transform(additional_features)

# Combine with TF-IDF features
X_combined = np.hstack([X_tfidf, additional_features_scaled])

In [7]:
from sklearn.model_selection import KFold
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical
import numpy as np

# Number of folds
num_folds = 5

# K-Fold Cross-Validation
kfold = KFold(n_splits=num_folds, shuffle=True)

# Assuming 'X_combined' is your feature matrix and 'encoded_labels' are your labels
accuracies = []

for train, test in kfold.split(X_combined, encoded_labels):
    # Split data into training and testing sets
    X_train, X_test = X_combined[train], X_combined[test]
    y_train, y_test = encoded_labels[train], encoded_labels[test]

    # Convert labels to categorical
    y_train_categorical = to_categorical(y_train)
    y_test_categorical = to_categorical(y_test)

    # Define the model
    model = Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(64, activation='relu'),
        Dense(y_train_categorical.shape[1], activation='softmax')
    ])

    # Compile the model
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    # Train the model
    model.fit(X_train, y_train_categorical, epochs=6, batch_size=16)

    # Evaluate the model
    loss, accuracy = model.evaluate(X_test, y_test_categorical)
    accuracies.append(accuracy)

# Calculate the average and standard deviation of the accuracies
avg_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)
print(f"Average accuracy: {avg_accuracy}, Standard Deviation: {std_accuracy}")

del model

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6
Average accuracy: 0.4962500035762787, Standard Deviation: 0.006541802613971892


In [15]:
from transformers import CamembertModel, CamembertTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

# Load tokenizer and model for CamemBERT
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertModel.from_pretrained("camembert-base")

# Tokenize and encode sentences in the dataset
inputs = tokenizer(list(training_data['sentence']), padding=True, truncation=True, max_length=128, return_tensors="pt")
input_ids = inputs['input_ids']

# Create a DataLoader for batch processing
batch_size = 8  # Adjust based on your system's capability
dataset = TensorDataset(input_ids)
dataloader = DataLoader(dataset, batch_size=batch_size)

# Generate embeddings in batches
embeddings = []
model.eval()
with torch.no_grad():
    for batch in dataloader:
        input_ids = batch[0]
        outputs = model(input_ids)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1)
        embeddings.append(batch_embeddings)
embeddings = torch.cat(embeddings, dim=0).numpy()

In [16]:
try:
    del model
    #del embeddings
except:
    print("Model and embeddings already deleted")

In [17]:
import gc
gc.collect()

657

In [18]:
# TF-IDF Vectorization
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=500)
X_text_features = vectorizer.fit_transform(training_data['sentence'])

# Linguistic Features
X_linguistic_features = sp.csr_matrix(training_data[['sentence_length', 'avg_word_length', 'type_token_ratio', 'syntactic_complexity', 'PUNCT', 'ADV', 'CCONJ', 'X', 'AUX', 'DET', 'PRON', 'NUM', 'NOUN', 'INTJ', 'ADP', 'ADJ', 'VERB', 'PROPN', 'SCONJ']])

# Combine Features
X_combined = sp.hstack([sp.csr_matrix(embeddings), X_text_features, X_linguistic_features])

#Create Interaction Terms
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_interactions = poly.fit_transform(X_combined.toarray())

#Dimensionality Reduction
pca = PCA(n_components=50)
X_reduced = pca.fit_transform(X_combined.toarray())

# Splitting the data for training and testing
X_train, X_test, y_train, y_test = train_test_split(X_combined, training_data['difficulty'], test_size=0.2)

In [26]:
from tensorflow.keras import backend as K
K.clear_session()

In [27]:
gc.collect()

0

In [39]:
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Encode and convert labels to categorical format
label_encoder = LabelEncoder()
y_train_categorical = to_categorical(label_encoder.fit_transform(y_train))
y_test_categorical = to_categorical(label_encoder.transform(y_test))

# Building and compiling the Neural Network Model with added Dropout
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    Dropout(0.5),  # Dropout layer to prevent overfitting
    Dense(64, activation='relu'),
    Dropout(0.5),  # Another Dropout layer
    Dense(y_train_categorical.shape[1], activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train and evaluate the Model with callbacks
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Early stopping to prevent overfitting
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

# Save the best model
model_checkpoint = ModelCheckpoint('best_model.h5', save_best_only=True)

model.fit(X_train.toarray(), y_train_categorical, epochs=20, batch_size=32, 
          validation_split=0.2, callbacks=[early_stopping, model_checkpoint])

# Evaluate the best model
model.load_weights('best_model.h5')
loss, accuracy = model.evaluate(X_test.toarray(), y_test_categorical)
print(f'Test Accuracy: {accuracy}')


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Test Accuracy: 0.5166666507720947
