In [29]:
import pandas as pd
import torch
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

# Load the dataset
df = pd.read_csv('dataset.csv')

# Display the first few rows of the dataframe
print(df.head())

# Display summary statistics and information about the dataset
print(df.describe())
print(df.info())

cuda
         par_id                                          paragraph  \
0  428209002237  Ramsay was born in Glasgow on 2 October 1852. ...   
1  564218010072  It has been widely estimated for at least the ...   
2  291401001672  He went on to win the Royal Medal of the Royal...   
3   31548004883  The changes have altered many underlying assum...   
4   50634005146  After these novels were published, Disraeli de...   

                        has_entity  lexicon_count  difficult_words  \
0   ORG_YES_PRODUCT_NO_PERSON_YES_             49             12.0   
1    ORG_YES_PRODUCT_NO_PERSON_NO_            166             47.0   
2    ORG_YES_PRODUCT_NO_PERSON_NO_             69             18.0   
3    ORG_NO_PRODUCT_YES_PERSON_NO_             76             27.0   
4  ORG_YES_PRODUCT_YES_PERSON_YES_            200             47.0   

  last_editor_gender                 category      text_clarity  
0                man              biographies      clear_enough  
1                man 

In [30]:
# Fill in missing 'difficult_words' values with the column's mean
df['difficult_words'].fillna(df['difficult_words'].mean(), inplace=True)

# Drop rows where 'category' is missing
df.dropna(subset=['category'], inplace=True)

# Extract binary features from 'has_entity'
df['has_product'] = df['has_entity'].apply(lambda x: 1 if 'PRODUCT_YES' in x else 0)
df['has_organization'] = df['has_entity'].apply(lambda x: 1 if 'ORG_YES' in x else 0)
df['has_person'] = df['has_entity'].apply(lambda x: 1 if 'PERSON_YES' in x else 0)

# Now, let's vectorize the 'paragraph' text
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', max_features=1000)
X_tfidf = tfidf.fit_transform(df['paragraph']).toarray()

# Incorporate 'has_product', 'has_organization', 'has_person', and 'difficult_words' into our features
import numpy as np

additional_features = df[['has_product', 'has_organization', 'has_person', 'difficult_words']].to_numpy()
X = np.hstack((X_tfidf, additional_features))

# Encode the target variable 'category'
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
y = encoder.fit_transform(df['category'])

# Split the dataset into training and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Preprocessing completed. Data is ready for model training.")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['difficult_words'].fillna(df['difficult_words'].mean(), inplace=True)


Preprocessing completed. Data is ready for model training.


In [31]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Initialize the RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Assuming 'accuracy_score' is the accuracy of your model on the test set
misclassification_rate_ml = 1 - accuracy_score(y_test, y_pred)
print(f"Misclassification Rate (Machine Learning Model): {misclassification_rate_ml}")

Confusion Matrix:
 [[  1   0   0   0   1   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0]
 [  0   0   0   0   0   0   0   5   0]
 [  0   0   0   0   0   1   0   0   1]
 [  0   0   0   0 205  38   1  31  28]
 [  0   0   0   0   6 545   0  59  11]
 [  0   0   0   0   0   5  21   2   2]
 [  0   0   0   0  17  46   0 421  11]
 [  0   0   0   0  18  15   0  16 350]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00         2
           5       0.83      0.68      0.75       303
           6       0.84      0.88      0.86       621
           7       0.95      0.70      0.81        30
           8       0.79      0.85      0.82       495
           9       0.87      0.88      0.87       399

    accuracy                           0.83      1858
   macro

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [32]:
from sklearn.ensemble import GradientBoostingClassifier

# Initialize the Gradient Boosting Classifier
gb_classifier = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)

# Train the model
gb_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred_gb = gb_classifier.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_gb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_gb))
print("Accuracy Score:", accuracy_score(y_test, y_pred_gb))

# Assuming 'accuracy_score' is the accuracy of your model on the test set
misclassification_rate_ml = 1 - accuracy_score(y_test, y_pred_gb)
print(f"Misclassification Rate (Machine Learning Model): {misclassification_rate_ml}")

Confusion Matrix:
 [[  1   0   0   0   1   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0]
 [  0   0   0   0   0   0   0   5   0]
 [  0   0   0   0   1   0   0   1   0]
 [  1   1   1   0 214  35   4  37  10]
 [  1   1   0   0   6 561   1  43   8]
 [  0   0   0   0   1   2  27   0   0]
 [  0   1   1   0  10  54   2 423   4]
 [  0   0   1   0  12  25   0  34 327]]

Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.50      0.40         2
           1       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00         2
           5       0.87      0.71      0.78       303
           6       0.83      0.90      0.86       621
           7       0.79      0.90      0.84        30
           8       0.78      0.85      0.82       495
           9       0.94      0.82      0.87       399

    accuracy                           0.84      1858
   macro

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [33]:
from sklearn.model_selection import GridSearchCV

# Parameters grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}

# Initialize Grid Search
grid_search = GridSearchCV(estimator=RandomForestClassifier(random_state=42),
                           param_grid=param_grid,
                           cv=3,  # Number of cross-validation folds
                           verbose=2, n_jobs=-1)

# Fit Grid Search to the data
grid_search.fit(X_train, y_train)

# Best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the best model found by Grid Search on the test set
best_model = grid_search.best_estimator_
y_pred_best = best_model.predict(X_test)
print("Accuracy Score of the Best Model:", accuracy_score(y_test, y_pred_best))
# Assuming 'accuracy_score' is the accuracy of your model on the test set
misclassification_rate_ml = 1 - accuracy_score(y_test, y_pred_best)
print(f"Misclassification Rate (Machine Learning Model): {misclassification_rate_ml}")

Fitting 3 folds for each of 12 candidates, totalling 36 fits




Best Parameters: {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 200}
Best Score: 0.8131394722670975
Accuracy Score of the Best Model: 0.8331539289558665
Misclassification Rate (Machine Learning Model): 0.16684607104413351


In [34]:
import numpy as np

def load_glove_embeddings(path):
    embeddings = {}
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings

# Assuming you've set the correct path to the GloVe embeddings file
glove_embeddings = load_glove_embeddings(r'C:\Users\muhammedazhar\Developer\MSc-DataScience\COMP1804-AML\Coursework\WordEmbeddings\glove.6B.100d.txt')
print(f"Loaded {len(glove_embeddings)} word vectors.")

Loaded 400000 word vectors.


In [35]:
def paragraph_to_vector(paragraph, embeddings, dim=100):
    words = paragraph.split()
    vector = np.zeros(dim)
    num_words = 0
    for word in words:
        if word in embeddings:
            vector += embeddings[word]
            num_words += 1
    if num_words > 0:
        vector /= num_words
    return vector

# Example usage
sample_paragraph = df['paragraph'].iloc[0]
sample_vector = paragraph_to_vector(sample_paragraph, glove_embeddings)
print(f"Vector for the first paragraph: {sample_vector[:5]}...")  # Display the first 5 elements for brevity

Vector for the first paragraph: [ 0.051743   -0.13768024  0.19746544  0.00786557  0.17483612]...


In [36]:
# Transform all paragraphs in the dataset into vectors
X_embeddings = np.array([paragraph_to_vector(p, glove_embeddings) for p in df['paragraph']])

# Display the shape of the new feature matrix to confirm transformation
print("Shape of the transformed feature matrix:", X_embeddings.shape)

Shape of the transformed feature matrix: (9286, 100)


In [37]:
# Update the dataset split to use the new embeddings
X_train_emb, X_test_emb, y_train_emb, y_test_emb = train_test_split(X_embeddings, y, test_size=0.2, random_state=42)

# Train the RandomForestClassifier on the new feature matrix
rf_classifier_emb = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier_emb.fit(X_train_emb, y_train_emb)

# Predict on the test set and evaluate
y_pred_emb = rf_classifier_emb.predict(X_test_emb)
print("Confusion Matrix:\n", confusion_matrix(y_test_emb, y_pred_emb))
print("\nClassification Report:\n", classification_report(y_test_emb, y_pred_emb))
print("Accuracy Score:", accuracy_score(y_test_emb, y_pred_emb))

# Assuming 'accuracy_score' is the accuracy of your model on the test set
misclassification_rate_ml = 1 - accuracy_score(y_test_emb, y_pred_emb)
print(f"Misclassification Rate (Machine Learning Model): {misclassification_rate_ml}")

Confusion Matrix:
 [[  1   0   0   0   1   0   0   0   0]
 [  0   0   0   0   0   1   0   0   0]
 [  0   0   0   0   0   1   0   4   0]
 [  0   0   0   0   0   0   0   1   1]
 [  0   0   0   0 203  25   0  32  43]
 [  0   0   0   0  13 544   0  61   3]
 [  0   0   0   0   2  18   5   1   4]
 [  0   0   0   0  26  41   0 413  15]
 [  0   0   0   0  19  13   0  23 344]]

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         5
           4       0.00      0.00      0.00         2
           5       0.77      0.67      0.72       303
           6       0.85      0.88      0.86       621
           7       1.00      0.17      0.29        30
           8       0.77      0.83      0.80       495
           9       0.84      0.86      0.85       399

    accuracy                           0.81      1858
   macro

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [38]:
import tensorflow as tf
print('TensorFlow vesion:', tf.__version__)
from tensorflow.keras.utils import to_categorical

# Convert labels to one-hot encoding
y_one_hot = to_categorical(y)

# Split the data into training and testing sets
X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(X_embeddings, y_one_hot, test_size=0.2, random_state=42)

TensorFlow vesion: 2.16.1


In [39]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

# Determine the number of features (dimensions of GloVe embeddings)
input_dim = X_train_nn.shape[1]  # Should be 100 if using GloVe 100d embeddings
# Determine the number of output classes
num_classes = y_one_hot.shape[1]

# Build the model
model = Sequential([
    Dense(128, input_dim=input_dim, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# Display the model's architecture
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [40]:
# Train the model
history = model.fit(X_train_nn, y_train_nn,
                    epochs=20,
                    batch_size=32,
                    validation_split=0.1,
                    verbose=2)

test_loss, test_accuracy = model.evaluate(X_test_nn, y_test_nn, verbose=2)
print(f"Test Loss: {test_loss}")
print(f"Test Accuracy: {test_accuracy}")

Epoch 1/20
209/209 - 1s - 6ms/step - accuracy: 0.4865 - loss: 1.3547 - val_accuracy: 0.7106 - val_loss: 0.8316
Epoch 2/20
209/209 - 0s - 1ms/step - accuracy: 0.6811 - loss: 0.9040 - val_accuracy: 0.7456 - val_loss: 0.6867
Epoch 3/20
209/209 - 0s - 1ms/step - accuracy: 0.7266 - loss: 0.7983 - val_accuracy: 0.7739 - val_loss: 0.6166
Epoch 4/20
209/209 - 0s - 1ms/step - accuracy: 0.7556 - loss: 0.7350 - val_accuracy: 0.7820 - val_loss: 0.5949
Epoch 5/20
209/209 - 0s - 1ms/step - accuracy: 0.7617 - loss: 0.7076 - val_accuracy: 0.7847 - val_loss: 0.5730
Epoch 6/20
209/209 - 0s - 1ms/step - accuracy: 0.7761 - loss: 0.6758 - val_accuracy: 0.7941 - val_loss: 0.5646
Epoch 7/20
209/209 - 0s - 1ms/step - accuracy: 0.7934 - loss: 0.6557 - val_accuracy: 0.7873 - val_loss: 0.5550
Epoch 8/20
209/209 - 0s - 1ms/step - accuracy: 0.7949 - loss: 0.6302 - val_accuracy: 0.8062 - val_loss: 0.5389
Epoch 9/20
209/209 - 0s - 1ms/step - accuracy: 0.7967 - loss: 0.6213 - val_accuracy: 0.8008 - val_loss: 0.5310
E

In [41]:
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

# Adjust the model architecture
model = Sequential([
    Dense(256, input_dim=input_dim, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax')
])

# Compile the model with potentially adjusted learning rate
model.compile(loss='categorical_crossentropy',
              optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              metrics=['accuracy'])

# Display the adjusted model's architecture
model.summary()