#### Machine Learning Project: Train Model to predict the author of a phrase
#### Marc McAllister
#### 2024

###### Suggestions are welcome. Thank you.

In [None]:
!pip install tensorflow pandas scikit-learn
!pip install tensorflow
!pip install pandas
!pip install scikit-learn
!pip install nltk
nltk.download('stopwords')

In [1]:
import tensorflow as tf
from tensorflow import keras
from keras import layers, models
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from scipy.sparse import csr_matrix

import seaborn as sns                       #visualisation
import matplotlib.pyplot as plt             #visualisation
import pickle

from nltk.tag import pos_tag

import pathlib
import os

from nltk.corpus import stopwords

2024-07-18 17:26:08.203851: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:479] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-18 17:26:08.219441: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:10575] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-18 17:26:08.219465: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1442] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-18 17:26:08.229333: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
folderpath = "sources"

filenames = list()

for name in os.listdir(folderpath):
    if name.endswith('txt'):
        filenames.append(name)
        

In [3]:
# Get and Process Data 

titles = []

df = pd.DataFrame(columns=['Author','Text'])

for fname in filenames:
    samplefilepath = 'sources/' + fname
    sampletext = pathlib.Path(samplefilepath).read_text()

    title = sampletext.split('\n')[0]
    authorname = sampletext.split('\n')[1]

    sampletext = sampletext.replace("\n", " ")
    sampletext_sentences = sampletext.split(".")

    titles.append(title)

    i = len(sampletext_sentences)

    data = {'Author': [authorname]*i,
        'Text': sampletext_sentences}

    dfsub = pd.DataFrame(data, columns=['Author','Text'])
    #dfsub = dfsub[0:1000]      #For Small
    df = pd.concat([df, dfsub])


#df.to_csv('Text_Author.csv', index=False)


In [4]:
# Start building Model

train_texts, test_texts, train_labels, test_labels = train_test_split(df['Text'], df['Author'], test_size=0.2, random_state=42)

In [5]:
# Tokenize and vectorize text data

vectorizer = CountVectorizer()
x_train = vectorizer.fit_transform(train_texts)
x_test = vectorizer.transform(test_texts)

In [6]:
# Encode the labels

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_labels)
y_test = label_encoder.transform(test_labels)


In [7]:
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [8]:
# Convert sparse matrices

x_train_sparse = tf.convert_to_tensor(csr_matrix(x_train).todense(), dtype=tf.float32)
x_val_sparse = tf.convert_to_tensor(csr_matrix(x_val).todense(), dtype=tf.float32)
x_test_sparse = tf.convert_to_tensor(csr_matrix(x_test).todense(), dtype=tf.float32)

2024-07-18 17:26:19.290337: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:282] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [9]:
# Build the model

model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(x_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [10]:
# Train the Model

history = model.fit(x_train_sparse, y_train, epochs=10, batch_size=32, validation_data=(x_val_sparse, y_val))

Epoch 1/10
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.5908 - loss: 1.0636 - val_accuracy: 0.7662 - val_loss: 0.6154
Epoch 2/10
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8334 - loss: 0.4511 - val_accuracy: 0.7743 - val_loss: 0.6099
Epoch 3/10
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8720 - loss: 0.3402 - val_accuracy: 0.7786 - val_loss: 0.6441
Epoch 4/10
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.8999 - loss: 0.2672 - val_accuracy: 0.7773 - val_loss: 0.6919
Epoch 5/10
[1m847/847[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.9162 - loss: 0.2321 - val_accuracy: 0.7694 - val_loss: 0.7553
Epoch 6/10
[1m847/847[0m [3

In [11]:
# Evaluate the model

predictions = model.predict(x_test)
y_pred = predictions.argmax(axis=1)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

[1m265/265[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
Test Accuracy: 76.83%


In [12]:
# Save the model

model.save('tensorflow_detection_model.keras')

In [13]:
# Test the model

test_text = 'ceaselessly into the past'

print(test_text)

tvector = vectorizer.transform([test_text])
tsparse = tf.convert_to_tensor(tvector.todense(), dtype=tf.float32)
pred = model.predict(tsparse)

print(pred)
print(np.argmax(pred))

pred_label_index = np.argmax(pred)
pred_label = label_encoder.classes_[pred_label_index]

print(pred_label)

ceaselessly into the past
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[[2.2101817e-03 9.4676441e-01 5.7903113e-04 5.1355992e-06 5.0441261e-02]]
1
F. Scott Fitzgerald


Suggestions on accuracy?