#### Machine Learning Project: Train Model to predict the author of a phrase
#### Marc McAllister
#### 2024

###### Suggestions are welcome. Thank you.

In [1]:
!pip install tensorflow pandas scikit-learn
!pip install tensorflow
!pip install pandas
!pip install scikit-learn
!pip install nltk
#nltk.download('stopwords')



In [2]:
import tensorflow as tf
from tensorflow.keras import layers, models
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from scipy.sparse import csr_matrix

import seaborn as sns                       #visualisation
import matplotlib.pyplot as plt             #visualisation
import pickle

import pathlib
import os

from nltk.corpus import stopwords



In [3]:
folderpath = "sources"

filenames = list()

for name in os.listdir(folderpath):
    if name.endswith('txt'):
        filenames.append(name)
        
#filenames

In [4]:
# Get and Process Data 

titles = []

df = pd.DataFrame(columns=['Author','Text'])

for fname in filenames:
    samplefilepath = 'sources/' + fname
    sampletext = pathlib.Path(samplefilepath).read_text()

    title = sampletext.split('\n')[0]
    authorname = sampletext.split('\n')[1]

    sampletext = sampletext.replace("\n", " ")
    sampletext_sentences = sampletext.split(".")

    titles.append(title)

    i = len(sampletext_sentences)

    data = {'Author': [authorname]*i,
        'Text': sampletext_sentences}

    dfsub = pd.DataFrame(data, columns=['Author','Text'])
    df = pd.concat([df, dfsub])

#titles
#df

In [5]:
# Start building Model

train_texts, test_texts, train_labels, test_labels = train_test_split(df['Text'], df['Author'], test_size=0.2, random_state=42)

In [6]:
# Tokenize and vectorize text data

vectorizer = CountVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(train_texts)
X_test = vectorizer.transform(test_texts)

In [7]:
# Encode the labels

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_labels)
y_test = label_encoder.transform(test_labels)

y_test

array([1, 2, 3, ..., 3, 2, 1])

In [8]:
# Build the model

model = models.Sequential([
    layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    layers.Dense(32, activation='relu'),
    layers.Dense(len(label_encoder.classes_), activation='softmax')
])

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2024-07-10 20:49:40.649653: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:282] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected


In [9]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [10]:
# Convert sparse matrices

X_train_sparse = tf.convert_to_tensor(csr_matrix(X_train).todense(), dtype=tf.float32)
X_val_sparse = tf.convert_to_tensor(csr_matrix(X_val).todense(), dtype=tf.float32)
X_test_sparse = tf.convert_to_tensor(csr_matrix(X_test).todense(), dtype=tf.float32)

In [11]:
# Train the Model

history = model.fit(X_train_sparse, y_train, epochs=10, batch_size=32, validation_data=(X_val_sparse, y_val))

Epoch 1/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.6369 - loss: 0.9114 - val_accuracy: 0.7956 - val_loss: 0.5328
Epoch 2/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8612 - loss: 0.3629 - val_accuracy: 0.7953 - val_loss: 0.5522
Epoch 3/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.8954 - loss: 0.2656 - val_accuracy: 0.7896 - val_loss: 0.5868
Epoch 4/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9033 - loss: 0.2359 - val_accuracy: 0.7896 - val_loss: 0.6336
Epoch 5/10
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.9159 - loss: 0.2031 - val_accuracy: 0.7882 - val_loss: 0.6739
Epoch 6/10
[1m704/704[0m [3

In [12]:
# Evaluate the model

predictions = model.predict(X_test)
y_pred = predictions.argmax(axis=1)
accuracy = accuracy_score(y_test, y_pred)
print(f'Test Accuracy: {accuracy * 100:.2f}%')

[1m  1/220[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m15s[0m 70ms/step[1m 50/220[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[0m [1m0s[0m 1ms/step  [1m102/220[0m [32m━━━━━━━━━[0m[37m━━━━━━━━━━━[0m [1m0s[0m 1ms/step[1m159/220[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m0s[0m 963us/step[1m214/220[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 953us/step[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step  
Test Accuracy: 76.94%


Suggestions on accuracy?