In [8]:
import csv
import os
import nltk
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

# Load NLTK resources (stopwords, punkt tokenizer)
nltk.download('stopwords')
nltk.download('punkt')

# Path to your CSV file and column indices for text and category
csv_file_path = 'train_data.csv'
text_column_idx = 3  # Specify the column index for text
category_column_idx = 2  # Specify the column index for category

# Read the CSV file and extract text and categories
documents = []
categories = []

with open(csv_file_path, 'r', newline='', encoding='utf-8') as csvfile:
    csv_reader = csv.reader(csvfile)
    for row in csv_reader:
        text = row[text_column_idx]
        category = row[category_column_idx]
        documents.append(text)
        categories.append(category)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/achakraborty/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/achakraborty/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [9]:
categories[0]

'drama'

In [10]:
import re

# Function to sanitize the filename by removing non-alphanumeric characters
def sanitize_filename(filename):
    # Remove all non-alphanumeric characters (except alphabets and numbers)
    return re.sub(r'[^a-zA-Z0-9]', '', filename)

# Directory to store the corpus files
corpus_dir = '/Users/achakraborty/projects/trying_nlp/corpus'

filename_to_category = {}
# Write the documents to separate text files with their categories as filenames
for idx, (document, category) in enumerate(zip(documents, categories)):
    sanitized_category = sanitize_filename(category)
    filename = f"{sanitized_category}_{idx}.txt"
    file_path = os.path.join(corpus_dir, filename)
    filename_to_category[filename] = category
    with open(file_path, 'w', encoding='utf-8') as file:
        file.write(document)


In [11]:
filename_to_category


{'drama_0.txt': 'drama',
 'thriller_1.txt': 'thriller',
 'adult_2.txt': 'adult',
 'drama_3.txt': 'drama',
 'drama_4.txt': 'drama',
 'documentary_5.txt': 'documentary',
 'comedy_6.txt': 'comedy',
 'crime_7.txt': 'crime',
 'realitytv_8.txt': 'reality-tv',
 'horror_9.txt': 'horror',
 'documentary_10.txt': 'documentary',
 'drama_11.txt': 'drama',
 'documentary_12.txt': 'documentary',
 'thriller_13.txt': 'thriller',
 'drama_14.txt': 'drama',
 'drama_15.txt': 'drama',
 'comedy_16.txt': 'comedy',
 'documentary_17.txt': 'documentary',
 'sport_18.txt': 'sport',
 'animation_19.txt': 'animation',
 'drama_20.txt': 'drama',
 'comedy_21.txt': 'comedy',
 'comedy_22.txt': 'comedy',
 'drama_23.txt': 'drama',
 'action_24.txt': 'action',
 'fantasy_25.txt': 'fantasy',
 'short_26.txt': 'short',
 'scifi_27.txt': 'sci-fi',
 'thriller_28.txt': 'thriller',
 'documentary_29.txt': 'documentary',
 'horror_30.txt': 'horror',
 'documentary_31.txt': 'documentary',
 'action_32.txt': 'action',
 'documentary_33.txt': '

In [12]:
from nltk.corpus.reader.plaintext import PlaintextCorpusReader

# Manually create the PlaintextCorpusReader object
corpus = PlaintextCorpusReader(corpus_dir, r'.*\.txt', encoding='utf-8')

# Manually categorize the files
fileids = corpus.fileids()

categories_for_files = [filename_to_category[os.path.basename(fileid)] for fileid in fileids]

# Now you can use corpus.words(), corpus.categories(), etc.
# For example:
print(categories_for_files)  # Print categories
print(corpus.words(fileids=fileids[0]))  # Print words in the first file

['action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action', 'action',

In [13]:
documents = [(list(corpus.words(fileid)), filename_to_category[fileid])
             for fileid in fileids]

In [14]:
documents

[(['Mirai',
   'a',
   'high',
   'school',
   'girl',
   'is',
   'transferred',
   'to',
   'an',
   'upscale',
   'private',
   'girls',
   'school',
   '.',
   'She',
   'soon',
   'finds',
   'out',
   'the',
   'school',
   'not',
   'only',
   'cultivates',
   'young',
   'minds',
   'but',
   'also',
   'supplies',
   'young',
   'bodies',
   'to',
   'rich',
   'politicians',
   '.',
   'Moreover',
   'the',
   'school',
   "'",
   's',
   'headmaster',
   'ripped',
   'apart',
   'her',
   'family',
   '.',
   'She',
   'undergoes',
   'special',
   'training',
   'and',
   'fights',
   'her',
   'way',
   'to',
   'sweet',
   'revenge',
   '!',
   'Mirai',
   'gets',
   'transferred',
   'to',
   'a',
   'private',
   'all',
   '-',
   'girl',
   "'",
   's',
   'school',
   'where',
   'she',
   'soon',
   'discovers',
   'it',
   'not',
   'only',
   'educates',
   'girls',
   'it',
   'also',
   'pimps',
   'them',
   'out',
   'to',
   'horny',
   'politicians',
   '.',


In [15]:
import random
random.shuffle(documents)

In [16]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [17]:
def preprocess_text(words):
    words = [lemmatizer.lemmatize(word) for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words)

In [34]:
X = [preprocess_text(words) for words, category in documents]
y = [category for words, category in documents]

In [35]:
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
vocab_size = len(tokenizer.word_index) + 1
encoded_texts = tokenizer.texts_to_sequences(X)

label_categories = list(set(y))
label_encoder = {label: index for index, label in enumerate(label_categories)}
encoded_labels = to_categorical([label_encoder[label] for label in y])


In [36]:
from keras.preprocessing.sequence import pad_sequences

max_length = 1000
padded_text = pad_sequences(encoded_texts, maxlen=max_length, padding='post')

In [37]:
padded_text[0]

array([ 3274, 15125,    14,   692,    40,  1343,  1107,   230,  3029,
         109,   360,   178,  1972,   209,   477,    16,  3550,    35,
         230,   134,  1057,  2572,   398,  7810, 18595, 45603,  3274,
         178,    15,   551,   360,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [38]:
encoded_texts[0]

[3274,
 15125,
 14,
 692,
 40,
 1343,
 1107,
 230,
 3029,
 109,
 360,
 178,
 1972,
 209,
 477,
 16,
 3550,
 35,
 230,
 134,
 1057,
 2572,
 398,
 7810,
 18595,
 45603,
 3274,
 178,
 15,
 551,
 360]

In [39]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(padded_text, encoded_labels, test_size=0.2, random_state=42) 

In [40]:
X_test

array([[ 1598,   523,  1254, ...,     0,     0,     0],
       [18957,    88,     3, ...,     0,     0,     0],
       [  776,  1757,   775, ...,     0,     0,     0],
       ...,
       [  518,   141,     4, ...,     0,     0,     0],
       [    5,  7636,    43, ...,     0,     0,     0],
       [   25,   290,   104, ...,     0,     0,     0]], dtype=int32)

In [41]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

embedding_dim = 50
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(units=100))
model.add(Dense(units=len(label_categories), activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [43]:
num_of_epochs = 5
batch_size = 32
model.fit(X_train, y_train, epochs=num_of_epochs, batch_size=batch_size, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x30fb71370>

In [50]:
import numpy as np


preprocessed_text = preprocess_text("With never-before-seen footage, this docuseries follows David Beckham's meteoric rise from humble beginnings to global football stardom.")
input_sequences = tokenizer.texts_to_sequences([preprocessed_text])

input_sequences_padded = pad_sequences(input_sequences, maxlen=max_length, padding='post')

prediction = model.predict(input_sequences_padded)
predicted_sentiment_label = label_categories[np.argmax(prediction)]

prediction



array([[0.21552692, 0.00686872, 0.00805735, 0.01666412, 0.00465406,
        0.04660917, 0.28216437, 0.00634269, 0.00965862, 0.024736  ,
        0.00626062, 0.01236244, 0.00555776, 0.01132023, 0.01171714,
        0.09308979, 0.00375085, 0.00782488, 0.01163792, 0.01764056,
        0.1315897 , 0.00237964, 0.02698374, 0.01619187, 0.00306579,
        0.00394071, 0.01340432]], dtype=float32)

In [51]:
np.argmax(prediction)

6