In [5]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# -------------------------------
# 1. Load the dataset
# -------------------------------
genres = pd.read_csv("/content/movies_genres.csv")
overview = pd.read_csv("/content/movies_overview.csv")

# Merge datasets using a common column (adjust column names as needed)
data = pd.merge(overview, genres, on="id")   # change key if different

# -------------------------------
# 2. Basic preprocessing
# -------------------------------
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

data["overview"] = data["overview"].apply(clean_text)

# Drop nulls if any
data = data.dropna(subset=["overview", "genre"])

# -------------------------------
# 3. Prepare labels
# -------------------------------
label_encoder = LabelEncoder()
data["genre_encoded"] = label_encoder.fit_transform(data["genre"])
y = to_categorical(data["genre_encoded"])

# -------------------------------
# 4. Tokenization and Padding
# -------------------------------
max_words = 10000  # vocabulary size
max_len = 200      # max sequence length

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data["overview"])
sequences = tokenizer.texts_to_sequences(data["overview"])

X = pad_sequences(sequences, maxlen=max_len)

# -------------------------------
# 5. Train-test split
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -------------------------------
# 6. Build LSTM model
# -------------------------------
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(y.shape[1], activation='softmax'))

# -------------------------------
# 7. Compile model
# -------------------------------
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# -------------------------------
# 8. Train model
# -------------------------------
history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
    verbose=1
)

# -------------------------------
# 9. Evaluate model
# -------------------------------
loss, acc = model.evaluate(X_test, y_test, verbose=1)
print(f"\nTest Accuracy: {acc:.3f}")

# -------------------------------
# 10. Predict genre from new text
# -------------------------------
def predict_genre(text):
    text = clean_text(text)
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len)
    pred = model.predict(padded)
    return label_encoder.inverse_transform([np.argmax(pred)])

# Example prediction
sample_text = "A young wizard discovers his magical heritage on his 11th birthday."
print("Predicted Genre:", predict_genre(sample_text)[0])


KeyError: 'id'

In [4]:
import pandas as pd

genres = pd.read_csv("/content/movies_genres.csv")
overview = pd.read_csv("/content/movies_overview.csv")

print("Genres columns:", genres.columns.tolist())
print("Overview columns:", overview.columns.tolist())


Genres columns: ['id', 'name']
Overview columns: ['title', 'overview', 'genre_ids']


In [8]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

# -------------------------------
# 1. Load the dataset
# -------------------------------
genres = pd.read_csv("/content/movies_genres.csv")
overview = pd.read_csv("/content/movies_overview.csv")

# Convert genre_ids string to Python list
def parse_genre_ids(x):
    try:
        return ast.literal_eval(x) if isinstance(x, str) else []
    except:
        return []

overview["genre_ids"] = overview["genre_ids"].apply(parse_genre_ids)

# Map genre IDs to names
id_to_name = dict(zip(genres["id"], genres["name"]))

def map_genre_names(genre_list):
    return [id_to_name.get(gid, None) for gid in genre_list if gid in id_to_name]

overview["genre_names"] = overview["genre_ids"].apply(map_genre_names)

# Drop rows with empty genre names
overview = overview[overview["genre_names"].map(len) > 0]

# Take the first genre for single-label classification
overview["genre"] = overview["genre_names"].apply(lambda x: x[0])

data = overview[["overview", "genre"]].dropna()

# -------------------------------
# 2. Basic preprocessing
# -------------------------------
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

data["overview"] = data["overview"].apply(clean_text)

# -------------------------------
# 3. Prepare labels
# -------------------------------
label_encoder = LabelEncoder()
data["genre_encoded"] = label_encoder.fit_transform(data["genre"])
y = to_categorical(data["genre_encoded"])

# -------------------------------
# 4. Tokenization and Padding
# -------------------------------
max_words = 10000  # vocabulary size
max_len = 200      # max sequence length

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(data["overview"])
sequences = tokenizer.texts_to_sequences(data["overview"])

X = pad_sequences(sequences, maxlen=max_len)

# -------------------------------
# 5. Train-test split
# -------------------------------
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# -------------------------------
# 6. Build LSTM model
# -------------------------------
model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(y.shape[1], activation='softmax'))

# -------------------------------
# 7. Compile model
# -------------------------------
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# -------------------------------
# 8. Train model
# -------------------------------
history = model.fit(
    X_train, y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
    verbose=1
)

# -------------------------------
# 9. Evaluate model
# -------------------------------
loss, acc = model.evaluate(X_test, y_test, verbose=1)
print(f"\nTest Accuracy: {acc:.3f}")

# -------------------------------
# 10. Predict genre from new text
# -------------------------------
def predict_genre(text):
    text = clean_text(text)
    seq = tokenizer.texts_to_sequences([text])
    padded = pad_sequences(seq, maxlen=max_len)
    pred = model.predict(padded)
    return label_encoder.inverse_transform([np.argmax(pred)])

# Example prediction
sample_text = "After the death of her abusive husband, Matilde finds her new best friend in Miguel, her young, insecure, and disoriented neighbor."
print("Predicted Genre:", predict_genre(sample_text)[0])


Epoch 1/5




[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 701ms/step - accuracy: 0.1960 - loss: 2.6004 - val_accuracy: 0.1813 - val_loss: 2.3565
Epoch 2/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 678ms/step - accuracy: 0.2065 - loss: 2.3707 - val_accuracy: 0.2456 - val_loss: 2.3373
Epoch 3/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m67s[0m 674ms/step - accuracy: 0.2900 - loss: 2.1567 - val_accuracy: 0.3075 - val_loss: 2.2157
Epoch 4/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 697ms/step - accuracy: 0.4398 - loss: 1.7480 - val_accuracy: 0.3169 - val_loss: 2.3407
Epoch 5/5
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 703ms/step - accuracy: 0.5718 - loss: 1.3679 - val_accuracy: 0.3119 - val_loss: 2.5049
[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 97ms/step - accuracy: 0.3109 - loss: 2.4552

Test Accuracy: 0.311
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0