In [4]:
#!pip install keras tensorflow

In [11]:
import numpy as np
import pandas as pd
import regex as re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [None]:
# Load the labelled training set: a tab-separated file without a header row,
# so the column names are supplied here.
train_columns = ['Title', 'Origin', 'Genre', 'Director', 'Plot']
data = pd.read_csv(
    '../datasets/train.txt',
    sep='\t',
    header=None,
    names=train_columns,
)
data.head()

In [None]:
# Words dropped from every plot before vectorization. `preprocess_text`
# lower-cases the text and replaces every non-letter with a space before
# filtering, so only lower-case, letters-only entries can ever match; the
# capitalised / punctuated variants and the duplicates were therefore removed.
# This stays a list because it is also passed to TfidfVectorizer(stop_words=...).
stop_words = [
    "the", "to", "of", "a", "and", "is", "his", "in", "he",
    "that", "her", "with", "by", "for", "him", "as", "who",
    "on", "she", "but", "from", "has", "they", "an", "at", "their", "are",
    "into", "out", "it", "up", "be", "was", "when", "not", "them", "which",
    "then", "after", "about", "where", "one", "have", "tells",
    "back", "will", "while", "all", "two", "had", "been",
    "get", "only", "also", "before", "off", "being", "goes", "takes",
    "this", "other", "take", "tries", "go", "gets", "can", "man", "so",
    "over", "through", "down", "help", "new", "now", "comes", "next", "himself",
    "later", "however", "away", "there", "during", "both", "first", "again", "no", "way", "own",
    "some", "another", "more", "becomes", "make", "does", "what", "begins", "meanwhile", "just",
    "asks", "if", "because", "soon", "having", "its", "eventually", "come", "still", "between", "father",
    "house", "finds",
]

# (pattern, replacement) pairs applied in this order. Order matters: the
# specific forms ("what's", "can't") must run before the generic "'s" / "n't".
_CONTRACTION_RULES = (
    (r"what's", "what is "),
    (r"\'s", " "),
    (r"\'ve", " have "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
)


def preprocess_text(text):
    """Normalise a plot description into a space-separated string of kept words.

    Steps: lower-case, expand common English contractions, collapse
    whitespace, replace every non-letter character with a space, then drop
    every token listed in ``stop_words``.

    Parameters
    ----------
    text : str
        Raw plot text. Any non-string value (for example the NaN pandas uses
        for a missing cell) is treated as an empty plot.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``""`` when nothing is left.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    for pattern, replacement in _CONTRACTION_RULES:
        text = re.sub(pattern, replacement, text)
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"[^a-zA-Z]", " ", text)

    # Built per call so later edits to `stop_words` are still honoured, while
    # each token lookup is O(1) instead of a scan over the list.
    stop_set = set(stop_words)
    return " ".join(word for word in text.split() if word not in stop_set)

# Clean every training plot and keep the result next to the raw text.
clean_plots = data['Plot'].map(preprocess_text)
data['Clean_Plot'] = clean_plots

In [None]:
# Preview the cleaned plots. A notebook cell only auto-displays its final
# expression, so the head is printed explicitly and the shape is left last.

print(data['Clean_Plot'].head())
data['Clean_Plot'].shape

## TF-IDF will give us the document vectors (embeddings)

In [None]:
# Hold out a stratified 20% of the labelled rows for evaluation.
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=data['Genre'],
)
train_data, test_data = (
    frame.reset_index(drop=True) for frame in (train_data, test_data)
)

# The encoder is fitted on every genre in the full data set, then applied to each split.
label_encoder = LabelEncoder().fit(data['Genre'])
y_train, y_test = (
    label_encoder.transform(frame['Genre']) for frame in (train_data, test_data)
)

# TF-IDF vectorization: the vocabulary is learned from the training split only.
tfidf = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 3), min_df=20)
X_train = tfidf.fit_transform(train_data['Clean_Plot'])
X_test = tfidf.transform(test_data['Clean_Plot'])



In [None]:
from sklearn.naive_bayes import MultinomialNB

# A small alpha (light additive smoothing) gave much better results here than
# heavier smoothing.
nb_classifier = MultinomialNB(alpha=0.1)
nb_classifier.fit(X_train, y_train)

y_pred = nb_classifier.predict(X_test)

# Pin the label set explicitly so the report rows and the confusion-matrix
# axes always line up with label_encoder.classes_, even if a genre happens to
# be absent from both y_test and y_pred.
class_names = label_encoder.classes_
class_ids = np.arange(len(class_names))

# Report once, with readable genre names (the unnamed duplicate was removed).
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n",
      classification_report(y_test, y_pred, labels=class_ids, target_names=class_names))

conf_matrix = confusion_matrix(y_test, y_pred, labels=class_ids)
plt.figure(figsize=(10, 8))
sns.heatmap(conf_matrix, annot=True, fmt='d', xticklabels=class_names, yticklabels=class_names)
plt.title('Confusion matrix (held-out split)')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

## Let's analyze the mispredictions

In [None]:
# Decode the integer labels back into genre names for readability.
y_test_decoded = label_encoder.inverse_transform(y_test)
y_pred_decoded = label_encoder.inverse_transform(y_pred)

# Row positions in the held-out split where the prediction differs from the truth.
misclassified_indices = np.where(y_pred != y_test)[0]

mispredictions_df = pd.DataFrame({
    'Plot': test_data['Plot'].iloc[misclassified_indices].values,
    'Clean Plot': test_data['Clean_Plot'].iloc[misclassified_indices].values,
    'True Genre': y_test_decoded[misclassified_indices],
    'Predicted Genre': y_pred_decoded[misclassified_indices]
})

# writes them to a file
mispredictions_df.to_csv('mispredictions.csv', index=False)

# displays a sample of them -- kept as the cell's final expression, because a
# notebook only renders the last expression of a cell
print("\nSample mispredictions:")
mispredictions_df.head()

## Testing Time (these won't be the submitted predictions)

In [None]:
# Load the unlabelled test set: tab-separated, no header row, and no 'Genre' column.
test_columns = ['Title', 'Origin', 'Director', 'Plot']
test_data = pd.read_csv(
    '../datasets/test_no_labels.txt',
    sep='\t',
    header=None,
    names=test_columns,
)

test_data.head()

In [None]:
# Clean the test plots with the very same function that cleaned the training plots.
cleaned_test_plots = test_data['Plot'].map(preprocess_text)
test_data['Clean_Plot'] = cleaned_test_plots
test_data['Clean_Plot'].head()

In [None]:
# Transform the test data using the same, already fitted TfidfVectorizer.
# The matrix is kept sparse: the classifier was fitted on, and already predicts
# from, the vectorizer's sparse output, so densifying with .toarray() would
# only cost memory.
# NOTE: this rebinds X_test and y_pred, which the evaluation cells above use
# for the held-out split -- re-run those cells from the split onwards before
# re-evaluating.
X_test = tfidf.transform(test_data['Clean_Plot'])

# predict the genres of the test data and decode them back into genre names
y_pred = nb_classifier.predict(X_test)
y_pred = label_encoder.inverse_transform(y_pred)
y_pred[:5]

In [None]:
# Save the predictions: the file holds exactly one predicted genre per line and nothing else.

with open('predictions.txt', 'w') as out_file:
    out_file.writelines(genre + '\n' for genre in y_pred)