In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import LinearSVC


In [2]:
train_path = "/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/train_data.txt"
train_data = pd.read_csv(train_path, sep=':::', names=['Title', 'Genre', 'Description'], engine='python')
train_data


Unnamed: 0,Title,Genre,Description
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...
54210,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on ...
54211,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The sist...
54212,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about g..."
54213,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and hav...


In [3]:
test_path = "/kaggle/input/genre-classification-dataset-imdb/Genre Classification Dataset/test_data.txt"
test_data = pd.read_csv(test_path, sep=':::', names=['Id', 'Title', 'Description'], engine='python')
test_data


Unnamed: 0,Id,Title,Description
0,1,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,2,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,3,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,4,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,5,Er nu zhai (1955),Before he was known internationally as a mart...
...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Da..."
54196,54197,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their...
54197,54198,Oliver Twink (2007),A movie 169 years in the making. Oliver Twist...
54198,54199,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard..."


In [4]:
train_data.describe()


Unnamed: 0,Title,Genre,Description
count,54214,54214,54214
unique,54214,27,54086
top,Nature's Fury: Storm of the Century (2006),drama,Grammy - music award of the American academy ...
freq,1,13613,12


In [5]:
test_data.describe()


Unnamed: 0,Id
count,54200.0
mean,27100.5
std,15646.336632
min,1.0
25%,13550.75
50%,27100.5
75%,40650.25
max,54200.0


In [6]:
# Ensure necessary NLTK data packages are downloaded
nltk.download('stopwords')
nltk.download('punkt')

# Initialize stemmer and stopwords
stemmer = LancasterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    words = nltk.word_tokenize(text)
    # Remove stop words and stem
    words = [stemmer.stem(word.lower()) for word in words if word.lower() not in stop_words]
    return ' '.join(words)

# Apply preprocessing to the description columns
train_data['Description'] = train_data['Description'].apply(preprocess_text)
test_data['Description'] = test_data['Description'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
# Initialize TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform on train data
X_train = tfidf_vectorizer.fit_transform(train_data['Description'])

# Transform test data
X_test = tfidf_vectorizer.transform(test_data['Description'])

# Extract target variable
y_train = train_data['Genre']


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)


In [9]:
# Initialize and train the SVM classifier
svm_classifier = LinearSVC()
svm_classifier.fit(X_train, y_train)


In [10]:
# Validate the model
y_val_pred = svm_classifier.predict(X_val)
print("SVM Validation Classification Report:\n", classification_report(y_val, y_val_pred))


SVM Validation Classification Report:
                precision    recall  f1-score   support

      action        0.39      0.31      0.35       263
       adult        0.63      0.39      0.48       112
   adventure        0.32      0.18      0.23       139
   animation        0.35      0.18      0.24       104
   biography        0.00      0.00      0.00        61
      comedy        0.51      0.56      0.54      1443
       crime        0.12      0.04      0.06       107
 documentary        0.69      0.80      0.74      2659
       drama        0.55      0.70      0.61      2697
      family        0.34      0.17      0.23       150
     fantasy        0.30      0.04      0.07        74
   game-show        0.81      0.62      0.70        40
     history        0.00      0.00      0.00        45
      horror        0.59      0.61      0.60       431
       music        0.53      0.51      0.52       144
     musical        0.06      0.02      0.03        50
     mystery        0.00 

In [11]:
# Predict genre labels for the test data
y_test_pred = svm_classifier.predict(X_test)

# You can now use y_test_pred for further analysis or submission to Kaggle
print("Predicted Genres for Test Data (sample):", y_test_pred[:10])  # Print first 10 predictions


Predicted Genres for Test Data (sample): [' drama ' ' drama ' ' documentary ' ' drama ' ' action ' ' short '
 ' drama ' ' comedy ' ' documentary ' ' drama ']
