In [2]:
import pandas as pd
import numpy as np
import nltk 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

In [3]:
col_names = ['ID', 'TITLE', 'GENRE', 'DESCRIPTION']
df = pd.read_csv('Genre Classification Dataset/train_data.txt', sep=':::', names=col_names, engine='python')

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           54214 non-null  int64 
 1   TITLE        54214 non-null  object
 2   GENRE        54214 non-null  object
 3   DESCRIPTION  54214 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.7+ MB


In [5]:
df["DESCRIPTION"] = df["DESCRIPTION"].str.translate(str.maketrans('', '', string.punctuation))

In [6]:
df["DESCRIPTION"] = df["DESCRIPTION"].str.lower()

In [7]:
df["DESCRIPTION"]

0         listening in to a conversation between his do...
1         a brother and sister with a past incestuous r...
2         as the bus empties the students for their fie...
3         to help their unemployed father make ends mee...
4         the films title refers not only to the unreco...
                               ...                        
54209     this shortlived nbc live sitcom centered on b...
54210     the next generation of exploitation the siste...
54211     ze bestaan echt is a standup comedy about gro...
54212     walter and vivian live in the country and hav...
54213     on labor day weekend 1935 the most intense hu...
Name: DESCRIPTION, Length: 54214, dtype: object

In [8]:
stemmer = nltk.stem.PorterStemmer()
def tokenization(text):
    token = nltk.word_tokenize(text)
    a = [stemmer.stem(w) for w in token]
    return " ".join(a)

In [9]:
df["DESCRIPTION"] = df["DESCRIPTION"].apply(lambda x: tokenization(x))

In [10]:
df['GENRE'] = df['GENRE'].str.translate(str.maketrans('', '', string.punctuation))
df['GENRE'] = df['GENRE'].str.lower()
df['GENRE'] = df['GENRE'].apply(lambda x: tokenization(x))

0        listen in to a convers between hi doctor and p...
1        a brother and sister with a past incestu relat...
2        as the bu empti the student for their field tr...
3        to help their unemploy father make end meet ed...
4        the film titl refer not onli to the unrecov bo...
                               ...                        
54209    thi shortliv nbc live sitcom center on bonino ...
54210    the next gener of exploit the sister of kapa b...
54211    ze bestaan echt is a standup comedi about grow...
54212    walter and vivian live in the countri and have...
54213    on labor day weekend 1935 the most intens hurr...
Name: DESCRIPTION, Length: 54214, dtype: object

In [3]:
lemmatizer = WordNetLemmatizer()
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = nltk.word_tokenize(text)
    words = [word for word in words if word not in stop_words]
    
    # Lemmatize the words
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Join the words back into a single string
    text = ' '.join(words)
    
    return text

In [4]:
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')


In [5]:

df['DESCRIPTION'] = df['DESCRIPTION'].apply(preprocess_text)

Text Representation

In [11]:
df_tfidf = df.copy()
df_embedding = df.copy()

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1,2), analyzer="word", stop_words="english")

# Fit and transform the 'DESCRIPTION' column
tfidf_matrix = vectorizer.fit_transform(df_tfidf['DESCRIPTION'])

In [None]:
# import spacy
# # Load the large English model. This model includes word vectors.
# nlp = spacy.load('en_core_web_lg')

# # Create a vector for each movie description
# spacy_matrix = df['DESCRIPTION'].apply(lambda x: nlp(x).vector)


model training

In [13]:
# Load the test data
test_data = pd.read_csv('Genre Classification Dataset/test_data_solution.txt', sep=':::', names=['ID', 'TITLE', 'GENRE','DESCRIPTION'], engine='python')

# Preprocess the 'DESCRIPTION' column
# test_data['DESCRIPTION'] = test_data['DESCRIPTION'].apply(preprocess_text)
test_data["DESCRIPTION"] = test_data["DESCRIPTION"].str.translate(str.maketrans('', '', string.punctuation))
test_data["DESCRIPTION"] = test_data["DESCRIPTION"].str.lower()
test_data["DESCRIPTION"] = test_data["DESCRIPTION"].apply(lambda x: tokenization(x))


# Transform the 'DESCRIPTION' column into TF-IDF vectors
tfidf_matrix_test = vectorizer.transform(test_data['DESCRIPTION'])


In [14]:
test_data

Unnamed: 0,ID,TITLE,GENRE,DESCRIPTION
0,1,Edgar's Lunch (1998),thriller,lr brane love hi life hi car hi apart hi job b...
1,2,La guerra de papá (1977),comedy,spain march 1964 quico is a veri naughti child...
2,3,Off the Beaten Track (2010),documentary,one year in the life of albin and hi famili of...
3,4,Meu Amigo Hindu (2015),drama,hi father ha die he hasnt spoken with hi broth...
4,5,Er nu zhai (1955),drama,befor he wa known intern as a martial art supe...
...,...,...,...,...
54195,54196,"""Tales of Light & Dark"" (2013)",horror,cover multipl genr tale of light dark is an an...
54196,54197,Der letzte Mohikaner (1965),western,as alic and cora munro attempt to find their f...
54197,54198,Oliver Twink (2007),adult,a movi 169 year in the make oliv twist the art...
54198,54199,Slipstream (1973),drama,popular but mysteri rock dj mike mallard askew...


In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

In [17]:
lr = LogisticRegression(max_iter=200)
lr.fit(tfidf_matrix,df_tfidf['GENRE'])

lr_predictions_train = lr.predict(tfidf_matrix)
lr_predictions_test = lr.predict(tfidf_matrix_test)


accuracy = accuracy_score(df_tfidf['GENRE'], lr_predictions_train)
print(f'LR Accuracy: {accuracy}')
accuracy = accuracy_score(test_data['GENRE'], lr_predictions_test)
print(f'LR Accuracy: {accuracy}')
print(classification_report(test_data['GENRE'], lr_predictions_test, zero_division=1))

LR Accuracy: 0.5915261740509832
LR Accuracy: 0.5511623616236162
               precision    recall  f1-score   support

      action        0.38      0.24      0.29      1314
       adult        0.59      0.24      0.34       590
   adventure        0.41      0.12      0.18       775
   animation        0.38      0.08      0.13       498
   biography        1.00      0.00      0.00       264
      comedy        0.49      0.52      0.50      7446
       crime        0.20      0.04      0.07       505
 documentary        0.65      0.82      0.73     13096
       drama        0.52      0.74      0.61     13612
      family        0.37      0.09      0.15       783
     fantasy        0.33      0.03      0.06       322
   game-show        0.78      0.54      0.64       193
     history        0.00      0.00      0.00       243
      horror        0.57      0.51      0.54      2204
       music        0.60      0.47      0.53       731
     musical        0.10      0.01      0.03       276


In [27]:
svm = LinearSVC()
svm.fit(tfidf_matrix, df_tfidf['GENRE'])

# Make predictions on the test set
svm_predictions = svm.predict(tfidf_matrix_test)

# Calculate the accuracy of the predictions
accuracy = accuracy_score(test_data['GENRE'], svm_predictions)
print(f'SVM Accuracy: {accuracy}')
print(classification_report(test_data['GENRE'], svm_predictions, zero_division=1))

SVM Accuracy: 0.5493357933579336
               precision    recall  f1-score   support

      action        0.38      0.21      0.27      1314
       adult        0.46      0.32      0.38       590
   adventure        0.40      0.08      0.14       775
   animation        0.30      0.09      0.13       498
   biography        1.00      0.00      0.00       264
      comedy        0.49      0.50      0.50      7446
       crime        0.23      0.05      0.08       505
 documentary        0.65      0.83      0.73     13096
       drama        0.53      0.73      0.61     13612
      family        0.36      0.08      0.13       783
     fantasy        0.24      0.04      0.07       322
   game-show        0.61      0.60      0.61       193
     history        0.00      0.00      0.00       243
      horror        0.53      0.55      0.54      2204
       music        0.54      0.50      0.52       731
     musical        0.17      0.04      0.06       276
     mystery        0.14      0

In [32]:
svc = SVC(kernel='poly')
svc.fit(tfidf_matrix, df_tfidf['GENRE'])

# Make predictions on the test set
svc_predictions = svc.predict(tfidf_matrix_test)

# Calculate the accuracy of the predictions
accuracy = accuracy_score(test_data['GENRE'], svm_predictions)
print(f'SVM Accuracy: {accuracy}')
print(classification_report(test_data['GENRE'], svm_predictions, zero_division=1))

SVM Accuracy: 0.5443542435424354
               precision    recall  f1-score   support

      action        0.36      0.19      0.25      1314
       adult        0.43      0.29      0.35       590
   adventure        0.33      0.06      0.10       775
   animation        0.23      0.05      0.09       498
   biography        0.00      0.00      0.00       264
      comedy        0.48      0.50      0.49      7446
       crime        0.20      0.04      0.07       505
 documentary        0.64      0.83      0.72     13096
       drama        0.52      0.73      0.61     13612
      family        0.37      0.08      0.12       783
     fantasy        0.20      0.02      0.04       322
   game-show        0.63      0.57      0.60       193
     history        0.00      0.00      0.00       243
      horror        0.52      0.53      0.52      2204
       music        0.55      0.50      0.52       731
     musical        0.13      0.03      0.05       276
     mystery        0.27      0

In [28]:
# Train the model
nb = MultinomialNB()
nb.fit(tfidf_matrix, df_tfidf['GENRE'])

# Make predictions on the test set
nb_predictions = nb.predict(tfidf_matrix_test)

# Calculate the accuracy of the predictions
accuracy = accuracy_score(test_data['GENRE'], nb_predictions)
print(f'Naive Bayes Accuracy: {accuracy}')
print(classification_report(test_data['GENRE'], nb_predictions, zero_division=1))


Naive Bayes Accuracy: 0.4886162361623616
               precision    recall  f1-score   support

      action        0.51      0.05      0.10      1314
       adult        1.00      0.01      0.01       590
   adventure        0.50      0.00      0.01       775
   animation        0.33      0.00      0.00       498
   biography        1.00      0.00      0.00       264
      comedy        0.51      0.32      0.39      7446
       crime        0.33      0.00      0.01       505
 documentary        0.53      0.86      0.66     13096
       drama        0.43      0.81      0.56     13612
      family        0.33      0.00      0.01       783
     fantasy        1.00      0.00      0.00       322
   game-show        0.94      0.34      0.50       193
     history        1.00      0.00      0.00       243
      horror        0.69      0.22      0.33      2204
       music        0.67      0.21      0.32       731
     musical        1.00      0.00      0.00       276
     mystery        1.0

In [33]:
from sklearn.ensemble import RandomForestClassifier


rf = RandomForestClassifier()
rf.fit(tfidf_matrix, df_tfidf['GENRE'])

# Make predictions on the test set
rf_predictions = rf.predict(tfidf_matrix_test)

# Calculate the accuracy of the predictions
accuracy = accuracy_score(test_data['GENRE'], rf_predictions)
print(f'Random Forest Accuracy: {accuracy}')
print(classification_report(test_data['GENRE'], rf_predictions, zero_division=1))

Random Forest Accuracy: 0.48771217712177123
               precision    recall  f1-score   support

      action        0.46      0.01      0.02      1314
       adult        0.53      0.04      0.08       590
   adventure        0.41      0.01      0.02       775
   animation        0.25      0.00      0.00       498
   biography        0.00      0.00      0.00       264
      comedy        0.46      0.32      0.38      7446
       crime        1.00      0.01      0.01       505
 documentary        0.56      0.85      0.67     13096
       drama        0.42      0.80      0.55     13612
      family        0.75      0.02      0.03       783
     fantasy        0.00      0.00      0.00       322
   game-show        0.71      0.50      0.59       193
     history        0.00      0.00      0.00       243
      horror        0.58      0.20      0.29      2204
       music        0.61      0.22      0.32       731
     musical        0.55      0.02      0.04       276
     mystery        

In [32]:
dt = DecisionTreeClassifier()
dt.fit(tfidf_matrix, df_tfidf['GENRE'])

# Make predictions on the test set
dt_predictions = dt.predict(tfidf_matrix_test)

# Calculate the accuracy of the predictions
accuracy = accuracy_score(test_data['GENRE'], dt_predictions)
print(f'Decision Tree Accuracy: {accuracy}')
print(classification_report(test_data['GENRE'], dt_predictions, zero_division=1))

Decision Tree Accuracy: 0.3463468634686347
               precision    recall  f1-score   support

      action        0.10      0.08      0.09      1314
       adult        0.10      0.07      0.08       590
   adventure        0.06      0.05      0.05       775
   animation        0.04      0.02      0.03       498
   biography        0.02      0.01      0.01       264
      comedy        0.29      0.31      0.30      7446
       crime        0.04      0.03      0.04       505
 documentary        0.54      0.58      0.56     13096
       drama        0.40      0.45      0.42     13612
      family        0.07      0.05      0.06       783
     fantasy        0.03      0.02      0.02       322
   game-show        0.40      0.41      0.41       193
     history        0.01      0.01      0.01       243
      horror        0.23      0.21      0.22      2204
       music        0.25      0.20      0.22       731
     musical        0.04      0.04      0.04       276
     mystery        0