In [10]:
#importing all necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [11]:
# Load the training set with pandas.
# Each record is laid out as  ID ::: TITLE ::: GENRE ::: DESCRIPTION, so ':::' is passed
# as the separator (engine='python' is requested because the separator is multi-character).
train_columns = ['ID', 'Movie', 'Genre', 'Description']
train = pd.read_csv(
    '/home/harsha/Desktop/ML/Genre Classification Dataset/train_data.txt',
    sep=':::',
    header=None,
    names=train_columns,
    engine='python',
)
# The ID column is not needed for training, so it is discarded.
train = train.drop(columns=['ID'])
display(train)
# Data-quality checks: missing values per column, then the number of duplicated rows.
missing_per_column = train.isna().sum()
duplicate_rows = train.duplicated().sum()
print(missing_per_column)
print(duplicate_rows)

Unnamed: 0,Movie,Genre,Description
0,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
1,Cupid (1997),thriller,A brother and sister with a past incestuous r...
2,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
3,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
4,The Unrecovered (2007),drama,The film's title refers not only to the un-re...
...,...,...,...
54209,"""Bonino"" (1953)",comedy,This short-lived NBC live sitcom centered on ...
54210,Dead Girls Don't Cry (????),horror,The NEXT Generation of EXPLOITATION. The sist...
54211,Ronald Goedemondt: Ze bestaan echt (2008),documentary,"Ze bestaan echt, is a stand-up comedy about g..."
54212,Make Your Own Bed (1944),comedy,Walter and Vivian live in the country and hav...


Movie          0
Genre          0
Description    0
dtype: int64
0


In [12]:
# Inspect the label set: list every distinct genre value present in the training data.
display(train['Genre'].unique())
# DataFrame.info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() only adds a stray "None" to the cell output.
train.info()

array([' drama ', ' thriller ', ' adult ', ' documentary ', ' comedy ',
       ' crime ', ' reality-tv ', ' horror ', ' sport ', ' animation ',
       ' action ', ' fantasy ', ' short ', ' sci-fi ', ' music ',
       ' adventure ', ' talk-show ', ' western ', ' family ', ' mystery ',
       ' history ', ' news ', ' biography ', ' romance ', ' game-show ',
       ' musical ', ' war '], dtype=object)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54214 entries, 0 to 54213
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Movie        54214 non-null  object
 1   Genre        54214 non-null  object
 2   Description  54214 non-null  object
dtypes: object(3)
memory usage: 1.2+ MB


None

In [13]:
# Hold out 20% of the labelled training file for validation.
# Descriptions are the features, genres are the labels; random_state=42 fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    train['Description'],
    train['Genre'],
    test_size=0.2,
    random_state=42,
)
# Report the size of each piece of the split, in the order x_train, x_test, y_train, y_test.
for split_part in (x_train, x_test, y_train, y_test):
    print(split_part.shape)

(43371,)
(10843,)
(43371,)
(10843,)


In [14]:
# CountVectorizer converts the description text into numeric bag-of-words counts.
from sklearn.feature_extraction.text import CountVectorizer
# NLTK's English stop-word list is used to drop very common words before training.
from nltk.corpus import stopwords
import nltk
# stopwords.words() raises LookupError when the corpus has never been downloaded on
# this machine. Fetch it once and retry so the notebook survives Restart & Run All
# on a fresh environment instead of relying on a manual, commented-out download.
try:
    stop_words = stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stop_words = stopwords.words('english')
# max_df=0.8 ignores terms found in more than 80% of descriptions, min_df=2 ignores
# terms found in fewer than 2, and max_features=10000 caps the vocabulary size.
cv=CountVectorizer(stop_words=stop_words,max_df=0.8,min_df=2,max_features=10000)
# sklearn classifiers need numeric features; string labels can be passed as they are.
# The vocabulary is learned from the training split only and then reused, unchanged,
# for the validation split.
x_train_vector=cv.fit_transform(x_train)
x_test_vector=cv.transform(x_test)
display(x_train_vector.shape)
print(stop_words)

(43371, 10000)

['a', 'about', 'above', 'after', 'again', 'against', 'ain', 'all', 'am', 'an', 'and', 'any', 'are', 'aren', "aren't", 'as', 'at', 'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by', 'can', 'couldn', "couldn't", 'd', 'did', 'didn', "didn't", 'do', 'does', 'doesn', "doesn't", 'doing', 'don', "don't", 'down', 'during', 'each', 'few', 'for', 'from', 'further', 'had', 'hadn', "hadn't", 'has', 'hasn', "hasn't", 'have', 'haven', "haven't", 'having', 'he', "he'd", "he'll", 'her', 'here', 'hers', 'herself', "he's", 'him', 'himself', 'his', 'how', 'i', "i'd", 'if', "i'll", "i'm", 'in', 'into', 'is', 'isn', "isn't", 'it', "it'd", "it'll", "it's", 'its', 'itself', "i've", 'just', 'll', 'm', 'ma', 'me', 'mightn', "mightn't", 'more', 'most', 'mustn', "mustn't", 'my', 'myself', 'needn', "needn't", 'no', 'nor', 'not', 'now', 'o', 'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 're', 's', 'same', 'shan', "shan't", 'she

In [15]:
# Alternative featurisation: either the CountVectorizer or a TF-IDF vectorizer can be used to convert text to numbers.
# The TF-IDF variant below is kept disabled inside a string literal: it is not executed,
# the cell merely echoes the string as its output.
'''from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer(stop_words='english',ngram_range=(1,2),lowercase=True)
x_train_vector=tf.fit_transform(x_train)
x_test_vector=tf.transform(x_test)'''

"from sklearn.feature_extraction.text import TfidfVectorizer\ntf=TfidfVectorizer(stop_words='english',ngram_range=(1,2),lowercase=True)\nx_train_vector=tf.fit_transform(x_train)\nx_test_vector=tf.transform(x_test)"

In [16]:
# Logistic regression trained on the bag-of-words counts.
# fit() returns the fitted estimator, so construction and training are chained.
model_LR = LogisticRegression(
    solver='lbfgs',
    n_jobs=-1,
    max_iter=1000,
).fit(x_train_vector, y_train)
y_pred_LR = model_LR.predict(x_test_vector)
# Overall accuracy on the validation split, followed by the per-genre report
# (zero_division=0 reports 0 where a metric is undefined).
lr_accuracy = accuracy_score(y_test, y_pred_LR)
lr_report = classification_report(y_test, y_pred_LR, zero_division=0)
print(lr_accuracy)
print(lr_report)

0.5472655169233607
               precision    recall  f1-score   support

      action        0.38      0.31      0.34       263
       adult        0.70      0.38      0.50       112
   adventure        0.31      0.24      0.27       139
   animation        0.35      0.15      0.21       104
   biography        0.00      0.00      0.00        61
      comedy        0.49      0.56      0.52      1443
       crime        0.29      0.13      0.18       107
 documentary        0.70      0.75      0.72      2659
       drama        0.55      0.65      0.60      2697
      family        0.27      0.19      0.22       150
     fantasy        0.17      0.04      0.07        74
   game-show        0.83      0.60      0.70        40
     history        0.00      0.00      0.00        45
      horror        0.63      0.60      0.62       431
       music        0.56      0.53      0.54       144
     musical        0.33      0.10      0.15        50
     mystery        0.18      0.05      0.08 

In [17]:
# Multinomial naive Bayes trained on the same bag-of-words counts.
# fit() returns the fitted estimator, so construction and training are chained.
model_NB = MultinomialNB().fit(x_train_vector, y_train)
y_pred_NB = model_NB.predict(x_test_vector)
# Overall accuracy on the validation split, followed by the per-genre report
# (zero_division=0 reports 0 where a metric is undefined).
nb_accuracy = accuracy_score(y_test, y_pred_NB)
nb_report = classification_report(y_test, y_pred_NB, zero_division=0)
print(nb_accuracy)
print(nb_report)

0.560638199760214
               precision    recall  f1-score   support

      action        0.36      0.46      0.40       263
       adult        0.59      0.57      0.58       112
   adventure        0.28      0.27      0.27       139
   animation        0.34      0.24      0.28       104
   biography        0.00      0.00      0.00        61
      comedy        0.56      0.55      0.55      1443
       crime        0.17      0.11      0.13       107
 documentary        0.73      0.75      0.74      2659
       drama        0.62      0.61      0.62      2697
      family        0.32      0.21      0.25       150
     fantasy        0.15      0.04      0.06        74
   game-show        0.72      0.65      0.68        40
     history        0.05      0.02      0.03        45
      horror        0.55      0.69      0.61       431
       music        0.34      0.76      0.47       144
     musical        0.16      0.06      0.09        50
     mystery        0.08      0.02      0.03  

In [18]:
# Random forest classifier trained on the bag-of-words counts.
from sklearn.ensemble import RandomForestClassifier
# class_weight='balanced' re-weights the genres to compensate for the class imbalance.
# random_state is fixed (same seed as the train/validation split) because a random
# forest is stochastic: without a seed the scores below change on every run and the
# comparison with the other models is not reproducible.
rf=RandomForestClassifier(n_jobs=-1,class_weight='balanced',random_state=42)
rf.fit(x_train_vector,y_train)
y_pred_rf=rf.predict(x_test_vector)
# Overall accuracy on the validation split, followed by the per-genre report
# (zero_division=0 reports 0 where a metric is undefined).
print(accuracy_score(y_test,y_pred_rf))
print(classification_report(y_test,y_pred_rf,zero_division=0))

0.49912385871068893
               precision    recall  f1-score   support

      action        0.56      0.02      0.04       263
       adult        0.51      0.31      0.39       112
   adventure        0.62      0.14      0.23       139
   animation        0.40      0.08      0.13       104
   biography        0.00      0.00      0.00        61
      comedy        0.50      0.27      0.35      1443
       crime        1.00      0.01      0.02       107
 documentary        0.59      0.82      0.68      2659
       drama        0.42      0.83      0.56      2697
      family        0.80      0.03      0.05       150
     fantasy        0.00      0.00      0.00        74
   game-show        0.74      0.57      0.65        40
     history        0.00      0.00      0.00        45
      horror        0.52      0.26      0.34       431
       music        0.57      0.40      0.47       144
     musical        0.12      0.02      0.03        50
     mystery        0.00      0.00      0.00

In [19]:
# Evaluation on the separately provided test_data and test_data_solution txt files:
# genres are predicted for test_data and compared with test_data_solution['Genre'].
test = pd.read_csv(
    '/home/harsha/Desktop/ML/Genre Classification Dataset/test_data.txt',
    sep=':::',
    header=None,
    names=['ID', 'Movie', 'Description'],
    engine='python',
)
test_solution = pd.read_csv(
    '/home/harsha/Desktop/ML/Genre Classification Dataset/test_data_solution.txt',
    sep=':::',
    header=None,
    names=['ID', 'Movie', 'Genre', 'Description'],
    engine='python',
)
# The ID column is not used for prediction or scoring, so it is dropped from both frames.
test_solution = test_solution.drop(columns=['ID'])
test = test.drop(columns=['ID'])
display(test)
# Reuse the already-fitted vectorizer (transform only) and the logistic regression model.
x_test_pred = cv.transform(test['Description'])
y_pred_test = model_LR.predict(x_test_pred)
# Despite its name, y_pred_solution holds the reference genres read from the solution file.
# NOTE(review): the comparison is positional, so it assumes both files list the movies
# in the same order - confirm.
y_pred_solution = test_solution['Genre']
print(accuracy_score(y_pred_solution, y_pred_test))

Unnamed: 0,Movie,Description
0,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,Er nu zhai (1955),Before he was known internationally as a mart...
...,...,...
54195,"""Tales of Light & Dark"" (2013)","Covering multiple genres, Tales of Light & Da..."
54196,Der letzte Mohikaner (1965),As Alice and Cora Munro attempt to find their...
54197,Oliver Twink (2007),A movie 169 years in the making. Oliver Twist...
54198,Slipstream (1973),"Popular, but mysterious rock D.J Mike Mallard..."


0.5397970479704797


In [20]:
# Side-by-side validation accuracy of the three trained models.
print("Accuracy of all the models ")
for model_label, validation_predictions in (
    ("LogisticRegression : ", y_pred_LR),
    ("Naive Bayes : ", y_pred_NB),
    ("Random Forest : ", y_pred_rf),
):
    print(model_label, accuracy_score(y_test, validation_predictions))

Accuracy of all the models 
LogisticRegression :  0.5472655169233607
Naive Bayes :  0.560638199760214
Random Forest :  0.49912385871068893


In [22]:
# Sanity check: predict genres for a handful of hand-written movie descriptions.
# The Naive Bayes model is used here because it reached the highest validation accuracy
# of the three models in the accuracy comparison.
movie_descriptions = [
    "A retired CIA agent must use all his skills to rescue his daughter who has been kidnapped by a mysterious criminal organization across Europe.",
    "Two college dropouts start a food truck and bumble their way through bizarre events, serving up tacos and laughs in equal measure.",
    "A small-town waitress and a big-city lawyer fall in love while battling a property dispute, only to discover they have a shared past.",
    "In a dystopian future, a hacker discovers a hidden reality where machines have enslaved humanity, and he must lead a rebellion.",
    "A young couple moves into a farmhouse only to uncover a sinister presence in the basement that preys on their fears.",
    "A young orphan discovers a magical world hidden behind a closet door, where creatures speak and ancient prophecies await.",
    "An in-depth look into the lives of honeybees and their crucial role in global agriculture, narrated with stunning visuals.",
    "A group of mischievous penguins escapes the zoo and goes on a hilarious adventure to return to their homeland in Antarctica.",
    "A detective haunted by his past takes on a cold case involving a missing child and uncovers a conspiracy deeper than he imagined.",
    "A young woman from a small town chases her dream of becoming a Broadway star, facing setbacks and finding her voice.",
    "A group of explorers journeys through a newly discovered wormhole near Saturn in search of a new habitable planet, as Earth faces ecological collapse.",
    "A small-town detective investigates a series of mysterious disappearances, uncovering dark secrets that the townspeople desperately want to keep hidden.",
    "A struggling musician forms an unlikely friendship with a retired rock star, leading to a journey of self-discovery and redemption through music."
]
# Vectorise with the already-fitted CountVectorizer (transform only, no refit).
x_movie_descriptions=cv.transform(movie_descriptions)
# Predict one genre label per description with the Naive Bayes model.
y_ver=model_NB.predict(x_movie_descriptions)
# Show the predicted labels; they are in the same order as movie_descriptions.
display(y_ver)

array([' action ', ' comedy ', ' drama ', ' sci-fi ', ' horror ',
       ' fantasy ', ' documentary ', ' animation ', ' thriller ',
       ' drama ', ' sci-fi ', ' horror ', ' drama '], dtype='<U13')