In [1]:
# Importing Libraries
import pandas as pd  # Reading Data
from sklearn.naive_bayes import MultinomialNB  # The Prediciton Model
from sklearn.feature_extraction.text import CountVectorizer  # The transforming To Vector Tool
from sklearn.pipeline import Pipeline  # Runing Line by line Tool
from sklearn.model_selection import train_test_split  # spiliting tool
from sklearn.metrics import confusion_matrix, accuracy_score  # visualization
import matplotlib.pyplot as plt  # visualization
import seaborn as sn  # visualization
import warnings

warnings.filterwarnings("ignore")

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd  # Reading Data


In [2]:
# Reading the Data
TestData = pd.read_csv("Data/test_data.txt",
                       sep=":::",
                       names=["n", "TITLE", "DESCRIPTION"])
TestData = TestData[["TITLE", "DESCRIPTION"]]
TestDataSol = pd.read_csv("Data/test_data_solution.txt",
                          sep=":::",
                          names=["TITLE", "GENRE", "DESCRIPTION"])
TrainData = pd.read_csv("Data/train_data.txt",
                        sep=":::",
                        names=["TITLE", "GENRE", "DESCRIPTION"])

In [3]:
TestData.head()

Unnamed: 0,TITLE,DESCRIPTION
0,Edgar's Lunch (1998),"L.R. Brane loves his life - his car, his apar..."
1,La guerra de papá (1977),"Spain, March 1964: Quico is a very naughty ch..."
2,Off the Beaten Track (2010),One year in the life of Albin and his family ...
3,Meu Amigo Hindu (2015),"His father has died, he hasn't spoken with hi..."
4,Er nu zhai (1955),Before he was known internationally as a mart...


In [4]:
TestDataSol.head()

Unnamed: 0,TITLE,GENRE,DESCRIPTION
1,Edgar's Lunch (1998),thriller,"L.R. Brane loves his life - his car, his apar..."
2,La guerra de papá (1977),comedy,"Spain, March 1964: Quico is a very naughty ch..."
3,Off the Beaten Track (2010),documentary,One year in the life of Albin and his family ...
4,Meu Amigo Hindu (2015),drama,"His father has died, he hasn't spoken with hi..."
5,Er nu zhai (1955),drama,Before he was known internationally as a mart...


In [5]:
TrainData.head()

Unnamed: 0,TITLE,GENRE,DESCRIPTION
1,Oscar et la dame rose (2009),drama,Listening in to a conversation between his do...
2,Cupid (1997),thriller,A brother and sister with a past incestuous r...
3,"Young, Wild and Wonderful (1980)",adult,As the bus empties the students for their fie...
4,The Secret Sin (1915),drama,To help their unemployed father make ends mee...
5,The Unrecovered (2007),drama,The film's title refers not only to the un-re...


In [6]:
# cleaning The Data
import nltk
from nltk.stem import LancasterStemmer
from nltk.corpus import stopwords
import re
import string
# Initialize stemmer and stopwords
stemmer = LancasterStemmer()
stop_words = set(stopwords.words("english"))


def cleaning_data(text):
    text = text.lower()
    text = re.sub(r'@\S+', '', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'.pic\S+', '', text)
    text = re.sub(r'[^a-zA-Z+]', ' ', text)
    text = "".join([i for i in text if i not in string.punctuation])
    words = nltk.word_tokenize(text)
    text = " ".join([i for i in words if i not in stop_words and len(i) > 2])
    text = re.sub(r"\s+", " ", text).strip()
    return text


TrainData["TextCleaning"] = TrainData["DESCRIPTION"].apply(cleaning_data)
TestData["TextCleaning"] = TestData["DESCRIPTION"].apply(cleaning_data)

In [7]:
# Diffrent Types of Movies
print(len(TrainData.GENRE.unique()))
TrainData.GENRE.unique()

27


array([' drama ', ' thriller ', ' adult ', ' documentary ', ' comedy ',
       ' crime ', ' reality-tv ', ' horror ', ' sport ', ' animation ',
       ' action ', ' fantasy ', ' short ', ' sci-fi ', ' music ',
       ' adventure ', ' talk-show ', ' western ', ' family ', ' mystery ',
       ' history ', ' news ', ' biography ', ' romance ', ' game-show ',
       ' musical ', ' war '], dtype=object)

In [8]:
# Encoding Them
from sklearn.preprocessing import LabelEncoder

GENRElabel = LabelEncoder()
TrainData['GENRE_n'] = GENRElabel.fit_transform(TrainData['GENRE'])

# droping old coulmn
TrainData = TrainData.drop("GENRE", axis=1)

print("Elemnts count : ", 1 + TrainData.GENRE_n.max())

# Displaying the dataframe
TrainData.head()

Elemnts count :  27


Unnamed: 0,TITLE,DESCRIPTION,TextCleaning,GENRE_n
1,Oscar et la dame rose (2009),Listening in to a conversation between his do...,listening conversation doctor parents year old...,8
2,Cupid (1997),A brother and sister with a past incestuous r...,brother sister past incestuous relationship cu...,24
3,"Young, Wild and Wonderful (1980)",As the bus empties the students for their fie...,bus empties students field trip museum natural...,1
4,The Secret Sin (1915),To help their unemployed father make ends mee...,help unemployed father make ends meet edith tw...,8
5,The Unrecovered (2007),The film's title refers not only to the un-re...,film title refers recovered bodies ground zero...,8


In [9]:
TrainData.head()

Unnamed: 0,TITLE,DESCRIPTION,TextCleaning,GENRE_n
1,Oscar et la dame rose (2009),Listening in to a conversation between his do...,listening conversation doctor parents year old...,8
2,Cupid (1997),A brother and sister with a past incestuous r...,brother sister past incestuous relationship cu...,24
3,"Young, Wild and Wonderful (1980)",As the bus empties the students for their fie...,bus empties students field trip museum natural...,1
4,The Secret Sin (1915),To help their unemployed father make ends mee...,help unemployed father make ends meet edith tw...,8
5,The Unrecovered (2007),The film's title refers not only to the un-re...,film title refers recovered bodies ground zero...,8


In [10]:
# Writing Steps in a model to fiting them using Pipeline
clf = Pipeline([('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [11]:
# training the model
clf.fit(TrainData.TextCleaning, TrainData.GENRE_n)

In [12]:
# making prediction of the test
y_pred = clf.predict(TestData.TextCleaning)

y_true = GENRElabel.fit_transform(TestDataSol['GENRE'])

# The model Accuracy
accuracy = accuracy_score(y_true, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 53.75%


In [13]:
# Predicting manual
Descriptions = [
    "Listening in to a conversation between his doctor and parents, 10-year-old Oscar learns what nobody has the courage to tell him. He only has a few weeks to live. Furious, he refuses to speak to anyone except straight-talking Rose, the lady in pink he meets on the hospital stairs. As Christmas approaches, Rose uses her fantastical experiences as a professional wrestler, her imagination, wit and charm to allow Oscar to live life and love to the full, in the company of his friends Pop Corn, Einstein, Bacon and childhood sweetheart Peggy Blue.",
    "In tough economic times Max and Joey have all but run out of ideas until, they discover that senior housing is cheap. Not only that but Max's aunt just kicked the bucket and no one knows yet. In a hilarious series that always keeps you on your toes, the two friends take us on a cross-dressing, desperate and endearing ride through being broke."
]
print("Prediction:", GENRElabel.inverse_transform(clf.predict(Descriptions)))

Prediction: [' drama ' ' comedy ']


In [15]:
# GUI
import ipywidgets as widgets
from IPython.display import display


# Define a function to be called when the button is clicked
def on_button_click(b):
    text_value = text_box.value
    print("Film's GENRE is : ",
          GENRElabel.inverse_transform([int(clf.predict([text_value]))]))


# Create button and text box widgets
button = widgets.Button(description="PREDICT")
text_box = widgets.Text(placeholder="Enter description")
text_box.layout.width = '500px'
text_box.layout.height = '30px'

# Attach the function to the button click event
button.on_click(on_button_click)

# Display the widgets
display(text_box)
display(button)

Text(value='', layout=Layout(height='30px', width='500px'), placeholder='Enter description')

Button(description='PREDICT', style=ButtonStyle())

Film's GENRE is :  [' horror ']
Film's GENRE is :  [' comedy ']
