# Importing :)

In [None]:
import pandas as pd
import string

import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report







# Getting the data




In [None]:
# Mount Google Drive at /content/drive so the dataset CSV can be read from it.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the movies table (genres + overview text) from the mounted Drive folder.
movies_data = pd.read_csv(
    filepath_or_buffer="drive/My Drive/movies_genres_and_description.csv"
)

In [None]:
# Peek at the first five rows of the raw table.
movies_data.head(n=5)

# Exploring the data

What do overviews look like?

In [None]:
# Overview text of the row labelled 0.
movies_data.loc[0, "overview"]

In [None]:
# Overview text of the row labelled 1.
movies_data.loc[1, "overview"]

How long are the overviews on average? How long are the longest and the shortest ones?


In [None]:
# Drop every row that has a missing value in any column.
# Rebinding the name (instead of inplace=True) keeps the cell free of hidden
# in-place mutation, so re-running it on an already-cleaned frame is harmless.
movies_data = movies_data.dropna()

In [None]:
# Number of characters in each overview, stored as a new column.
movies_data["overview_length"] = movies_data["overview"].map(len)

In [None]:
# Average overview length (characters).
movies_data.overview_length.mean()

In [None]:
# Shortest overview length (characters).
movies_data.overview_length.min()

In [None]:
# Longest overview length (characters).
movies_data.overview_length.max()

In [None]:
# Histogram of overview lengths. Title and axis labels are set so the figure
# can be understood on its own when the notebook is skimmed.
ax = movies_data.overview_length.plot.hist()
ax.set(
    title="Distribution of overview lengths",
    xlabel="Overview length (characters)",
    ylabel="Number of movies",
)
plt.show()

What are the most frequent words in the overviews? In the overviews of a specific genre?


In [None]:
# Word cloud of the most frequent words in the overviews of drama movies.
text = list(movies_data[movies_data["is_drama"] == True].overview.values)

# Join the overviews into one document. Passing str(text) would hand WordCloud
# the Python list repr (brackets, quotes, backslash escapes) instead of the
# actual text, which pollutes the token counts.
wordcloud = WordCloud(stopwords=STOPWORDS).generate(" ".join(map(str, text)))

plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [None]:
# Raw overview strings of the drama movies; show the first ten.
text = list(
    movies_data.loc[movies_data["is_drama"] == True, "overview"].values
)
text[:10]

# Cleaning & Preprocessing The Data

Let's remove the very short overviews (10 characters or fewer).

In [None]:
# Keep only the rows whose overview is longer than 10 characters.
movies_data = movies_data[movies_data.overview_length > 10]

In [None]:
# Sanity check: the shortest remaining overview should now exceed 10 characters.
movies_data["overview_length"].min()

Let's remove punctuation.




In [None]:
exclude = set(string.punctuation)
# Translation table that deletes every character of string.punctuation in a
# single pass (same character set as `exclude`).
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctoation(row):
  """Return a copy of `row` whose `overview` has all punctuation removed.

  Intended for `DataFrame.apply(..., axis=1)`.

  Args:
    row: a pandas Series with a string `overview` entry.

  Returns:
    A new Series equal to `row` except that every character found in
    `string.punctuation` is stripped from `overview`.
  """
  # Work on a copy: pandas does not support UDFs that mutate the object
  # passed in by apply(), and callers only use the returned row anyway.
  cleaned = row.copy()
  cleaned["overview"] = row["overview"].translate(_PUNCTUATION_TABLE)
  return cleaned

In [None]:
# Strip punctuation from every row's overview (row-wise apply).
movies_data = movies_data.apply(remove_punctoation, axis="columns")

In [None]:
# Check the first five rows after punctuation removal.
movies_data.head(n=5)

Lemmatisation

https://www.geeksforgeeks.org/python-lemmatization-with-nltk/#:~:text=Lemmatization%20is%20the%20process%20of,similar%20meaning%20to%20one%20word.

In [None]:
lemmatizer = WordNetLemmatizer()


def lemmatize_text(row):
  """Return a copy of `row` whose `overview` has every word lemmatized.

  The overview is split on whitespace, each token is passed through
  `lemmatizer.lemmatize`, and the results are re-joined with single spaces.
  Intended for `DataFrame.apply(..., axis=1)`.

  Args:
    row: a pandas Series with a string `overview` entry.

  Returns:
    A new Series equal to `row` except for the lemmatized `overview`.
  """
  # Work on a copy: pandas does not support UDFs that mutate the object
  # passed in by apply(), and callers only use the returned row anyway.
  lemmatized = row.copy()
  # NOTE(review): lemmatize() is called without a part-of-speech argument, so
  # NLTK's default applies — confirm that is the intended behaviour.
  lemmatized["overview"] = " ".join(
      lemmatizer.lemmatize(word) for word in row["overview"].split()
  )
  return lemmatized

In [None]:
# Lemmatize every row's overview (row-wise apply).
movies_data = movies_data.apply(lemmatize_text, axis="columns")

In [None]:
# Check the first five rows after lemmatization.
movies_data.head(n=5)

# Converting the data into vectors

Count vectorizer

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [None]:
# Bag-of-words counts over the 2000 most frequent terms.
# `stop_words` must be 'english', a list, or None: wordcloud's STOPWORDS is a
# set, which recent scikit-learn versions reject at fit time, so pass a list.
# sorted() makes the list order deterministic.
vectorizer = CountVectorizer(
    lowercase=True, stop_words=sorted(STOPWORDS), max_features=2000
)
count_vectors = vectorizer.fit_transform(movies_data.overview)

In [None]:
# Preview only the first rows: densifying the whole sparse matrix just to
# display it allocates (n_documents x n_features) values for no benefit.
count_vectors[:5].toarray()

TFIDF vectorizer

https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html

In [None]:
# TF-IDF weights over the 2000 most frequent terms.
# `stop_words` must be 'english', a list, or None: wordcloud's STOPWORDS is a
# set, which recent scikit-learn versions reject at fit time, so pass a list.
# sorted() makes the list order deterministic.
TFIDF_vectorizer = TfidfVectorizer(
    lowercase=True, stop_words=sorted(STOPWORDS), max_features=2000
)
tfidf_vectors = TFIDF_vectorizer.fit_transform(movies_data.overview)

In [None]:
# Preview only the first rows: densifying the whole sparse matrix just to
# display it allocates (n_documents x n_features) values for no benefit.
tfidf_vectors[:5].toarray()

# Modeling

Split to Train and Test

https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

In [None]:
# Train/test split of the bag-of-words features.
# The original split `tfidf_vectors` here (copy-paste slip), so the "count"
# variables actually held TF-IDF features; use `count_vectors` as the names say.
X_count_train, X_count_test, y_count_train, y_count_test = train_test_split(
    count_vectors, list(movies_data.is_drama), test_size=0.2, random_state=42
)

In [None]:
# Train/test split of the TF-IDF features (80/20, fixed seed).
X_tfidf_train, X_tfidf_test, y_tfidf_train, y_tfidf_test = train_test_split(
    tfidf_vectors,
    list(movies_data["is_drama"]),
    test_size=0.2,
    random_state=42,
)

Multinomial naive bayes

https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html#:~:text=The%20multinomial%20Naive%20Bayes%20classifier,tf%2Didf%20may%20also%20work.

In [None]:
# Fit multinomial naive Bayes on the "count" split and score it on both halves.
model = MultinomialNB().fit(X_count_train, y_count_train)
score_train, score_test = (
    model.score(features, labels)
    for features, labels in (
        (X_count_train, y_count_train),
        (X_count_test, y_count_test),
    )
)
print("\nTrain set score:", score_train)
print("Test set score:", score_test)

In [None]:
# Naive Bayes predictions for the held-out "count" test split.
y_pred = model.predict(X=X_count_test)

In [None]:
# Fit a depth-limited decision tree on the TF-IDF split and score it on both halves.
dt_model = DecisionTreeClassifier(max_depth=15, random_state=0).fit(
    X_tfidf_train, y_tfidf_train
)
score_train, score_test = (
    dt_model.score(features, labels)
    for features, labels in (
        (X_tfidf_train, y_tfidf_train),
        (X_tfidf_test, y_tfidf_test),
    )
)
print("\nTrain set score:", score_train)
print("Test set score:", score_test)

In [None]:
# Decision-tree predictions for the held-out TF-IDF test split.
y_dt_pred = dt_model.predict(X=X_tfidf_test)

# Evaluating

Confusion matrix

https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html

In [None]:
# Confusion matrix of the naive Bayes predictions on the test split.
confusion_matrix(y_true=y_count_test, y_pred=y_pred)

Other metrics

In [None]:
# Precision / recall / F1 of the naive Bayes predictions.
# `y_test` is never defined in this notebook (NameError on a fresh run); the
# labels that match `y_pred` are `y_count_test`.
print(classification_report(y_count_test, y_pred))

# Inference

In [None]:
# Three hand-written examples (see the IMDb links) with their expected labels,
# used to try the trained model on unseen overviews.

# https://www.imdb.com/title/tt6723592/?ref_=hm_inth_tt_i_1
Tenet = "Armed with only one word, Tenet, and fighting for the survival of the entire world, a Protagonist journeys through a twilight world of international espionage on a mission that will unfold in something beyond real time."

# https://www.imdb.com/title/tt7772582/?ref_=hm_inth_tt_i_3
NRSO = "A pair of teenage girls in rural Pennsylvania travel to New York City to seek out medical help after an unintended pregnancy."

# https://www.imdb.com/title/tt9606374/?ref_=hm_inth_tt_i_4
On_the_Rocks = "A young mother reconnects with her larger-than-life playboy father on an adventure through New York."

# One record per movie, in the same column order as before.
inference_l = [
    {'original_title': 'Tenet', 'overview': Tenet, 'is_drama': False},
    {'original_title': 'Never Rarely Sometimes Always', 'overview': NRSO, 'is_drama': True},
    {'original_title': 'On the Rocks', 'overview': On_the_Rocks, 'is_drama': True},
]

inference_ds = pd.DataFrame(inference_l)

In [None]:
# Display the hand-built inference examples before any preprocessing.
inference_ds

In [None]:
# Run the inference examples through the same cleaning steps as the training
# data, then encode them with the already-fitted count vectorizer.
inference_ds = (
    inference_ds
    .apply(remove_punctoation, axis="columns")
    .apply(lemmatize_text, axis="columns")
)
inference_vectors = vectorizer.transform(inference_ds["overview"])

In [None]:
# Predicted is_drama label for each inference example.
model.predict(X=inference_vectors)