In [86]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

In [87]:
# Load the Netflix dataset into a DataFrame
netflix_df = pd.read_csv('../data/netflix-pre-processed.csv')

In [88]:
# Filter the DataFrame to only include movies
movies_df = netflix_df[netflix_df['type'] == 'Movie']

In [89]:
# Replace "[" and "]" characters in the "listed_in" column with empty strings
movies_df['listed_in'] = movies_df['listed_in'].str.replace('[\[\]]', '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies_df['listed_in'] = movies_df['listed_in'].str.replace('[\[\]]', '')


In [90]:
# Split the "listed_in" column into separate rows for each genre
genres_df = movies_df.assign(listed_in=movies_df['listed_in'].str.split(',')).explode('listed_in')

In [91]:
# Define the stopwords for English
nltk.download('stopwords')
stopwords = stopwords.words('english')

[nltk_data] Downloading package stopwords to C:\Users\SQ-
[nltk_data]     PC/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [92]:
# Preprocess the text data
genres_df['description'] = genres_df['description'].str.lower()
genres_df['description'] = genres_df['description'].str.replace('[^\w\s]','')
genres_df['description'] = genres_df['description'].str.replace('\d+', '')
genres_df['description'] = genres_df['description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stopwords)]))


In [93]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(genres_df['description'], genres_df['listed_in'], test_size=0.2, random_state=42)

In [94]:
# Create a TF-IDF representation of the text data
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

In [95]:
# Train a Naive Bayes classifier on the TF-IDF representation
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

In [96]:
# Evaluate the performance of the trained model on the testing set
# y_pred = clf.predict(X_test_counts)
y_pred = clf.predict(X_test_tfidf)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='macro'))
print('Recall:', recall_score(y_test, y_pred, average='macro'))
print('F1 score:', f1_score(y_test, y_pred, average='macro'))

Accuracy: 0.1751990898748578
Precision: 0.05254951936484809
Recall: 0.031622654046354434
F1 score: 0.01786296541172713


  _warn_prf(average, modifier, msg_start, len(result))


In [98]:
# # Use the trained model to predict the genre of new movies based on their descriptions
new_movie_descriptions = ['A young woman becomes the fourth wife of a wealthy lord in 19th century China.',
                          'A young boy befriends a giant robot from outer space and embarks on a series of adventures.']
new_movie_counts = vectorizer.transform(new_movie_descriptions)
new_movie_predictions = clf.predict(new_movie_counts)
print('New movie predictions:', new_movie_predictions)

New movie predictions: [' International Movies' ' International Movies']


In [101]:
# Define the range of values to search for the alpha parameter
param_grid = {'alpha': [0.1, 0.5, 1.0, 2.0, 5.0, 10.0]}

# Use grid search with cross-validation to find the best value for the alpha parameter
clf = MultinomialNB()
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='f1_macro')
grid_search.fit(X_train_tfidf, y_train)

# Print the best value for the alpha parameter found by grid search
print('Best alpha:', grid_search.best_params_['alpha'])

# Train a Naive Bayes classifier on the TF-IDF representation with the best alpha value
clf = MultinomialNB(alpha=grid_search.best_params_['alpha'])
clf.fit(X_train_tfidf, y_train)

# Evaluate the performance of the trained model on the testing set using the TF-IDF representation with the best alpha value
y_pred = clf.predict(X_test_tfidf)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='macro'))
print('Recall:', recall_score(y_test, y_pred, average='macro'))
print('F1 score:', f1_score(y_test, y_pred, average='macro'))



Best alpha: 0.1
Accuracy: 0.10504361016306409
Precision: 0.059529069609868125
Recall: 0.05418680799598454
F1 score: 0.0559461213351196


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [102]:
# # Use the trained model to predict the genre of new movies based on their descriptions
new_movie_descriptions = ['A young woman becomes the fourth wife of a wealthy lord in 19th century China.',
                          'A young boy befriends a giant robot from outer space and embarks on a series of adventures.']
new_movie_counts = vectorizer.transform(new_movie_descriptions)
new_movie_predictions = clf.predict(new_movie_counts)
print('New movie predictions:', new_movie_predictions)

New movie predictions: [' International Movies' 'Children & Family Movies']


In [103]:
# Define the oversampler to use
oversampler = RandomOverSampler(random_state=42)

# Oversample the minority classes in the training set
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train_tfidf, y_train)

# Train a Naive Bayes classifier on the TF-IDF representation with the resampled training set
clf = MultinomialNB(alpha=grid_search.best_params_['alpha'])
clf.fit(X_train_resampled, y_train_resampled)

# Evaluate the performance of the trained model on the testing set using the TF-IDF representation with the resampled training set
y_pred = clf.predict(X_test_tfidf)
print('Accuracy:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred, average='macro'))
print('Recall:', recall_score(y_test, y_pred, average='macro'))
print('F1 score:', f1_score(y_test, y_pred, average='macro'))

Accuracy: 0.10011376564277588
Precision: 0.09546424797536505
Recall: 0.08482729837052373
F1 score: 0.08416162984737445


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [104]:
# # Use the trained model to predict the genre of new movies based on their descriptions
new_movie_descriptions = ['A young woman becomes the fourth wife of a wealthy lord in 19th century China.',
                          'A young boy befriends a giant robot from outer space and embarks on a series of adventures.']
new_movie_counts = vectorizer.transform(new_movie_descriptions)
new_movie_predictions = clf.predict(new_movie_counts)
print('New movie predictions:', new_movie_predictions)

New movie predictions: ['Action & Adventure' 'Children & Family Movies']
