In [1]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [2]:
!kaggle datasets download -d guru001/movie-genre-prediction

Dataset URL: https://www.kaggle.com/datasets/guru001/movie-genre-prediction
License(s): DbCL-1.0
Downloading movie-genre-prediction.zip to /content
 72% 5.00M/6.94M [00:00<00:00, 46.6MB/s]
100% 6.94M/6.94M [00:00<00:00, 61.3MB/s]


In [3]:
import zipfile
zip_ref = zipfile.ZipFile('movie-genre-prediction.zip', 'r')
zip_ref.extractall()
zip_ref.close()

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

In [30]:
train_data = pd.read_csv('/content/train.csv')
test_data = pd.read_csv('/content/test.csv')
sample_submission = pd.read_csv('/content/sample_submission.csv')
print(train_data.head())
print(test_data.head())
print(sample_submission.head())


      id                                         movie_name  \
0  44978                                           Super Me   
1  50185                                     Entity Project   
2  34131  Behavioral Family Therapy for Serious Psychiat...   
3  78522                                      Blood Glacier   
4   2206                                      Apat na anino   

                                            synopsis    genre  
0  A young scriptwriter starts bringing valuable ...  fantasy  
1  A director and her friends renting a haunted h...   horror  
2  This is an educational video for families and ...   family  
3  Scientists working in the Austrian Alps discov...    scifi  
4  Buy Day - Four Men Widely - Apart in Life - By...   action  
      id          movie_name  \
0  16863    A Death Sentence   
1  48456          Intermedio   
2  41383    30 Chua Phai Tet   
3  84007           Paranoiac   
4  40269  Ordinary Happiness   

                                            

In [31]:
train_data['synopsis'][3]

'Scientists working in the Austrian Alps discover that a glacier is leaking a liquid that appears to be affecting local wildlife.'

In [32]:
print(train_data.shape)
print(test_data.shape)
print(sample_submission.shape)

(54000, 4)
(36000, 4)
(36000, 2)


In [37]:
X_train_texts = train_data[['movie_name','synopsis']]
y_train = train_data['genre']
X_test_texts = test_data[['movie_name','synopsis']]

In [42]:
X_train_combined = X_train_texts['movie_name'] + ' ' + X_train_texts['synopsis']
X_test_combined = X_test_texts['movie_name'] + ' ' + X_test_texts['synopsis']

tf_idf = TfidfVectorizer(stop_words='english', max_features=2000)

X_train = tf_idf.fit_transform(X_train_combined)

X_test = tf_idf.transform(X_test_combined)

In [43]:
print(X_train)

  (0, 1471)	0.29353534677396786
  (0, 1091)	0.29249883330495946
  (0, 1562)	0.378798806012096
  (0, 444)	0.3148953865719784
  (0, 1228)	0.3557654045995397
  (0, 1590)	0.32041095258291213
  (0, 1887)	0.38520981158847256
  (0, 1662)	0.2891229794495027
  (0, 1993)	0.16433166438822286
  (0, 1712)	0.3121185656664266
  (1, 1339)	0.31993090355285425
  (1, 1379)	0.30793410495562995
  (1, 1261)	0.23682318115654416
  (1, 582)	0.2562810429563408
  (1, 1283)	0.3261767331080127
  (1, 233)	0.3324240191558813
  (1, 864)	0.23006002209683163
  (1, 798)	0.26737215444334234
  (1, 705)	0.20559944434534022
  (1, 481)	0.31154877733191527
  (1, 1374)	0.28694985538358286
  (1, 568)	0.34361191436956423
  (2, 427)	0.44140985739069505
  (2, 623)	0.4033142743091302
  (2, 1902)	0.3851647654887759
  :	:
  (53997, 235)	0.21571959737015622
  (53997, 1133)	0.3804447844839613
  (53998, 392)	0.2962018834518496
  (53998, 1840)	0.26174765058356503
  (53998, 565)	0.3307247780144783
  (53998, 1316)	0.26311091967601735
  (53

In [44]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2)

In [45]:
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((43200, 2000), (10800, 2000), (43200,), (10800,))

In [46]:
NB_model = MultinomialNB()
NB_model.fit(X_train, y_train)

y_pred = NB_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Naive Bayes Accuracy:", accuracy)
print(classification_report(y_val, y_pred))

y_pred = NB_model.predict(X_test)
sample_submission['genre'] = y_pred
sample_submission.to_csv('submission_NB.csv', index=False)

Naive Bayes Accuracy: 0.35638888888888887
              precision    recall  f1-score   support

      action       0.31      0.26      0.28      1086
   adventure       0.27      0.23      0.25      1022
       crime       0.35      0.40      0.38      1089
      family       0.39      0.44      0.42      1056
     fantasy       0.31      0.27      0.29      1085
      horror       0.39      0.45      0.42      1077
     mystery       0.33      0.28      0.30      1143
     romance       0.42      0.56      0.48      1079
       scifi       0.43      0.50      0.46      1109
    thriller       0.25      0.14      0.18      1054

    accuracy                           0.36     10800
   macro avg       0.34      0.36      0.35     10800
weighted avg       0.34      0.36      0.35     10800



In [49]:
LR_model = LogisticRegression(max_iter=1000)
LR_model.fit(X_train, y_train)

y_pred = LR_model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print("Logistic Regression Accuracy:", accuracy)
print(classification_report(y_val, y_pred))

y_pred = LR_model.predict(X_test)
sample_submission['genre'] = y_pred
sample_submission.to_csv('submission_LR.csv', index=False)

Logistic Regression Accuracy: 0.3484259259259259
              precision    recall  f1-score   support

      action       0.29      0.26      0.27      1086
   adventure       0.25      0.24      0.25      1022
       crime       0.34      0.35      0.35      1089
      family       0.39      0.46      0.43      1056
     fantasy       0.31      0.27      0.28      1085
      horror       0.39      0.44      0.42      1077
     mystery       0.32      0.29      0.30      1143
     romance       0.45      0.52      0.48      1079
       scifi       0.42      0.48      0.45      1109
    thriller       0.22      0.17      0.19      1054

    accuracy                           0.35     10800
   macro avg       0.34      0.35      0.34     10800
weighted avg       0.34      0.35      0.34     10800



In [48]:
def predict_genre(description):
    """
    Predict the genre of a movie based on its plot description.

    Args:
    description (str): The plot description of the movie.

    Returns:
    dict: A dictionary with predictions from both models.
    """
    # Transform the input description using the trained TF-IDF vectorizer
    description_transformed = tf_idf.transform([description])

    # Predict genre using trained models
    prediction_nb = NB_model.predict(description_transformed)[0]
    prediction_lr = LR_model.predict(description_transformed)[0]

    return {
        'Naive Bayes': prediction_nb,
        'Logistic Regression': prediction_lr
    }

# Example usage
example_description = "A young wizard embarks on a journey to find magical artifacts."
predicted_genres = predict_genre(example_description)
print(predicted_genres)

{'Naive Bayes': 'fantasy', 'Logistic Regression': 'fantasy'}
