<a href="https://colab.research.google.com/github/nik2043/Codsoft/blob/main/movie_genre_prediction_codsoft.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 🎬 Movie Genre Prediction using Plot Summary (Text Classification)

# ✅ Objective:
# Build a machine learning model that predicts the **genre** of a movie based on its **plot summary**,
# using TF-IDF features and three classifiers: Naive Bayes, Logistic Regression, and a linear Support Vector Machine (SVM).


In [9]:
# 🧰 Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [1]:
# 📥 Step 1: Load Dataset

# Upload dataset from local system
# NOTE(review): this depends on the `google.colab` package, so the cell is
# expected to run inside Google Colab — confirm before running elsewhere.
from google.colab import files
# Per the cell output, the chosen file is saved under its original name; the
# read cell further down opens the CSV by that same name.
# `uploaded` itself is not referenced again in the visible cells.
uploaded = files.upload()


Saving imdb (1000 movies) in june 2022.csv to imdb (1000 movies) in june 2022.csv


In [None]:
# Read the dataset (pandas is already imported as `pd` in the imports cell).
raw_df = pd.read_csv('imdb (1000 movies) in june 2022.csv')  # Replace with your dataset name
# Inspect the column names
print(raw_df.columns)
# The correct column names are 'genre' and 'DETAIL ABOUT MOVIE\n'
# (the text column's header really does end with a newline character).
# Select the two columns and drop incomplete rows in one non-inplace chain:
# `dropna()` returns a fresh frame, so later column assignments on `df` do not
# operate on a slice of `raw_df` (avoids SettingWithCopyWarning), and
# re-running this cell always rebuilds `df` from the untouched raw frame.
df = raw_df[['genre', 'DETAIL ABOUT MOVIE\n']].dropna()
df.head()

In [11]:
# 📊 Step 2: Encode Labels and Split Data
from sklearn.preprocessing import LabelEncoder

# Map each distinct genre string to an integer code; `le` is kept so the
# codes can be turned back into genre names later.
le = LabelEncoder()
genre_codes = le.fit_transform(df['genre'])
df['genre_encoded'] = genre_codes

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['DETAIL ABOUT MOVIE\n'],
    df['genre_encoded'],
    test_size=0.2,
    random_state=42,
)

In [12]:
# ✨ Step 3: TF-IDF Vectorization
# Vocabulary is capped at 5000 terms and English stop words are removed.
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
)
# Learn the vocabulary/IDF weights from the training text only ...
X_train_tf = tfidf.fit_transform(X_train)
# ... and reuse that fitted vectorizer unchanged for the test text.
X_test_tf = tfidf.transform(X_test)


In [13]:
# 🤖 Step 4: Train Models
# Each estimator is fitted on the TF-IDF training matrix and then used to
# predict the encoded genre for every test document.

# 1. Naive Bayes
nb = MultinomialNB().fit(X_train_tf, y_train)
nb_pred = nb.predict(X_test_tf)

# 2. Logistic Regression (iteration cap raised to 1000)
lr = LogisticRegression(max_iter=1000).fit(X_train_tf, y_train)
lr_pred = lr.predict(X_test_tf)

# 3. Support Vector Machine (linear kernel)
svm = LinearSVC().fit(X_train_tf, y_train)
svm_pred = svm.predict(X_test_tf)


In [None]:
# 📈 Step 5: Evaluate Models
def evaluate_model(name, y_true, y_pred):
    """Print accuracy and a classification report, then plot a confusion matrix.

    Args:
        name: Model label used in the printed header and the plot title.
        y_true: Encoded ground-truth labels.
        y_pred: Encoded predicted labels, aligned element-wise with `y_true`.

    Returns:
        None. Results are printed and the heatmap is shown via matplotlib.

    Relies on the notebook-level label encoder `le` to translate the encoded
    labels back into genre names for the report and the axis ticks.
    """
    print(f"\nModel: {name}")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    # Use every label that occurs in the ground truth OR the predictions.
    # Restricting to the labels of `y_true` alone would silently drop any
    # prediction of a class that is absent from this split, so the confusion
    # matrix would no longer account for every sample.
    labels = np.union1d(y_true, y_pred)
    # Translate the encoded labels back to their genre names.
    target_names = le.inverse_transform(labels)
    # zero_division=0 keeps classes with no true or no predicted samples from
    # raising warnings; their undefined scores are reported as 0.
    print("Classification Report:\n", classification_report(y_true, y_pred, labels=labels, target_names=target_names, zero_division=0))
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    sns.heatmap(cm, annot=True, fmt='d', cmap='Greens', xticklabels=target_names, yticklabels=target_names)
    plt.title(f"Confusion Matrix - {name}")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.show()

# Report every trained model against the same held-out labels, in the order
# the models were trained.
for model_name, predictions in (
    ("Naive Bayes", nb_pred),
    ("Logistic Regression", lr_pred),
    ("SVM", svm_pred),
):
    evaluate_model(model_name, y_test, predictions)

In [None]:
# 🧪 Step 6: Predict Genre from New Plot
def predict_genre(plot):
    """Return the genre name predicted by the SVM model for one plot text.

    Args:
        plot: A single plot description (string).

    Returns:
        The genre label obtained by decoding the model's prediction with `le`.

    Uses the notebook-level fitted objects `tfidf`, `svm` and `le`.
    """
    # The vectorizer expects a collection of documents, hence the 1-item list.
    encoded_prediction = svm.predict(tfidf.transform([plot]))
    # Decode the single predicted code back to its genre string.
    return le.inverse_transform(encoded_prediction)[0]

# Example: classify a previously unseen plot description.
plot = "A brilliant but unorthodox detective investigates a series of brutal murders..."

print("Predicted Genre:", predict_genre(plot))