MOVIE GENRE CLASSIFICATION

1: Import Required Libraries

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC  # Use LinearSVC for faster training
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

2: Load Data

In [15]:
# 1. Load the training, testing, and solutions data
column_names_train = ["ID", "TITLE", "GENRE", "DESCRIPTION"]
train = pd.read_csv(r'D:\Mustufahussain\CodSoft\Machine Learning\Dataset\Genre Classification Dataset\train_data.txt', sep=':::', names=column_names_train, engine='python')

column_names_test = ["ID", "TITLE", "DESCRIPTION"]
test = pd.read_csv(r'D:\Mustufahussain\CodSoft\Machine Learning\Dataset\Genre Classification Dataset\test_data.txt', sep=':::', names=column_names_test, engine='python')

column_names_solution = ["ID", "TITLE", "GENRE", "DESCRIPTION"]
solutions = pd.read_csv(r'D:\Mustufahussain\CodSoft\Machine Learning\Dataset\Genre Classification Dataset\test_data_solution.txt', sep=':::', names=column_names_solution, engine='python')

3: Data Inspection

In [16]:
# 2. Data Inspection
print("\nMissing Values in Train Data:")
print(train.isna().sum())  # Check for missing values in train data


Missing Values in Train Data:
ID             0
TITLE          0
GENRE          0
DESCRIPTION    0
dtype: int64


In [17]:
print("\nData Types in Train Data:")
print(train.dtypes)  # Check data types


Data Types in Train Data:
ID              int64
TITLE          object
GENRE          object
DESCRIPTION    object
dtype: object


In [18]:
print("\nMissing Values in Test Data:")
print(test.isna().sum())  # Check for missing values in test data


Missing Values in Test Data:
ID             0
TITLE          0
DESCRIPTION    0
dtype: int64


In [19]:
print("\nData Types in Test Data:")
print(test.dtypes)  # Check data types in test data


Data Types in Test Data:
ID              int64
TITLE          object
DESCRIPTION    object
dtype: object


4: Fill Missing Values

In [29]:
# 3. Fill missing values (if any)
print("Handling missing values in datasets...")
train['DESCRIPTION'] = train['DESCRIPTION'].fillna('No description available')
train['GENRE'] = train['GENRE'].fillna('Unknown')
test['DESCRIPTION'] = test['DESCRIPTION'].fillna('No description available')
solutions['GENRE'] = solutions['GENRE'].fillna('Unknown')

# Print the number of missing values after filling
print("\nMissing values after handling:")
print(f"Train Data:\n{train.isna().sum()}")
print(f"\nTest Data:\n{test.isna().sum()}")
print(f"\nSolutions Data:\n{solutions.isna().sum()}")


Handling missing values in datasets...

Missing values after handling:
Train Data:
ID             0
TITLE          0
GENRE          0
DESCRIPTION    0
dtype: int64

Test Data:
ID             0
TITLE          0
DESCRIPTION    0
dtype: int64

Solutions Data:
ID             0
TITLE          0
GENRE          0
DESCRIPTION    0
dtype: int64


5: TF-IDF Vectorization

In [30]:
# 4. Vectorization using TF-IDF
print("Vectorizing text data using TF-IDF...")
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = vectorizer.fit_transform(train['DESCRIPTION'])
X_test_tfidf = vectorizer.transform(test['DESCRIPTION'])

# Print the shape of the TF-IDF matrices
print("\nTF-IDF Vectorization Complete.")
print(f"Shape of Train TF-IDF Matrix: {X_train_tfidf.shape}")
print(f"Shape of Test TF-IDF Matrix: {X_test_tfidf.shape}")


Vectorizing text data using TF-IDF...

TF-IDF Vectorization Complete.
Shape of Train TF-IDF Matrix: (54214, 5000)
Shape of Test TF-IDF Matrix: (54200, 5000)


6: Naive Bayes Model

In [26]:
# 7. Define and train the Naive Bayes model
print("\nTraining Naive Bayes...")
naive_bayes_model = MultinomialNB()
naive_bayes_model.fit(X_train, y_train)

# Predictions
y_pred_nb = naive_bayes_model.predict(X_test)

# Evaluation
accuracy_nb = accuracy_score(y_test, y_pred_nb)
conf_matrix_nb = confusion_matrix(y_test, y_pred_nb)

# Adjust classification report to handle undefined metrics
class_report_nb = classification_report(y_test, y_pred_nb, zero_division=0)

# Display the results
print("\nNaive Bayes Model Evaluation (on Test Split):")
print(f'Accuracy: {accuracy_nb}')
print(f'Confusion Matrix:\n{conf_matrix_nb}')
print(f'Classification Report:\n{class_report_nb}')



Training Naive Bayes...

Naive Bayes Model Evaluation (on Test Split):
Accuracy: 0.52310246241815
Confusion Matrix:
[[  21    0    0    0    0   19    0   51  155    0    0    0    0    8
     0    0    0    0    0    0    0    3    3    0    2    0    1]
 [   0    7    7    0    0   35    0    8   51    0    0    0    0    1
     0    0    0    0    0    0    0    3    0    0    0    0    0]
 [   3    1    4    0    0   15    0   35   71    0    0    0    0    6
     0    0    0    0    0    0    0    2    0    0    1    0    1]
 [   0    0    0    0    0   23    0   37   40    0    0    0    0    1
     0    0    0    0    0    0    0    3    0    0    0    0    0]
 [   0    0    0    0    0    2    0   42   17    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0]
 [   3    0    1    0    0  629    0  150  644    0    0    0    0    6
     1    0    0    0    0    0    0    9    0    0    0    0    0]
 [   1    0    0    0    0    8    0    8  

7: Logistic Regression Model

In [27]:
# 7. Define and train the Logistic Regression model
print("\nTraining Logistic Regression...")
log_reg_model = LogisticRegression(max_iter=1000)
log_reg_model.fit(X_train, y_train)

# Predictions
y_pred_lr = log_reg_model.predict(X_test)

# Evaluation
accuracy_lr = accuracy_score(y_test, y_pred_lr)
conf_matrix_lr = confusion_matrix(y_test, y_pred_lr)

# Adjust classification report to handle undefined metrics
class_report_lr = classification_report(y_test, y_pred_lr, zero_division=0)

# Display the results
print("\nLogistic Regression Model Evaluation (on Test Split):")
print(f'Accuracy: {accuracy_lr}')
print(f'Confusion Matrix:\n{conf_matrix_lr}')
print(f'Classification Report:\n{class_report_lr}')



Training Logistic Regression...

Logistic Regression Model Evaluation (on Test Split):
Accuracy: 0.5795444065295582
Confusion Matrix:
[[  68    0    1    0    0   26    2   32   94    0    0    0    0   12
     0    0    0    0    0    0    5    5    5    0   13    0    0]
 [   0   24   13    0    0   32    0    6   27    0    0    0    0    1
     0    0    0    0    0    0    0    7    0    0    1    0    1]
 [   6    0   19    0    0   20    0   27   37    1    0    0    0   10
     0    0    0    0    1    0    4    9    1    0    2    0    2]
 [   1    0    2    9    0   22    0   17   23    9    1    0    0    2
     1    0    0    0    0    0    6   11    0    0    0    0    0]
 [   0    0    0    0    0    2    0   38   18    0    0    0    0    0
     0    0    0    0    0    0    0    3    0    0    0    0    0]
 [   6    1    1    1    0  845    1   91  414    2    0    0    0   11
     2    0    0    0    7    1    1   49    2    1    4    0    3]
 [   7    0    0    0    

8: SVM Model

In [28]:
# 8. Define and train the SVM model
print("\nTraining Support Vector Machine (SVM)...")
svm_model = LinearSVC(max_iter=1000, C=0.1, dual=False)  # Explicitly set dual to False
svm_model.fit(X_train, y_train)

# Predictions
y_pred_svm = svm_model.predict(X_test)

# Evaluation
accuracy_svm = accuracy_score(y_test, y_pred_svm)
conf_matrix_svm = confusion_matrix(y_test, y_pred_svm)

# Adjust classification report to handle undefined metrics
class_report_svm = classification_report(y_test, y_pred_svm, zero_division=0)

# Display the results
print("\nSVM Model Evaluation (on Test Split):")
print(f'Accuracy: {accuracy_svm}')
print(f'Confusion Matrix:\n{conf_matrix_svm}')
print(f'Classification Report:\n{class_report_svm}')



Training Support Vector Machine (SVM)...

SVM Model Evaluation (on Test Split):
Accuracy: 0.5776998985520613
Confusion Matrix:
[[  59    1    0    0    0   26    2   34   97    0    0    0    0   16
     0    0    0    0    0    0    6    5    6    0    9    0    2]
 [   0   26   14    0    0   29    0    5   28    0    0    0    0    1
     0    0    0    0    1    0    0    7    0    0    0    0    1]
 [   7    1   22    0    0   16    0   29   35    1    0    0    0   10
     0    0    0    0    0    0    6    6    1    0    1    0    4]
 [   1    0    1   11    0   22    0   20   24    7    1    0    0    4
     1    0    0    0    0    0    5    7    0    0    0    0    0]
 [   0    0    0    0    0    3    0   38   17    0    0    0    0    0
     0    0    0    0    0    0    0    2    0    0    0    0    1]
 [   5    2    0    1    0  801    0  105  447    2    0    2    0   17
     5    0    0    0    5    0    2   36    3    1    5    0    4]
 [   6    0    0    0    0   21 

9: Test Data Predictions

In [25]:
# 9. Predictions on the Test Data
X_test_transformed = vectorizer.transform(test['DESCRIPTION'])

# Make predictions with Naive Bayes
naive_bayes_predictions = naive_bayes_model.predict(X_test_transformed)
naive_bayes_predictions_df = pd.DataFrame(naive_bayes_predictions, columns=['genre_prediction'])
print("\nNaive Bayes Predictions on Test Data:")
print(naive_bayes_predictions_df.head())

# Make predictions with Logistic Regression
log_reg_predictions = log_reg_model.predict(X_test_transformed)
log_reg_predictions_df = pd.DataFrame(log_reg_predictions, columns=['genre_prediction'])
print("\nLogistic Regression Predictions on Test Data:")
print(log_reg_predictions_df.head())

# Make predictions with SVM
svm_predictions = svm_model.predict(X_test_transformed)
svm_predictions_df = pd.DataFrame(svm_predictions, columns=['genre_prediction'])
print("\nSVM Predictions on Test Data:")
print(svm_predictions_df.head())


Naive Bayes Predictions on Test Data:
  genre_prediction
0           drama 
1           drama 
2     documentary 
3           drama 
4           drama 

Logistic Regression Predictions on Test Data:
  genre_prediction
0           short 
1           drama 
2     documentary 
3           drama 
4           drama 

SVM Predictions on Test Data:
  genre_prediction
0           drama 
1           drama 
2     documentary 
3           drama 
4           drama 
