In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Define a function to load the data
def load_data(file_path, is_train=True):
    """Parse a ' ::: '-delimited text file into a list of row tuples.

    Every line is stripped of surrounding whitespace and split on the
    literal delimiter ' ::: '. Lines that yield fewer fields than required
    are reported on stdout and skipped.

    Args:
        file_path: Path of the text file to read.
        is_train: If True each row must have four fields; otherwise three.

    Returns:
        A list of tuples of strings (4-tuples when ``is_train`` is True,
        3-tuples otherwise), one per accepted line, in file order.
    """
    expected_fields = 4 if is_train else 3
    data = []
    # Explicit encoding so decoding does not depend on the platform locale
    # (assumes the data files are UTF-8 -- TODO confirm against the dataset).
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            stripped = line.strip()
            # Cap the number of splits so that a delimiter occurring inside
            # the last field stays part of that field instead of silently
            # truncating it.
            parts = stripped.split(' ::: ', expected_fields - 1)
            if len(parts) >= expected_fields:
                data.append(tuple(parts))
            else:
                # Note: a line whose last field is empty ends in ' :::' after
                # stripping, so it does not split and lands here as well.
                print(f"Skipping line: {stripped}")  # Warn about lines with insufficient fields
    return data

# Parse the training file (four fields per row) and wrap the rows in a DataFrame
train_data = load_data('train_data.txt')
train_df = pd.DataFrame(data=train_data, columns=['ID', 'Title', 'Genre', 'Description'])

# Parse the test file (three fields per row, no 'Genre' column)
test_data = load_data('test_data.txt', False)
test_df = pd.DataFrame(data=test_data, columns=['ID', 'Title', 'Description'])

# Sanity check: show the first five rows of each frame
print(train_df.head(n=5))
print(test_df.head(n=5))

Skipping line: 11180 ::: Legion of the Black (2012) :::
  ID                             Title     Genre  \
0  1      Oscar et la dame rose (2009)     drama   
1  2                      Cupid (1997)  thriller   
2  3  Young, Wild and Wonderful (1980)     adult   
3  4             The Secret Sin (1915)     drama   
4  5            The Unrecovered (2007)     drama   

                                         Description  
0  Listening in to a conversation between his doc...  
1  A brother and sister with a past incestuous re...  
2  As the bus empties the students for their fiel...  
3  To help their unemployed father make ends meet...  
4  The film's title refers not only to the un-rec...  
  ID                        Title  \
0  1         Edgar's Lunch (1998)   
1  2     La guerra de papá (1977)   
2  3  Off the Beaten Track (2010)   
3  4       Meu Amigo Hindu (2015)   
4  5            Er nu zhai (1955)   

                                         Description  
0  L.R. Brane loves his

In [11]:
# Features are the free-text descriptions; the target is the genre label
X, y = train_df['Description'], train_df['Genre']

# Hold out 20% of the labelled rows for validation (fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# TF-IDF over the 5000 highest-frequency terms, with English stop words removed
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')

# Learn the vocabulary/IDF weights from the training split only, then encode it
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Encode the validation split and the unlabelled test descriptions with the
# already-fitted vectorizer (no refitting)
X_val_tfidf, X_test_tfidf = (
    tfidf_vectorizer.transform(texts)
    for texts in (X_val, test_df['Description'])
)

In [12]:
def _fit_and_validate(model, label):
    """Fit `model` on the TF-IDF training split and score it on validation.

    Prints "<label> Accuracy: " followed by the validation accuracy and
    returns the validation predictions.
    """
    model.fit(X_train_tfidf, y_train)
    predictions = model.predict(X_val_tfidf)
    print(f"{label} Accuracy: ", accuracy_score(y_val, predictions))
    return predictions


# Logistic Regression (iteration cap raised to 1000)
lr_model = LogisticRegression(max_iter=1000)
lr_predictions = _fit_and_validate(lr_model, "Logistic Regression")

# Multinomial Naive Bayes with default settings
nb_model = MultinomialNB()
nb_predictions = _fit_and_validate(nb_model, "Naive Bayes")

# Support Vector Machine with default settings
svm_model = SVC()
svm_predictions = _fit_and_validate(svm_model, "SVM")

Logistic Regression Accuracy:  0.5120751341681574
Naive Bayes Accuracy:  0.462432915921288
SVM Accuracy:  0.4928443649373882


In [13]:
# Per-class precision/recall/F1 for each model on the validation split.
# Several genres are never predicted, so their precision is undefined;
# zero_division=0 reports those as 0.0 (the same value the default shows)
# without emitting an UndefinedMetricWarning for every affected metric.
for model_name, model_predictions in (
    ("Logistic Regression", lr_predictions),
    ("Naive Bayes", nb_predictions),
    ("SVM", svm_predictions),
):
    print(
        f"{model_name} Classification Report:\n",
        classification_report(y_val, model_predictions, zero_division=0),
    )

# Confusion matrices (rows: true genre, columns: predicted genre), printed
# after all reports to keep the original output order.
for model_name, model_predictions in (
    ("Logistic Regression", lr_predictions),
    ("Naive Bayes", nb_predictions),
    ("SVM", svm_predictions),
):
    print(f"{model_name} Confusion Matrix:\n", confusion_matrix(y_val, model_predictions))

Logistic Regression Classification Report:
               precision    recall  f1-score   support

      action       0.50      0.06      0.11        65
       adult       1.00      0.03      0.06        34
   adventure       0.00      0.00      0.00        33
   animation       0.00      0.00      0.00        15
   biography       0.00      0.00      0.00        17
      comedy       0.48      0.50      0.49       313
       crime       0.00      0.00      0.00        16
 documentary       0.58      0.86      0.70       548
       drama       0.45      0.78      0.57       530
      family       0.00      0.00      0.00        33
     fantasy       0.00      0.00      0.00        16
   game-show       1.00      0.12      0.22         8
     history       0.00      0.00      0.00         8
      horror       0.69      0.27      0.39        89
       music       1.00      0.09      0.16        35
     musical       0.00      0.00      0.00         7
     mystery       0.00      0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Naive Bayes Classification Report:
               precision    recall  f1-score   support

      action       1.00      0.02      0.03        65
       adult       0.00      0.00      0.00        34
   adventure       0.00      0.00      0.00        33
   animation       0.00      0.00      0.00        15
   biography       0.00      0.00      0.00        17
      comedy       0.56      0.29      0.39       313
       crime       0.00      0.00      0.00        16
 documentary       0.55      0.88      0.67       548
       drama       0.38      0.85      0.53       530
      family       0.00      0.00      0.00        33
     fantasy       0.00      0.00      0.00        16
   game-show       0.00      0.00      0.00         8
     history       0.00      0.00      0.00         8
      horror       0.25      0.01      0.02        89
       music       0.00      0.00      0.00        35
     musical       0.00      0.00      0.00         7
     mystery       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM Classification Report:
               precision    recall  f1-score   support

      action       0.00      0.00      0.00        65
       adult       1.00      0.03      0.06        34
   adventure       0.33      0.03      0.06        33
   animation       0.00      0.00      0.00        15
   biography       0.00      0.00      0.00        17
      comedy       0.51      0.40      0.45       313
       crime       0.00      0.00      0.00        16
 documentary       0.57      0.86      0.68       548
       drama       0.41      0.83      0.55       530
      family       0.00      0.00      0.00        33
     fantasy       0.00      0.00      0.00        16
   game-show       1.00      0.12      0.22         8
     history       0.00      0.00      0.00         8
      horror       0.78      0.20      0.32        89
       music       1.00      0.06      0.11        35
     musical       0.00      0.00      0.00         7
     mystery       0.00      0.00      0.00         7

In [16]:
# Relies on load_data, tfidf_vectorizer and lr_model defined in earlier cells.

# Rebuild the test frame from disk so this cell starts from a known state
test_data = load_data('test_data.txt', is_train=False)
test_df = pd.DataFrame(test_data, columns=['ID', 'Title', 'Description'])

# Re-encode the descriptions of the frame that was just rebuilt. Reusing the
# X_test_tfidf computed in an earlier cell would tie the predictions to
# whatever test_df existed at that time, with only a length check to catch
# a mismatch.
X_test_tfidf = tfidf_vectorizer.transform(test_df['Description'])

# Logistic Regression is used here, on the assumption that it performed best
# on the validation split
test_predictions = lr_model.predict(X_test_tfidf)

# Check the length of predictions and test data
print("Length of predictions:", len(test_predictions))
print("Number of rows in test data:", len(test_df))

# Save the predictions to a CSV file (only if lengths match)
if len(test_predictions) == len(test_df):
    test_df['predicted_genre'] = test_predictions
    test_df.to_csv('test_data_with_predictions.csv', index=False)
else:
    print("Error: Length of predictions does not match test data. Cannot assign predictions.")

# TODO: comparison of the predictions with the solutions file is not part of this cell

Length of predictions: 45846
Number of rows in test data: 45846
