In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Step 1: Data preprocessing
def load_data(train_file, test_file, description_file):
    with open(description_file, 'r') as f:
        desc = f.read()

    train_columns = desc.split('\n\n')[0].split('\n')[1].split(' ::: ')
    test_columns = desc.split('\n\n')[1].split('\n')[1].split(' ::: ')

    train_data = pd.read_csv(train_file, sep=':::', header=None, names=train_columns, engine='python')
    test_data = pd.read_csv(test_file, sep=':::', header=None, names=test_columns, engine='python')

    return train_data, test_data

train_data, test_data = load_data('train_data.txt', 'test_data.txt', 'description.txt')



In [2]:
# Step 2: Feature extraction
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train = tfidf.fit_transform(train_data['DESCRIPTION'])
X_test = tfidf.transform(test_data['DESCRIPTION'])
y_train = train_data['GENRE']


In [3]:

# Step 3: Model selection and training
def train_model(model, X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    val_pred = model.predict(X_val)
    val_accuracy = accuracy_score(y_val, val_pred)
    return model, val_accuracy

models = {
    'Naive Bayes': MultinomialNB(),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(kernel='linear')
}

results = {}
for name, model in models.items():
    trained_model, val_accuracy = train_model(model, X_train, y_train)
    results[name] = (trained_model, val_accuracy)

# Select the best model
best_model_name = max(results, key=lambda k: results[k][1])
best_model, best_accuracy = results[best_model_name]

print(f"Best model: {best_model_name}")
print(f"Validation accuracy: {best_accuracy:.4f}")



Best model: SVM
Validation accuracy: 0.5838


In [5]:
# Step 4: Evaluation
y_pred = best_model.predict(X_test)





In [7]:
# Save predictions to a file
test_data['PREDICTED_GENRE'] = y_pred
test_data[['ID', 'PREDICTED_GENRE']].to_csv('test-data-predictions.txt', sep='|', index=False, header=False)


print("Predictions saved to 'test-data-predictions.txt'")

Predictions saved to 'test-data-predictions.txt'


In [12]:
# Load the actual content of test_data_solution.txt
try:
    solution_data_raw = pd.read_csv('test_data_solution.txt', sep=':::', header=None, engine='python')
    print(solution_data_raw.head())
except FileNotFoundError:
    print("Test data solution file not found.")
    solution_data_raw = None

# Check the structure and content of solution_data_raw
if solution_data_raw is not None:
    print(solution_data_raw.head())


   0                              1              2  \
0  1          Edgar's Lunch (1998)       thriller    
1  2      La guerra de papá (1977)         comedy    
2  3   Off the Beaten Track (2010)    documentary    
3  4        Meu Amigo Hindu (2015)          drama    
4  5             Er nu zhai (1955)          drama    

                                                   3  
0   L.R. Brane loves his life - his car, his apar...  
1   Spain, March 1964: Quico is a very naughty ch...  
2   One year in the life of Albin and his family ...  
3   His father has died, he hasn't spoken with hi...  
4   Before he was known internationally as a mart...  
   0                              1              2  \
0  1          Edgar's Lunch (1998)       thriller    
1  2      La guerra de papá (1977)         comedy    
2  3   Off the Beaten Track (2010)    documentary    
3  4        Meu Amigo Hindu (2015)          drama    
4  5             Er nu zhai (1955)          drama    

                    

In [23]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

# Load test data predictions
try:
    test_data = pd.read_csv('test-data-predictions.txt', sep='|', header=None, names=['ID', 'PREDICTED_GENRE'], engine='python')
except FileNotFoundError:
    print("Test data predictions file not found.")
    test_data = None

try:
    # Load solution data and select relevant columns
    solution_data = pd.read_csv('test_data_solution.txt', sep=':::', header=None, engine='python')
    solution_data = solution_data[[0, 2]]  # Select only the ID and GENRE columns
    solution_data.columns = ['ID', 'GENRE']  # Rename columns
except FileNotFoundError:
    print("Test data solution file not found.")
    solution_data = None

if test_data is not None and solution_data is not None:
    # Ensure ID columns are of the same type and strip whitespace
    test_data['ID'] = test_data['ID'].astype(str).str.strip()
    solution_data['ID'] = solution_data['ID'].astype(str).str.strip()

    # Debugging: print the unique IDs from both datasets
    print(f"Unique IDs in test data: {test_data['ID'].unique()[:10]}")
    print(f"Unique IDs in solution data: {solution_data['ID'].unique()[:10]}")

    # Merge the data on ID
    merged_data = pd.merge(test_data, solution_data, on='ID')

    # Check for missing values and mismatched rows
    if merged_data.isnull().values.any():
        print("Warning: Merged data contains missing values.")
        print(merged_data[merged_data.isnull().any(axis=1)])

    # Ensure there are no mismatched genres
    if merged_data.empty:
        print("No matching IDs found between predictions and solution data.")
    else:
        # Calculate and print accuracy
        test_accuracy = accuracy_score(merged_data['GENRE'], merged_data['PREDICTED_GENRE'])
        print(f"Test accuracy: {test_accuracy:.4f}")

        # Print classification report
        print("\nClassification Report:")
        print(classification_report(merged_data['GENRE'], merged_data['PREDICTED_GENRE']))



else:
    print("Unable to calculate test accuracy due to missing files.")


Unique IDs in test data: ['1' '2' '3' '4' '5' '6' '7' '8' '9' '10']
Unique IDs in solution data: ['1' '2' '3' '4' '5' '6' '7' '8' '9' '10']
Test accuracy: 0.5797

Classification Report:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


               precision    recall  f1-score   support

      action        0.43      0.34      0.38      1314
       adult        0.60      0.34      0.43       590
   adventure        0.53      0.22      0.31       775
   animation        0.43      0.11      0.18       498
   biography        0.00      0.00      0.00       264
      comedy        0.51      0.57      0.54      7446
       crime        0.25      0.04      0.07       505
 documentary        0.67      0.83      0.75     13096
       drama        0.54      0.75      0.63     13612
      family        0.41      0.09      0.15       783
     fantasy        0.45      0.06      0.10       322
   game-show        0.86      0.61      0.71       193
     history        1.00      0.00      0.01       243
      horror        0.64      0.59      0.61      2204
       music        0.67      0.47      0.55       731
     musical        0.29      0.04      0.07       276
     mystery        0.43      0.02      0.04       318
        n

  _warn_prf(average, modifier, msg_start, len(result))


In [3]:
import pandas as pd

# Load test data predictions
try:
    test_data = pd.read_csv('test-data-predictions.txt', sep='|', header=None, names=['ID', 'PREDICTED_GENRE'], engine='python')
except FileNotFoundError:
    print("Test data predictions file not found.")
    test_data = None

try:
    # Load solution data and select relevant columns
    solution_data = pd.read_csv('test_data_solution.txt', sep=':::', header=None, engine='python')
    solution_data = solution_data[[0, 1]]  # Select only the ID and GENRE columns
    solution_data.columns = ['ID', 'GENRE']  # Rename columns
except FileNotFoundError:
    print("Test data solution file not found.")
    solution_data = None

if test_data is not None and solution_data is not None:
    # Ensure ID columns are of the same type and strip whitespace
    test_data['ID'] = test_data['ID'].astype(str).str.strip()
    solution_data['ID'] = solution_data['ID'].astype(str).str.strip()

    # Merge with solution_data to get movie names and actual genres
    merged_data = pd.merge(test_data, solution_data, on='ID')

    # Check for missing values and mismatched rows
    if merged_data.isnull().values.any():
        print("Warning: Merged data contains missing values.")
        print(merged_data[merged_data.isnull().any(axis=1)])

    # Ensure there are no mismatched genres
    if merged_data.empty:
        print("No matching IDs found between predictions and solution data.")
    else:
        # Print movie names for each predicted genre
        genres = merged_data['PREDICTED_GENRE'].unique()
        for genre in genres:
            genre_movies = merged_data[merged_data['PREDICTED_GENRE'] == genre]['GENRE'].unique()
            print(f"\nMovies for Predicted Genre '{genre}':")
            for movie in genre_movies:
                print(f"- {movie}")

else:
    print("Unable to process predictions due to missing files.")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
-  The Pixies Sell Out: 2004 Reunion Tour (2005) 
-  Neal and Michaele: The Winter Wonderland Wedding and Music Event (2013) 

Movies for Predicted Genre ' horror ':
-  Kids Get Dead 2: The Kids Get Deader (2014) 
-  The Ouija Experiment 2: Theatre of Death (2015) 
-  The Mummy's Kiss: 2nd Dynasty (2006) 
-  "Killers" (2016) 
-  Back Slash (2005) 
-  Unfinished Business (2016/I) 
-  Dying to Meet You (1997) 
-  The Paraclete (1996) 
-  Fast Zombies with Guns (2009) 
-  La casa nel tempo (1989) 
-  Zombie Armageddon: The Ultimate Collection (2015) 
-  Return of the Boogeyman (1994) 
-  The Retreat (????) 
-  Dead End Demon (2017) 
-  Pandora no tou: Kimi no moto e kaerumade (2011) 
-  Magicians of the Earth: Senis Children (1992) 
-  Coldwood (2012) 
-  The Curse of the Mummy's Tomb (1964) 
-  Shadows of the Moulin Rouge (1913) 
-  Doch (2012) 
-  Dead Mate (1988) 
-  Panzano (2000) 
-  Cagliostro (1975) 
-  Antropophagus 