In [1]:
import pandas as pd

def load_data(train_path, test_path, test_solution_path, description_path):
    train_data = pd.read_csv(train_path, delimiter=':::', header=None, names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])
    print("Training Data Loaded:")
    print(train_data.head())

    test_data = pd.read_csv(test_path, delimiter=':::', header=None, names=['ID', 'TITLE', 'DESCRIPTION'])
    print("\nTesting Data Loaded:")
    print(test_data.head())

    test_solutions = pd.read_csv(test_solution_path, delimiter=':::', header=None, names=['ID', 'TITLE', 'GENRE'])
    print("\nTest Data Solutions Loaded:")
    print(test_solutions.head())

    with open(description_path, 'r') as file:
        description = file.read()
    print("\nDescription File Loaded:")
    print(description[:500])

    return train_data, test_data, test_solutions, description

In [2]:
# Define file paths
train_path = '/content/drive/MyDrive/CODSOFT/train_data.txt'
test_path = '/content/drive/MyDrive/CODSOFT/test_data.txt'
test_solution_path = '/content/drive/MyDrive/CODSOFT/test_data_solution.txt'
description_path = '/content/drive/MyDrive/CODSOFT/description.txt'


In [3]:
# Load the data
train_data, test_data, test_solutions, description = load_data(train_path, test_path, test_solution_path, description_path)

  train_data = pd.read_csv(train_path, delimiter=':::', header=None, names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])


Training Data Loaded:
   ID                               TITLE       GENRE  \
0   1       Oscar et la dame rose (2009)       drama    
1   2                       Cupid (1997)    thriller    
2   3   Young, Wild and Wonderful (1980)       adult    
3   4              The Secret Sin (1915)       drama    
4   5             The Unrecovered (2007)       drama    

                                         DESCRIPTION  
0   Listening in to a conversation between his do...  
1   A brother and sister with a past incestuous r...  
2   As the bus empties the students for their fie...  
3   To help their unemployed father make ends mee...  
4   The film's title refers not only to the un-re...  


  test_data = pd.read_csv(test_path, delimiter=':::', header=None, names=['ID', 'TITLE', 'DESCRIPTION'])



Testing Data Loaded:
   ID                          TITLE  \
0   1          Edgar's Lunch (1998)    
1   2      La guerra de papá (1977)    
2   3   Off the Beaten Track (2010)    
3   4        Meu Amigo Hindu (2015)    
4   5             Er nu zhai (1955)    

                                         DESCRIPTION  
0   L.R. Brane loves his life - his car, his apar...  
1   Spain, March 1964: Quico is a very naughty ch...  
2   One year in the life of Albin and his family ...  
3   His father has died, he hasn't spoken with hi...  
4   Before he was known internationally as a mart...  


  test_solutions = pd.read_csv(test_solution_path, delimiter=':::', header=None, names=['ID', 'TITLE', 'GENRE'])



Test Data Solutions Loaded:
                              ID          TITLE  \
1          Edgar's Lunch (1998)       thriller    
2      La guerra de papá (1977)         comedy    
3   Off the Beaten Track (2010)    documentary    
4        Meu Amigo Hindu (2015)          drama    
5             Er nu zhai (1955)          drama    

                                               GENRE  
1   L.R. Brane loves his life - his car, his apar...  
2   Spain, March 1964: Quico is a very naughty ch...  
3   One year in the life of Albin and his family ...  
4   His father has died, he hasn't spoken with hi...  
5   Before he was known internationally as a mart...  

Description File Loaded:
Train data:
ID ::: TITLE ::: GENRE ::: DESCRIPTION
ID ::: TITLE ::: GENRE ::: DESCRIPTION
ID ::: TITLE ::: GENRE ::: DESCRIPTION
ID ::: TITLE ::: GENRE ::: DESCRIPTION

Test data:
ID ::: TITLE ::: DESCRIPTION
ID ::: TITLE ::: DESCRIPTION
ID ::: TITLE ::: DESCRIPTION
ID ::: TITLE ::: DESCRIPTION

Source:
ftp

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

def preprocess_and_extract_features(train_data, test_data):
    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

    X_train = vectorizer.fit_transform(train_data['DESCRIPTION'])
    X_test = vectorizer.transform(test_data['DESCRIPTION'])

    return X_train, X_test, vectorizer

In [5]:
# Preprocess and extract features
X_train, X_test, vectorizer = preprocess_and_extract_features(train_data, test_data)

In [6]:
from sklearn.naive_bayes import MultinomialNB

def train_model(X_train, y_train):
    classifier = MultinomialNB()
    classifier.fit(X_train, y_train)
    return classifier

In [7]:
# Encode the target variable
y_train = train_data['GENRE']  # Assuming genre is in the 'GENRE' column

In [8]:
# Train the model
classifier = train_model(X_train, y_train)
print("Model Trained Successfully.")

Model Trained Successfully.


In [9]:
def predict_genres(classifier, X_test):
    y_pred = classifier.predict(X_test)
    return y_pred

In [10]:
# Predict genres for the test data
y_pred = predict_genres(classifier, X_test)

In [11]:
# Print the predicted genres
print("\nPredicted Genres:")
print(y_pred)


Predicted Genres:
[' drama ' ' drama ' ' documentary ' ... ' drama ' ' drama '
 ' documentary ']
