In [None]:
# Put the parent directory first on the module search path; the `src` import
# further down relies on it (presumably the notebook lives one level below the
# project root -- confirm).
import sys
sys.path.insert(0, "..")

# NOTE(review): this silences every warning for the whole session, including
# pandas / scikit-learn warnings that may point at real problems.
import warnings
warnings.filterwarnings('ignore')

import pickle
import numpy as np
import pandas as pd
# None lifts the column-width limit so long text cells are displayed in full.
pd.set_option('display.max_colwidth', None)

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Project-local text cleaning function; resolvable only after the sys.path tweak above.
from src.utils.preprocessing import preprocess
from scipy import sparse

# Shared seed for the stochastic steps below (e.g. the train/test split).
RANDOM_SEED = 23

## __Read Data__

In [None]:
# Load the movies/genres CSV into a DataFrame.
movies_csv = "../data/movies_genres_en.csv"
df = pd.read_csv(movies_csv)

## __Prepare Data__

In [None]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    df,
    random_state=RANDOM_SEED,
    test_size=0.15,
)

In [None]:
def _title_plot_text(frame):
    """Join each row's title and plot with a single space, then clean the text with `preprocess`."""
    combined = frame['title'] + " " + frame['plot']
    return combined.apply(preprocess)


# Same cleaning for both splits, so train and test texts are normalized identically.
X_train = _title_plot_text(train_df)
X_test = _title_plot_text(test_df)

In [None]:
# Drop the pandas wrappers: the cleaned texts and the genre labels are handed
# to scikit-learn as plain arrays.
X_train, X_test = X_train.values, X_test.values
y_train, y_test = train_df['genre'].values, test_df['genre'].values

### __Train a Classification Model__

In [None]:
# TF-IDF features over unigrams and bigrams: English stop words are removed, and
# terms found in fewer than 2 documents or in more than 90% of documents are dropped.
tfidf = TfidfVectorizer(stop_words="english", min_df=2, max_df=0.9, ngram_range=(1, 2))

# One-vs-rest wrapper around a linear SVM, fitted in parallel on all cores (n_jobs=-1).
# LinearSVC's dual solver shuffles the data, so it is seeded with the notebook-wide
# RANDOM_SEED; without it the fitted (and later exported) model can differ between runs.
clf = OneVsRestClassifier(LinearSVC(random_state=RANDOM_SEED), n_jobs=-1)

In [None]:
# Learn the vocabulary and IDF weights from the training texts and turn those
# texts into TF-IDF feature vectors in a single pass.
X_train = tfidf.fit_transform(raw_documents=X_train)

# Train the classifier on the vectorized training data.
clf.fit(X=X_train, y=y_train)

In [None]:
X_test = tfidf.transform(X_test)  # Vectorize the test texts with the vocabulary learned from the training data
y_pred = clf.predict(X_test)  # Predicted label for every test row

# Build a new frame carrying the predictions instead of assigning a column in place:
# test_df comes out of train_test_split, and in-place column assignment on such a
# frame is what triggers pandas' SettingWithCopyWarning (silenced in this notebook).
test_df = test_df.assign(prediction=y_pred)

In [None]:
# Per-class precision / recall / F1 on the held-out rows, printed with 4 decimals.
report = classification_report(y_test, y_pred, digits=4)
print(report)

## __Save Prediction Logs__

In [None]:
# Write both splits to the logs directory without the index column.
for frame, log_path in (
    (train_df, "../logs/train_df.csv"),
    (test_df, "../logs/test_df.csv"),
):
    frame.to_csv(log_path, index=False)

### __Export Feature Vectorizer and Classifier__

To serve the model through the API, we export the fitted vectorizer, the trained classifier, and the training feature matrix to files on disk.

In [None]:
# Context managers make sure each pickle file is flushed and closed as soon as
# the dump finishes, rather than leaving the handle open until garbage collection.
with open("../models/tfidf.pickle", "wb") as tfidf_file:
    pickle.dump(tfidf, tfidf_file)
with open("../models/classifier.pickle", "wb") as clf_file:
    pickle.dump(clf, clf_file)

# Store the TF-IDF matrix of the training documents in SciPy's .npz sparse format.
sparse.save_npz('../models/movie_vectors.npz', X_train)