In [None]:
# Clasificación

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
import unidecode
from nltk.corpus import stopwords 

In [2]:
# Raise the column display limit to 100 so wide frames are shown in full.
pd.options.display.max_columns = 100
# Row limit is left at the pandas default; uncomment to raise it as well.
# pd.options.display.max_rows = 100

In [3]:
# Read the movie dataset from disk (path is relative to the notebook's
# working directory) and preview the first rows.
csv_path = "Data/movies_usa_en.csv"
data = pd.read_csv(csv_path, low_memory=False)
data.head()

Unnamed: 0,title,description_clean,genre_clean,romance,drama,history,biography,crime,horror,western,fantasy,comedy,family,adventure,action,war,scifi,mystery,thriller,sport,musical,music,filmnoir,animation
0,Miss Jerry,the adventures of a female reporter in the 1890s,romance,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Cleopatra,the fabled queen of egypts affair with roman g...,drama history,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"From the Manger to the Cross; or, Jesus of Naz...",an account of the life of jesus christ based o...,biography drama,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Amore di madre,john howard payne at his most miserable point ...,drama,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Traffic in Souls,a woman with the aid of her police officer swe...,crime drama,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# (rows, columns) of the loaded frame.
data.shape

(38075, 24)

In [5]:
# Column labels of the loaded frame: three text columns followed by one
# indicator column per genre.
data.columns

Index(['title', 'description_clean', 'genre_clean', 'romance', 'drama',
       'history', 'biography', 'crime', 'horror', 'western', 'fantasy',
       'comedy', 'family', 'adventure', 'action', 'war', 'scifi', 'mystery',
       'thriller', 'sport', 'musical', 'music', 'filmnoir', 'animation'],
      dtype='object')

### Split en train y test y vectorización de texto

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the shuffled
# split identical from run to run.
train, test = train_test_split(
    data,
    test_size=0.30,
    shuffle=True,
    random_state=42,
)

# The cleaned description is the only text column fed to the vectorizer.
text_column = 'description_clean'
train_text = train[text_column]
test_text = test[text_column]

In [None]:
# TF-IDF over word 1- to 3-grams, with accents stripped, NLTK's English stop
# words removed and each row L2-normalised.
stop_words = stopwords.words('english')
tfidf_options = dict(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 3),
    norm='l2',
    stop_words=stop_words,
)
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit on the training text only, so the test split does not influence the
# learned vocabulary or IDF weights.
vectorizer.fit(train_text)

In [None]:
# Columns that are not genre indicator labels.
non_label_columns = ['title', 'description_clean', 'genre_clean']

# Features: TF-IDF matrices produced by the already-fitted vectorizer.
# Targets: every remaining column of the split (one column per genre).
X_train = vectorizer.transform(train_text)
y_train = train.drop(columns=non_label_columns)

X_test = vectorizer.transform(test_text)
y_test = test.drop(columns=non_label_columns)

### Pipeline de clasificación

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

In [None]:
# Split each whitespace-separated 'genre_clean' string into tokens, flatten
# them into one Series, and keep the distinct values in order of first
# appearance.
genre_tokens = data['genre_clean'].str.split(expand=True).stack()
genres = pd.unique(genre_tokens)
genres

In [None]:
# Number of distinct genre tokens collected in `genres`.
len(genres)

In [None]:
# Per-genre baseline: refit the same estimator on each genre's indicator
# column and report its accuracy on the held-out test split.
# random_state pins the data shuffling done by the stochastic 'sag' solver so
# the reported accuracies are reproducible (same seed as the train/test split).
classifier_log = OneVsRestClassifier(
    LogisticRegression(solver='sag', random_state=42), n_jobs=-1
)

for genre in genres:
    print('**Processing {} movies...**'.format(genre))

    # Train on this genre's indicator column of the training split.
    classifier_log.fit(X_train, train[genre])

    # Score the predictions against the same column of the test split.
    prediction = classifier_log.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[genre], prediction)))

In [None]:
# One-vs-rest wrapper: one logistic-regression model per label column, with
# the per-label fits spread over all available cores (n_jobs=-1).
# random_state pins the stochastic 'sag' solver so repeated runs give the
# same model (same seed as the train/test split).
classifier_log2 = OneVsRestClassifier(
    LogisticRegression(solver='sag', random_state=42), n_jobs=-1
)

In [None]:
# Fit on the full label frame at once (one target column per genre).
classifier_log2.fit(X_train, y_train)

In [None]:
# Predict the genre labels for the held-out test features.
predictions = classifier_log2.predict(X_test)

In [None]:
# On a multilabel target accuracy_score is subset accuracy: a row counts as
# correct only when every one of its labels is predicted correctly.
overall_accuracy = accuracy_score(y_test, predictions)
print("Accuracy = ", overall_accuracy)
print("\n")

In [None]:
# Per-genre precision, recall and F1 on the test split.
# Label names come from y_test's own columns so that every report row is
# guaranteed to line up with the column it was computed from; `genres` is in
# first-appearance order, which is not guaranteed to match the column order.
print(classification_report(y_test, predictions, target_names=list(y_test.columns), zero_division=1))

In [None]:
from skmultilearn.problem_transform import ClassifierChain

In [None]:
# Classifier chain over the genre labels with logistic regression as the
# base estimator.
classifier2 = ClassifierChain(LogisticRegression())

# Training logistic regression model on train data
classifier2.fit(X_train, y_train)

# Predict with the chain fitted above. `classifier2` is the only chain
# defined in this notebook; the name `classifier` does not exist.
predictions = classifier2.predict(X_test)

# Subset accuracy over all labels
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

In [None]:
# Per-genre precision, recall and F1 for the current `predictions`.
# Label names come from y_test's own columns so that every report row is
# guaranteed to line up with the column it was computed from; `genres` is in
# first-appearance order, which is not guaranteed to match the column order.
print(classification_report(y_test, predictions, target_names=list(y_test.columns), zero_division=1))