In [None]:
#Clasificación

In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
import unidecode
from nltk.corpus import stopwords 

In [4]:
# Let DataFrame previews show up to 100 columns before pandas truncates them.
pd.options.display.max_columns = 100
# Row-limit counterpart, intentionally left disabled:
# pd.options.display.max_rows = 100

In [5]:
# Read the movies CSV into a DataFrame and preview its first rows.
data = pd.read_csv(
    "Data/movies_usa_en.csv",
    low_memory=False,  # parse the file in one pass instead of internal chunks
)
data.head(n=5)

Unnamed: 0,title,description_clean,genre_clean,romance,drama,history,biography,crime,horror,western,fantasy,comedy,family,adventure,action,war,scifi,mystery,thriller,sport,musical,music,filmnoir,animation
0,Miss Jerry,the adventures of a female reporter in the 1890s,romance,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Cleopatra,the fabled queen of egypts affair with roman g...,drama history,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"From the Manger to the Cross; or, Jesus of Naz...",an account of the life of jesus christ based o...,biography drama,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Amore di madre,john howard payne at his most miserable point ...,drama,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Traffic in Souls,a woman with the aid of her police officer swe...,crime drama,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
# (rows, columns) of the loaded dataset.
data.shape

(38075, 24)

In [7]:
# Column names of the loaded dataset.
data.columns

Index(['title', 'description_clean', 'genre_clean', 'romance', 'drama',
       'history', 'biography', 'crime', 'horror', 'western', 'fantasy',
       'comedy', 'family', 'adventure', 'action', 'war', 'scifi', 'mystery',
       'thriller', 'sport', 'musical', 'music', 'filmnoir', 'animation'],
      dtype='object')

### Split en train y test y vectorización de texto

In [8]:
# Hold out 30% of the rows for testing; the fixed seed keeps the shuffled split repeatable.
train, test = train_test_split(
    data,
    test_size=0.30,
    shuffle=True,
    random_state=42,
)
# The description text is the only model input.
train_text, test_text = train['description_clean'], test['description_clean']

In [9]:
# TF-IDF over word 1- to 3-grams with NLTK's English stop words removed, accents
# stripped and every row L2-normalised. The vocabulary is learned from the
# training descriptions only, so nothing from the test set leaks into it.
stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(
    analyzer='word',
    ngram_range=(1, 3),
    stop_words=stop_words,
    strip_accents='unicode',
    norm='l2',
)
vectorizer.fit(train_text)

TfidfVectorizer(ngram_range=(1, 3),
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                strip_accents='unicode')

In [10]:
# Features: TF-IDF matrices produced by the vectorizer fitted on the training text.
X_train = vectorizer.transform(train_text)
X_test = vectorizer.transform(test_text)

# Labels: everything except the text/metadata columns, i.e. the per-genre columns.
y_train = train.drop(columns=['title', 'description_clean', 'genre_clean'])
y_test = test.drop(columns=['title', 'description_clean', 'genre_clean'])

### Pipeline de clasificación

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

In [12]:
# Distinct genre names in order of first appearance: each whitespace-separated
# `genre_clean` string is split into tokens, the tokens are stacked into a single
# Series, and duplicates are dropped.
genres = (
    data['genre_clean']
    .str.split(expand=True)
    .stack()
    .unique()
)
genres

array(['romance', 'drama', 'history', 'biography', 'crime', 'horror',
       'western', 'fantasy', 'comedy', 'family', 'adventure', 'action',
       'war', 'scifi', 'mystery', 'thriller', 'sport', 'musical', 'music',
       'filmnoir', 'animation'], dtype=object)

In [13]:
# Number of distinct genres.
len(genres)

21

In [14]:
# Fit one binary logistic-regression model per genre and print its test accuracy.
# random_state is fixed because the 'sag' solver shuffles the data while fitting;
# without a seed the scores change slightly from run to run.
classifier_log = OneVsRestClassifier(
    LogisticRegression(solver='sag', random_state=42),
    n_jobs=-1,
)

for genre in genres:
    print('**Processing {} movies...**'.format(genre))

    # Train on this genre's indicator column only.
    classifier_log.fit(X_train, train[genre])

    # Score on the held-out descriptions.
    # NOTE(review): plain accuracy can look high for rare genres even when few
    # positives are detected; read it together with precision/recall.
    prediction = classifier_log.predict(X_test)
    print('Test accuracy is {}'.format(accuracy_score(test[genre], prediction)))

**Processing romance movies...**
Test accuracy is 0.8242143044734308
**Processing drama movies...**
Test accuracy is 0.6704018208876827
**Processing history movies...**
Test accuracy is 0.9843298608071435
**Processing biography movies...**
Test accuracy is 0.9768011905804079
**Processing crime movies...**
Test accuracy is 0.8698240392191193
**Processing horror movies...**
Test accuracy is 0.8933730193469316
**Processing western movies...**
Test accuracy is 0.9695351483848376
**Processing fantasy movies...**
Test accuracy is 0.962006478158102
**Processing comedy movies...**
Test accuracy is 0.7280049023899151
**Processing family movies...**
Test accuracy is 0.9524643263591
**Processing adventure movies...**
Test accuracy is 0.9134202923925414
**Processing action movies...**
Test accuracy is 0.8432986080714349
**Processing war movies...**
Test accuracy is 0.9785520441215092
**Processing scifi movies...**
Test accuracy is 0.9529020397443754
**Processing mystery movies...**
Test accuracy i

In [15]:
# Multi-label model: one logistic regression per label column, using all CPU cores.
# random_state is fixed because the 'sag' solver shuffles the data while fitting;
# without a seed the results change slightly from run to run.
classifier_log2 = OneVsRestClassifier(
    LogisticRegression(solver='sag', random_state=42),
    n_jobs=-1,
)

In [16]:
# Fit against the whole label matrix at once (y_train holds one column per genre).
classifier_log2.fit(X_train, y_train)

OneVsRestClassifier(estimator=LogisticRegression(solver='sag'), n_jobs=-1)

In [17]:
# Predict the genre columns for the held-out descriptions.
predictions = classifier_log2.predict(X_test)

In [18]:
# For multi-label targets accuracy_score is the exact-match (subset) accuracy:
# a row only counts as correct when every genre column is predicted correctly,
# which is why this figure is far below the per-genre accuracies.
subset_accuracy = accuracy_score(y_test, predictions)
print("Accuracy = ", subset_accuracy)
print("\n")

Accuracy =  0.1271995097610085




In [38]:
# Per-genre precision, recall and F1 on the test set.
# Label names come from y_test's own columns so each report row is guaranteed to
# describe the matching label column, rather than relying on a separately derived
# list happening to be in the same order.
# zero_division=1 makes undefined metrics (zero denominator) show as 1.0 without warnings.

print(classification_report(y_test, predictions, target_names=list(y_test.columns), zero_division=1))

              precision    recall  f1-score   support

     romance       0.64      0.21      0.32      2214
       drama       0.65      0.73      0.69      5757
     history       1.00      0.00      0.00       179
   biography       0.82      0.03      0.06       272
       crime       0.65      0.23      0.33      1656
      horror       0.81      0.32      0.46      1611
     western       0.88      0.02      0.04       354
     fantasy       1.00      0.00      0.00       434
      comedy       0.71      0.35      0.47      3940
      family       0.50      0.00      0.00       543
   adventure       0.91      0.03      0.06      1017
      action       0.71      0.16      0.26      1977
         war       0.47      0.11      0.18       242
       scifi       0.88      0.11      0.19       593
     mystery       0.45      0.03      0.06       781
    thriller       0.52      0.06      0.10      1816
       sport       1.00      0.00      0.01       203
     musical       1.00    

In [40]:
from skmultilearn.problem_transform import ClassifierChain

In [None]:
# Classifier chain built from logistic-regression base models.
classifier2 = ClassifierChain(LogisticRegression())

# Training logistic regression model on train data
classifier2.fit(X_train, y_train)

# Predict with the chain fitted above (`classifier2`); a bare `classifier` is not
# defined in this notebook. This overwrites the earlier `predictions` variable.
predictions = classifier2.predict(X_test)

# Exact-match (subset) accuracy over all genre columns.
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

In [None]:
# Per-genre precision, recall and F1 for the current `predictions`.
# Label names come from y_test's own columns so each report row is guaranteed to
# describe the matching label column.
print(classification_report(y_test, predictions, target_names=list(y_test.columns), zero_division=1))