In [None]:
#Clasificación

In [1]:
import pandas as pd
import numpy as np

import unidecode
from nltk.corpus import stopwords 

In [2]:
pd.set_option('display.max_columns', 100) # Para mostrar todas las columnas
# pd.set_option('display.max_rows', 100) # Para mostrar todas las filas

In [3]:
data = pd.read_csv("Data/movies_usa_en.csv", low_memory=False)
data.head()

Unnamed: 0,title,description_clean,genre_clean,romance,drama,history,biography,crime,horror,western,fantasy,comedy,family,adventure,action,war,scifi,mystery,thriller,sport,musical,music,filmnoir,animation
0,Miss Jerry,the adventures of a female reporter in the 1890s,romance,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Cleopatra,the fabled queen of egypts affair with roman g...,drama history,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,"From the Manger to the Cross; or, Jesus of Naz...",an account of the life of jesus christ based o...,biography drama,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Amore di madre,john howard payne at his most miserable point ...,drama,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Traffic in Souls,a woman with the aid of her police officer swe...,crime drama,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
data.shape

(38075, 24)

In [5]:
data.columns

Index(['title', 'description_clean', 'genre_clean', 'romance', 'drama',
       'history', 'biography', 'crime', 'horror', 'western', 'fantasy',
       'comedy', 'family', 'adventure', 'action', 'war', 'scifi', 'mystery',
       'thriller', 'sport', 'musical', 'music', 'filmnoir', 'animation'],
      dtype='object')

In [24]:
genres = pd.unique(data['genre_clean'].str.split(expand=True).stack())
genres

array(['romance', 'drama', 'history', 'biography', 'crime', 'horror',
       'western', 'fantasy', 'comedy', 'family', 'adventure', 'action',
       'war', 'scifi', 'mystery', 'thriller', 'sport', 'musical', 'music',
       'filmnoir', 'animation'], dtype=object)

In [31]:
data.columns[3:]

Index(['romance', 'drama', 'history', 'biography', 'crime', 'horror',
       'western', 'fantasy', 'comedy', 'family', 'adventure', 'action', 'war',
       'scifi', 'mystery', 'thriller', 'sport', 'musical', 'music', 'filmnoir',
       'animation'],
      dtype='object')

In [33]:
counts = []
for genre in data.columns[3:]:
    counts.append((genre, data[genre].sum()))
df_stats = pd.DataFrame(counts, columns=['genre', 'number of movies'])
df_stats

Unnamed: 0,genre,number of movies
0,romance,7068
1,drama,19429
2,history,632
3,biography,928
4,crime,5615
5,horror,5433
6,western,1131
7,fantasy,1454
8,comedy,13020
9,family,1690


In [19]:
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, StratifiedShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.multiclass import OneVsRestClassifier

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score,confusion_matrix, classification_report

In [34]:

data_x = data[['description_clean']]
data_y = data.drop(['title', 'description_clean', 'genre_clean'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, random_state=42, test_size=0.33, shuffle=True)



In [38]:
train_x = [x[0].strip() for x in np.array(X_train).tolist()]


In [39]:
test_x = [x[0].strip() for x in np.array(X_test).tolist()]

In [10]:
stop_words = stopwords.words('english')

### Pipeline 1 con OneVSRest y MultinomialNB

In [44]:
pipeline1 = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(MultinomialNB(
        fit_prior=True, class_prior=None))),
])
parameters1 = {
    'tfidf__max_df': [50, 100, 200],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
#     'tfidf__norm':['l1', 'l2']
    'clf__estimator__alpha': (1e-2, 1e-3)
}

In [46]:
grid1 = GridSearchCV(
    pipeline1, parameters1, cv=3, n_jobs=2, verbose=1)
grid1.fit(train_x, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:   53.0s
[Parallel(n_jobs=2)]: Done  54 out of  54 | elapsed:  1.1min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('tfidf',
                                        TfidfVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                                    'your',
 

In [48]:
print("Best parameters set:")
grid1.best_estimator_.steps

Best parameters set:


[('tfidf',
  TfidfVectorizer(max_df=200, ngram_range=(1, 2),
                  stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                              'ourselves', 'you', "you're", "you've", "you'll",
                              "you'd", 'your', 'yours', 'yourself', 'yourselves',
                              'he', 'him', 'his', 'himself', 'she', "she's",
                              'her', 'hers', 'herself', 'it', "it's", 'its',
                              'itself', ...])),
 ('clf', OneVsRestClassifier(estimator=MultinomialNB(alpha=0.01)))]

In [52]:
# measuring performance on test set
print("Applying best classifier on test data:")
best_clf = grid1.best_estimator_
predictions = best_clf.predict(test_x)

print(classification_report(y_test, predictions, target_names=genres))

Applying best classifier on test data:
              precision    recall  f1-score   support

     romance       0.47      0.18      0.26      2420
       drama       0.63      0.66      0.65      6348
     history       0.53      0.05      0.08       196
   biography       0.84      0.09      0.16       300
       crime       0.58      0.24      0.34      1837
      horror       0.72      0.41      0.53      1763
     western       0.76      0.26      0.39       390
     fantasy       0.41      0.04      0.07       488
      comedy       0.61      0.43      0.50      4335
      family       0.47      0.04      0.07       583
   adventure       0.62      0.18      0.28      1114
      action       0.56      0.27      0.36      2153
         war       0.69      0.20      0.31       267
       scifi       0.74      0.23      0.35       662
     mystery       0.41      0.07      0.12       867
    thriller       0.40      0.14      0.20      1966
       sport       0.74      0.07      0.1

  _warn_prf(average, modifier, msg_start, len(result))


In [77]:
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

Accuracy =  0.12638280939116595




### Pipeline 2 con OneVSRest y LinearCV

In [55]:
pipeline2 = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LinearSVC()))
])
parameters2 = {
    'tfidf__max_df': [50,100,200],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__C": [0.01, 0.1, 1],
    "clf__estimator__class_weight": ['balanced', None],
}

In [56]:
grid2 = GridSearchCV(
    pipeline2, parameters2, cv=3, n_jobs=2, verbose=1)
grid2.fit(train_x, y_train)

Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  1.1min
[Parallel(n_jobs=2)]: Done 108 out of 108 | elapsed:  4.4min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('tfidf',
                                        TfidfVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                                    'your',
 

In [58]:
print("Best parameters set:")
grid2.best_estimator_.steps

Best parameters set:


[('tfidf',
  TfidfVectorizer(max_df=100, ngram_range=(1, 2),
                  stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                              'ourselves', 'you', "you're", "you've", "you'll",
                              "you'd", 'your', 'yours', 'yourself', 'yourselves',
                              'he', 'him', 'his', 'himself', 'she', "she's",
                              'her', 'hers', 'herself', 'it', "it's", 'its',
                              'itself', ...])),
 ('clf', OneVsRestClassifier(estimator=LinearSVC(C=1)))]

In [61]:
# measuring performance on test set
print( "Applying best classifier on test data:")
predictions = grid2.best_estimator_.predict(test_x)

print(classification_report(y_test, predictions, target_names=genres))

Applying best classifier on test data:
              precision    recall  f1-score   support

     romance       0.54      0.17      0.26      2420
       drama       0.65      0.68      0.66      6348
     history       0.67      0.01      0.02       196
   biography       0.77      0.12      0.21       300
       crime       0.62      0.27      0.38      1837
      horror       0.78      0.48      0.59      1763
     western       0.73      0.36      0.49       390
     fantasy       0.58      0.04      0.08       488
      comedy       0.66      0.45      0.54      4335
      family       0.58      0.03      0.06       583
   adventure       0.66      0.17      0.27      1114
      action       0.62      0.29      0.39      2153
         war       0.66      0.21      0.32       267
       scifi       0.77      0.28      0.42       662
     mystery       0.52      0.06      0.11       867
    thriller       0.45      0.15      0.22      1966
       sport       0.74      0.27      0.3

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


### Pipeline 3 con OneVSRest y LogisticRegression

In [65]:
pipeline3 = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'))),
])
parameters3 = {
#     'tfidf__max_df': [50, 100,200],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__C": [0.01, 0.1, 1],
    "clf__estimator__class_weight": ['balanced', None],
}


In [66]:
grid3 = GridSearchCV(
    pipeline3, parameters3, cv=3, n_jobs=2, verbose=1)
grid3.fit(train_x, y_train)



Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  5.1min
[Parallel(n_jobs=2)]: Done  54 out of  54 | elapsed:  5.8min finished


GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('tfidf',
                                        TfidfVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                                                                    'your',
 

In [68]:
print("Best parameters set:")
grid3.best_estimator_.steps

Best parameters set:


[('tfidf',
  TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                              'ourselves', 'you', "you're", "you've", "you'll",
                              "you'd", 'your', 'yours', 'yourself', 'yourselves',
                              'he', 'him', 'his', 'himself', 'she', "she's",
                              'her', 'hers', 'herself', 'it', "it's", 'its',
                              'itself', ...])),
 ('clf', OneVsRestClassifier(estimator=LogisticRegression(C=1, solver='sag')))]

In [69]:
# measuring performance on test set
print("Applying best classifier on test data:")
predictions = grid3.best_estimator_.predict(test_x)

print(classification_report(y_test, predictions, target_names=genres))

Applying best classifier on test data:
              precision    recall  f1-score   support

     romance       0.64      0.17      0.27      2420
       drama       0.67      0.71      0.69      6348
     history       0.00      0.00      0.00       196
   biography       0.79      0.05      0.09       300
       crime       0.67      0.21      0.32      1837
      horror       0.83      0.33      0.48      1763
     western       0.76      0.07      0.14       390
     fantasy       0.00      0.00      0.00       488
      comedy       0.72      0.41      0.52      4335
      family       0.62      0.01      0.02       583
   adventure       0.79      0.06      0.12      1114
      action       0.71      0.19      0.30      2153
         war       0.55      0.10      0.18       267
       scifi       0.84      0.15      0.26       662
     mystery       0.51      0.04      0.08       867
    thriller       0.54      0.07      0.12      1966
       sport       0.73      0.05      0.1

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [70]:
pipeline4 = Pipeline([
    ('tfidf', TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2', stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), n_jobs=-1))])




In [73]:
pipeline4.fit(train_x, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(ngram_range=(1, 3),
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                                             "she's", 'her', 'hers', 'herself',
                                             'it', "it's", 'its', 'itself', ...],
                                 strip_accents='unicode')),
                ('clf',
                 OneVsRestClassifier(estimator=LogisticRegression(solver='sag'),
                                     n_jobs=-1))])

In [75]:
predictions = pipeline4.predict(test_x)

print(classification_report(y_test, predictions, target_names=genres))

              precision    recall  f1-score   support

     romance       0.62      0.21      0.31      2420
       drama       0.66      0.73      0.69      6348
     history       0.00      0.00      0.00       196
   biography       0.82      0.03      0.06       300
       crime       0.65      0.22      0.33      1837
      horror       0.81      0.31      0.45      1763
     western       1.00      0.02      0.03       390
     fantasy       0.00      0.00      0.00       488
      comedy       0.72      0.34      0.46      4335
      family       0.00      0.00      0.00       583
   adventure       0.89      0.03      0.06      1114
      action       0.70      0.15      0.25      2153
         war       0.47      0.10      0.17       267
       scifi       0.86      0.10      0.17       662
     mystery       0.46      0.03      0.05       867
    thriller       0.52      0.05      0.10      1966
       sport       1.00      0.00      0.01       212
     musical       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [76]:
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

Accuracy =  0.12638280939116595


