In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
import unidecode
from nltk.corpus import stopwords 

from sklearn.feature_extraction.text import CountVectorizer
stop_words=stopwords.words('english');

In [2]:
pd.set_option('display.max_columns', 100) # Para mostrar todas las columnas
# pd.set_option('display.max_rows', 100) # Para mostrar todas las filas

In [3]:
data = pd.read_csv("data/movies_preprocesadas.csv", low_memory=False)
data.head()

Unnamed: 0,description_clean,genre_clean
0,miss jerry the adventures of a female reporter...,romance
1,the story of the kelly gang true story of noto...,biography crime drama
2,cleopatra the fabled queen of egypts affair wi...,drama history
3,linferno loosely adapted from dantes divine co...,adventure drama fantasy
4,from the manger to the cross or jesus of nazar...,biography drama


In [4]:
data['genre_clean'].value_counts()

drama                       11929
comedy                       7067
comedy drama                 3856
drama romance                3349
comedy romance               2398
                            ...  
biography family history        1
musical thriller mystery        1
thriller musical mystery        1
animation comedy action         1
crime history western           1
Name: genre_clean, Length: 1240, dtype: int64

In [5]:
data.shape

(82882, 2)

In [6]:
data.columns

Index(['description_clean', 'genre_clean'], dtype='object')

In [7]:
data['genre'] = data['genre_clean'].str.split(" ").map(lambda x: x[0])

In [8]:
data.sample(5)

Unnamed: 0,description_clean,genre_clean,genre
19880,barbarosa a young cowboy hooks up with a legen...,western,western
28735,stadtgesprach monika the host of a daily radio...,comedy romance,comedy
15688,il pedone when a german businessman causes a c...,drama,drama
17966,i cacciatori delloceano based on the childrens...,adventure drama family,adventure
32080,dragon ball z la vendetta divina in order to w...,animation action adventure,animation


In [9]:
genres = data['genre'].unique()
genres

array(['romance', 'biography', 'drama', 'adventure', 'crime', 'western',
       'fantasy', 'comedy', 'horror', 'family', 'action', 'mystery',
       'history', 'scifi', 'animation', 'musical', 'music', 'thriller',
       'war', 'filmnoir', 'sport', 'adult'], dtype=object)

In [10]:
from sklearn.preprocessing import OrdinalEncoder
#Cuando le paso las categorías como paramétro me tira error The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
ord_enc = OrdinalEncoder()
data["target_genre"] = ord_enc.fit_transform(data[["genre"]])

data.sample(3)

Unnamed: 0,description_clean,genre_clean,genre,target_genre
23006,pociag do hollywood a young woman is bitten by...,comedy romance,comedy,5.0
77098,a haunting at silver falls 2 several years aft...,horror,horror,12.0
41943,la storia di jack e rose a father and daughter...,drama,drama,7.0


In [11]:
vectorizer=CountVectorizer(stop_words=stop_words);

clases = []

clases= data['genre'].unique()

for clase in range(0,len(clases)):
    X=vectorizer.fit_transform(data[data['target_genre']==clase]['description_clean']);
    counts=X.sum(axis=0);
    counts=np.array(counts);
    
    indices=np.argsort(counts);
    valores=np.sort(counts);
    indices=indices[0][::-1];
    valores=valores[0][::-1];
    terms=np.array(vectorizer.get_feature_names());

    print('\n Clase ',clases[clase-1])
    print(terms[indices[:30]])


 Clase  adult
['man' 'young' 'two' 'one' 'life' 'police' 'war' 'must' 'group' 'find'
 'new' 'gang' 'il' 'world' 'la' 'story' 'family' 'woman' 'love' 'cop' 'di'
 'gets' 'get' 'son' 'take' 'city' 'fight' 'father' 'agent' 'death']

 Clase  romance
['house' 'person' 'heroin' 'happy' 'happen' 'handed' 'girls' 'film'
 'everywhere' 'every' 'young' 'keep' 'dvaergen' 'de' 'day' 'couple'
 'cigarettes' 'chance' 'boarding' 'also' 'escape' 'la' 'white' 'ronde'
 'view' 'try' 'traces' 'string' 'smuggle' 'slavery']

 Clase  biography
['young' 'il' 'la' 'two' 'di' 'find' 'man' 'world' 'life' 'boy' 'new'
 'one' 'war' 'family' 'girl' 'story' 'must' 'friends' 'island' 'father'
 'group' 'del' 'american' 'love' 'time' 'three' 'old' 'finds' 'woman'
 'help']

 Clase  drama
['world' 'young' 'il' 'friends' 'must' 'new' 'la' 'one' 'find' 'life'
 'boy' 'story' 'save' 'film' 'evil' 'girl' 'di' 'movie' 'adventure' 'two'
 'love' 'family' 'help' 'little' 'named' 'earth' 'home' 'time' 'city'
 'back']

 Clase  adventu

In [12]:
train, test = train_test_split(data, random_state=42, test_size=0.30, shuffle=True)
train_text = train['description_clean']
test_text = test['description_clean']

In [13]:
stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(min_df=10,ngram_range=(1,3),max_features=20000, stop_words=stop_words)
vectorizer.fit(train_text)

TfidfVectorizer(max_features=20000, min_df=10, ngram_range=(1, 3),
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...])

In [14]:
X_train = vectorizer.transform(train_text)
X_test = vectorizer.transform(test_text)
y_train = train['genre']
y_test = test['genre']

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

In [16]:
genres = data['genre'].unique()
len(genres)

22

In [17]:
classifier_log = LogisticRegression(penalty='l2', solver='lbfgs', multi_class='multinomial', verbose=1, n_jobs=-1)
classifier_log.fit(X_train, y_train)
prediction = classifier_log.predict(X_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Test accuracy is 0.5134526442791072


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:   16.9s finished


In [18]:
print(classification_report(y_test, prediction, zero_division=1))

              precision    recall  f1-score   support

      action       0.54      0.54      0.54      3647
   adventure       0.41      0.12      0.19      1020
   animation       0.63      0.17      0.27       656
   biography       0.65      0.17      0.26       609
      comedy       0.53      0.65      0.59      6971
       crime       0.39      0.19      0.26      1615
       drama       0.49      0.68      0.57      7314
      family       1.00      0.00      0.00       171
     fantasy       1.00      0.00      0.00       144
    filmnoir       1.00      0.00      0.00        11
     history       1.00      0.00      0.00        29
      horror       0.54      0.41      0.46      1455
       music       1.00      0.00      0.00        21
     musical       1.00      0.00      0.00       100
     mystery       1.00      0.00      0.00       168
     romance       1.00      0.00      0.00       225
       scifi       0.20      0.01      0.01       134
       sport       1.00    