In [160]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
import unidecode
from nltk.corpus import stopwords 

from sklearn.feature_extraction.text import CountVectorizer
stop_words=stopwords.words('english');

In [161]:
pd.set_option('display.max_columns', 100) # Para mostrar todas las columnas
# pd.set_option('display.max_rows', 100) # Para mostrar todas las filas

In [162]:
data = pd.read_csv("data/movies_preprocesadas.csv", low_memory=False)
data.head()

Unnamed: 0,description_clean,genre_clean
0,miss jerry the adventures of a female reporter...,romance
1,the story of the kelly gang true story of noto...,biography crime drama
2,cleopatra the fabled queen of egypts affair wi...,drama history
3,linferno loosely adapted from dantes divine co...,adventure drama fantasy
4,from the manger to the cross or jesus of nazar...,biography drama


In [163]:
data['genre_clean'].value_counts()

drama                     11929
comedy                     7067
comedy drama               3856
drama romance              3349
comedy romance             2398
                          ...  
fantasy animation             1
drama fantasy family          1
action mystery western        1
horror romance western        1
comedy music mystery          1
Name: genre_clean, Length: 1238, dtype: int64

In [164]:
data.shape

(82880, 2)

In [165]:
data.columns

Index(['description_clean', 'genre_clean'], dtype='object')

In [166]:
data['genre'] = data['genre_clean'].str.split(" ").map(lambda x: x[0])

In [167]:
data.sample(5)

Unnamed: 0,description_clean,genre_clean,genre
17693,the last dinosaur a wealthy big game hunter an...,action adventure scifi,action
44171,return of the living dead necropolis a group o...,action comedy horror,action
43704,a history of violence a mildmannered man becom...,drama thriller,drama
34825,il salvatore in 1943 rural french teenager nan...,drama war,drama
41448,runaways runaways is steve moores story hes si...,drama romance,drama


In [168]:
genres = data['genre'].unique()
genres

array(['romance', 'biography', 'drama', 'adventure', 'crime', 'western',
       'fantasy', 'comedy', 'horror', 'family', 'action', 'mystery',
       'history', 'scifi', 'animation', 'musical', 'music', 'thriller',
       'war', 'filmnoir', 'sport'], dtype=object)

In [169]:
mask_filter = (data['genre']=='action') | (data['genre']=='comedy') | (data['genre']=='drama') | (data['genre']=='horror')
mask_filter
data = data.loc[mask_filter, :]

In [170]:
from sklearn.preprocessing import OrdinalEncoder
#Cuando le paso las categorías como paramétro me tira error The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
ord_enc = OrdinalEncoder()
data["target_genre"] = ord_enc.fit_transform(data[["genre"]])

data.sample(3)

Unnamed: 0,description_clean,genre_clean,genre,target_genre
25583,mio caro dottor grasler a preworld war i physi...,drama,drama,2.0
58621,kolka cool summer story about two brothers and...,comedy drama,comedy,1.0
26619,calendar a photographer and his wife take phot...,comedy drama romance,comedy,1.0


In [171]:
vectorizer=CountVectorizer(stop_words=stop_words);
stop_words.append('family');
stop_words.append('la');
stop_words.append('woman');
stop_words.append('il');
stop_words.append('di');
clases = []

clases= data['genre'].unique()

for clase in range(0,len(clases)):
    X=vectorizer.fit_transform(data[data['target_genre']==clase]['description_clean']);
    counts=X.sum(axis=0);
    counts=np.array(counts);
    
    indices=np.argsort(counts);
    valores=np.sort(counts);
    indices=indices[0][::-1];
    valores=valores[0][::-1];
    terms=np.array(vectorizer.get_feature_names());

    print('\n Clase ',clases[clase-1])
    print(terms[indices[:30]])


 Clase  action
['man' 'young' 'two' 'one' 'life' 'police' 'war' 'must' 'group' 'find'
 'new' 'gang' 'world' 'story' 'love' 'cop' 'gets' 'get' 'son' 'take'
 'city' 'fight' 'father' 'agent' 'death' 'help' 'finds' 'crime' 'time'
 'back']

 Clase  drama
['love' 'life' 'two' 'young' 'man' 'one' 'new' 'friends' 'get' 'girl'
 'story' 'three' 'find' 'school' 'comedy' 'old' 'wife' 'years' 'lives'
 'finds' 'film' 'father' 'gets' 'time' 'day' 'town' 'world' 'home' 'back'
 'falls']

 Clase  comedy
['young' 'life' 'love' 'man' 'story' 'two' 'one' 'girl' 'new' 'lives'
 'father' 'years' 'mother' 'war' 'wife' 'world' 'old' 'son' 'film' 'find'
 'home' 'school' 'finds' 'friends' 'town' 'boy' 'daughter' 'three' 'falls'
 'day']

 Clase  horror
['young' 'one' 'group' 'house' 'man' 'night' 'killer' 'friends' 'new'
 'find' 'two' 'dead' 'home' 'mysterious' 'horror' 'death' 'girl' 'people'
 'life' 'town' 'evil' 'years' 'old' 'haunted' 'soon' 'three' 'dark'
 'strange' 'film' 'must']


In [172]:
train, test = train_test_split(data, random_state=42, test_size=0.30, shuffle=True)
train_text = train['description_clean']
test_text = test['description_clean']

In [173]:
stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(min_df=10,ngram_range=(1,3),max_features=20000, stop_words=stop_words)
vectorizer.fit(train_text)

TfidfVectorizer(max_features=20000, min_df=10, ngram_range=(1, 3),
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...])

In [174]:
X_train = vectorizer.transform(train_text)
X_test = vectorizer.transform(test_text)
y_train = train['genre']
y_test = test['genre']

In [175]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

In [176]:
genres = data['genre'].unique()
len(genres)

4

In [177]:
classifier_log = LogisticRegression(penalty='l2', solver='lbfgs', multi_class='multinomial', verbose=1, n_jobs=-1)
classifier_log.fit(X_train, y_train)
prediction = classifier_log.predict(X_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    3.6s finished


Test accuracy is 0.6261426431854568


In [178]:
print(classification_report(y_test, prediction, zero_division=1))

              precision    recall  f1-score   support

      action       0.70      0.58      0.63      3670
      comedy       0.62      0.64      0.63      6881
       drama       0.60      0.68      0.64      7271
      horror       0.64      0.41      0.50      1541

    accuracy                           0.63     19363
   macro avg       0.64      0.58      0.60     19363
weighted avg       0.63      0.63      0.62     19363

