In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, StratifiedShuffleSplit
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
import unidecode
from nltk.corpus import stopwords 

from sklearn.feature_extraction.text import CountVectorizer
stop_words=stopwords.words('english');

In [29]:
pd.set_option('display.max_columns', 100) # Para mostrar todas las columnas
# pd.set_option('display.max_rows', 100) # Para mostrar todas las filas

In [30]:
data = pd.read_csv("data/movies_preprocesadas.csv", low_memory=False)
data.head()

Unnamed: 0,description_clean,genre_clean
0,miss jerry the adventures of a female reporter...,romance
1,the story of the kelly gang true story of noto...,biography crime drama
2,cleopatra the fabled queen of egypts affair wi...,drama history
3,linferno loosely adapted from dantes divine co...,adventure drama fantasy
4,from the manger to the cross or jesus of nazar...,biography drama


In [31]:
data['genre_clean'].value_counts()

drama                   11929
comedy                   7067
comedy drama             3856
drama romance            3349
comedy romance           2398
                        ...  
mystery scifi drama         1
history war western         1
crime comedy romance        1
family music                1
mystery crime drama         1
Name: genre_clean, Length: 1238, dtype: int64

In [32]:
data.shape

(82880, 2)

In [33]:
data.columns

Index(['description_clean', 'genre_clean'], dtype='object')

In [34]:
data['genre'] = data['genre_clean'].str.split(" ").map(lambda x: x[0])

In [35]:
data.sample(5)

Unnamed: 0,description_clean,genre_clean,genre
60357,demonic a police officer and a psychologist in...,horror mystery thriller,horror
3096,the kid from kokomo a fight promoter finds his...,comedy romance,comedy
65499,naduvula konjam pakkatha kaanom a young man fo...,comedy drama,comedy
56561,till det som ar vackert katarina is 20 years o...,drama,drama
1256,twofisted law after rob russell steals tim cla...,action adventure drama,action


In [36]:
genres = data['genre'].unique()
genres

array(['romance', 'biography', 'drama', 'adventure', 'crime', 'western',
       'fantasy', 'comedy', 'horror', 'family', 'action', 'mystery',
       'history', 'scifi', 'animation', 'musical', 'music', 'thriller',
       'war', 'filmnoir', 'sport'], dtype=object)

In [37]:
mask_filter = (data['genre']=='action') | (data['genre']=='comedy') | (data['genre']=='drama') | (data['genre']=='horror')
mask_filter
data = data.loc[mask_filter, :]

In [38]:
from sklearn.preprocessing import OrdinalEncoder
#Cuando le paso las categorías como paramétro me tira error The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
ord_enc = OrdinalEncoder()
data["target_genre"] = ord_enc.fit_transform(data[["genre"]])

data.sample(3)

Unnamed: 0,description_clean,genre_clean,genre,target_genre
70625,run raja run a cheated police officer who has ...,comedy drama,comedy,1.0
10794,una ragazza chiamata tamiko a eurasian photogr...,drama,drama,2.0
62208,zombie dawn the military calls upon a group of...,horror war,horror,3.0


In [39]:
vectorizer=CountVectorizer(stop_words=stop_words);
stop_words.append('family');
stop_words.append('la');
stop_words.append('woman');
stop_words.append('il');
stop_words.append('di');
clases = []

clases= data['genre'].unique()

for clase in range(0,len(clases)):
    X=vectorizer.fit_transform(data[data['target_genre']==clase]['description_clean']);
    counts=X.sum(axis=0);
    counts=np.array(counts);
    
    indices=np.argsort(counts);
    valores=np.sort(counts);
    indices=indices[0][::-1];
    valores=valores[0][::-1];
    terms=np.array(vectorizer.get_feature_names());

    print('\n Clase ',clases[clase-1])
    print(terms[indices[:30]])


 Clase  action
['man' 'young' 'two' 'one' 'life' 'police' 'war' 'must' 'group' 'find'
 'new' 'gang' 'world' 'story' 'love' 'cop' 'gets' 'get' 'son' 'take'
 'city' 'fight' 'father' 'agent' 'death' 'help' 'finds' 'crime' 'time'
 'back']

 Clase  drama
['love' 'life' 'two' 'young' 'man' 'one' 'new' 'friends' 'get' 'girl'
 'story' 'three' 'find' 'school' 'comedy' 'old' 'wife' 'years' 'lives'
 'finds' 'film' 'father' 'gets' 'time' 'day' 'town' 'world' 'home' 'back'
 'falls']

 Clase  comedy
['young' 'life' 'love' 'man' 'story' 'two' 'one' 'girl' 'new' 'lives'
 'father' 'years' 'mother' 'war' 'wife' 'world' 'old' 'son' 'film' 'find'
 'home' 'school' 'finds' 'friends' 'town' 'boy' 'daughter' 'three' 'falls'
 'day']

 Clase  horror
['young' 'one' 'group' 'house' 'man' 'night' 'killer' 'friends' 'new'
 'find' 'two' 'dead' 'home' 'mysterious' 'horror' 'death' 'girl' 'people'
 'life' 'town' 'evil' 'years' 'old' 'haunted' 'soon' 'three' 'dark'
 'strange' 'film' 'must']


In [40]:
train, test = train_test_split(data, random_state=42, test_size=0.30, shuffle=True)
train_text = train['description_clean']
test_text = test['description_clean']

In [41]:
stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(min_df=10,ngram_range=(1,3),max_features=20000, stop_words=stop_words)
vectorizer.fit(train_text)

TfidfVectorizer(max_features=20000, min_df=10, ngram_range=(1, 3),
                stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...])

In [42]:
X_train = vectorizer.transform(train_text)
X_test = vectorizer.transform(test_text)
y_train = train['genre']
y_test = test['genre']

In [43]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

In [44]:
genres = data['genre'].unique()
len(genres)

4

In [45]:
classifier_log = LogisticRegression(penalty='l2', solver='lbfgs', multi_class='multinomial', verbose=1, n_jobs=-1)
classifier_log.fit(X_train, y_train)
prediction = classifier_log.predict(X_test)
print('Test accuracy is {}'.format(accuracy_score(y_test, prediction)))

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    4.0s finished


Test accuracy is 0.6261426431854568


In [46]:
print(classification_report(y_test, prediction, zero_division=1))

              precision    recall  f1-score   support

      action       0.70      0.58      0.63      3670
      comedy       0.62      0.64      0.63      6881
       drama       0.60      0.68      0.64      7271
      horror       0.64      0.41      0.50      1541

    accuracy                           0.63     19363
   macro avg       0.64      0.58      0.60     19363
weighted avg       0.63      0.63      0.62     19363



In [47]:
# pipelines

In [48]:
data_x = data[['description_clean']]
data_y = data.drop(['description_clean', 'genre_clean', 'target_genre'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(data_x, data_y, random_state=42, test_size=0.33, shuffle=True)

In [49]:
train_x = [x[0].strip() for x in np.array(X_train).tolist()]
test_x = [x[0].strip() for x in np.array(X_test).tolist()]
stop_words = stopwords.words('english')

In [50]:
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop_words)),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'))),
])
parameters = {
#     'tfidf__max_df': [50, 100,200],
    'tfidf__ngram_range': [(1, 1), (1, 2), (1, 3)],
    "clf__estimator__C": [0.01, 0.1, 1],
    "clf__estimator__class_weight": ['balanced', None],
}
# Se suma!
skf=StratifiedKFold(n_splits=3, random_state=3, shuffle=True)


In [51]:
grid = GridSearchCV(
    pipeline, parameters, cv=skf, n_jobs=2, verbose=1)
grid.fit(train_x, y_train)

Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  46 tasks      | elapsed:  4.3min
[Parallel(n_jobs=2)]: Done  54 out of  54 | elapsed:  5.1min finished


GridSearchCV(cv=StratifiedKFold(n_splits=3, random_state=3, shuffle=True),
             estimator=Pipeline(steps=[('tfidf',
                                        TfidfVectorizer(stop_words=['i', 'me',
                                                                    'my',
                                                                    'myself',
                                                                    'we', 'our',
                                                                    'ours',
                                                                    'ourselves',
                                                                    'you',
                                                                    "you're",
                                                                    "you've",
                                                                    "you'll",
                                                                    "you'd",
                     

In [52]:
print("Best parameters set:")
grid.best_estimator_.steps

Best parameters set:


[('tfidf',
  TfidfVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                              'ourselves', 'you', "you're", "you've", "you'll",
                              "you'd", 'your', 'yours', 'yourself', 'yourselves',
                              'he', 'him', 'his', 'himself', 'she', "she's",
                              'her', 'hers', 'herself', 'it', "it's", 'its',
                              'itself', ...])),
 ('clf',
  OneVsRestClassifier(estimator=LogisticRegression(C=1, class_weight='balanced',
                                                   solver='sag')))]

In [53]:
# measuring performance on test set
print("Applying best classifier on test data:")
predictions = grid.best_estimator_.predict(test_x)

print(classification_report(y_test, predictions, target_names=genres))

Applying best classifier on test data:
              precision    recall  f1-score   support

       drama       0.64      0.68      0.66      4032
      comedy       0.66      0.62      0.64      7549
      horror       0.65      0.63      0.64      8027
      action       0.52      0.70      0.60      1691

    accuracy                           0.64     21299
   macro avg       0.62      0.66      0.63     21299
weighted avg       0.64      0.64      0.64     21299



In [54]:
print("Accuracy = ",accuracy_score(y_test,predictions))
print("\n")

Accuracy =  0.6400769989201371


