In [64]:
# Install scikit-learn into the environment of the running kernel.
# NOTE(review): the version is not pinned — consider `scikit-learn==<version>` for reproducibility.
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [156]:
import ast
import os

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MultiLabelBinarizer

In [66]:
# Both CSV files are expected in a "movies" folder under the kernel's working directory.
current_dir = os.getcwd()
print(current_dir)

movies_dir = os.path.join(current_dir, "movies")
filepath_movies_genres = os.path.join(movies_dir, "movies_genres.csv")
filepath_movies_overview = os.path.join(movies_dir, "movies_overview.csv")

print(filepath_movies_genres, filepath_movies_overview, sep="\n")

d:\Python
d:\Python\movies\movies_genres.csv
d:\Python\movies\movies_overview.csv


In [67]:
# Load the genre table from disk and show which columns it provides.
read_genres = pd.read_csv(filepath_or_buffer=filepath_movies_genres)
print(read_genres.columns)

Index(['id', 'name'], dtype='object')


In [68]:
# Summary statistics of the genre table, printed as plain text.
genre_stats = read_genres.describe()
print(genre_stats)

                 id
count     19.000000
mean    3389.736842
std     4980.456972
min       12.000000
25%       27.500000
50%       53.000000
75%    10025.000000
max    10770.000000


In [69]:
# Peek at the first four genre rows.
print(read_genres.iloc[:4])

   id       name
0  28     Action
1  12  Adventure
2  16  Animation
3  35     Comedy


In [70]:
# Count missing values in each genre column, one number per line.
for column in ('id', 'name'):
    print(read_genres[column].isna().sum())

0
0


In [71]:
# Show the Python type currently bound to `read_genres`.
print(type(read_genres))

<class 'pandas.core.frame.DataFrame'>


In [72]:
# Load the movie table (title, overview text, genre ids) and list its columns.
read_overview = pd.read_csv(filepath_or_buffer=filepath_movies_overview)
print(read_overview.columns)

Index(['title', 'overview', 'genre_ids'], dtype='object')


In [73]:
# Show the Python type currently bound to `read_overview`.
print(type(read_overview))

<class 'pandas.core.frame.DataFrame'>


In [74]:
# Peek at the first four movie rows.
print(read_overview.iloc[:4])

                      title  \
0  The Shawshank Redemption   
1             The Godfather   
2     The Godfather Part II   
3          Schindler's List   

                                            overview        genre_ids  
0  Imprisoned in the 1940s for the double murder ...         [18, 80]  
1  Spanning the years 1945 to 1955, a chronicle o...         [18, 80]  
2  In the continuing saga of the Corleone crime f...         [18, 80]  
3  The true story of how businessman Oskar Schind...  [18, 36, 10752]  


In [75]:
# Count missing values per movie column, one number per line.
for column in ('genre_ids', 'overview', 'title'):
    print(read_overview[column].isna().sum())

0
0
0


In [76]:
# First four rows of the movie table, then of the genre table.
for frame in (read_overview, read_genres):
    print(frame.head(4))

                      title  \
0  The Shawshank Redemption   
1             The Godfather   
2     The Godfather Part II   
3          Schindler's List   

                                            overview        genre_ids  
0  Imprisoned in the 1940s for the double murder ...         [18, 80]  
1  Spanning the years 1945 to 1955, a chronicle o...         [18, 80]  
2  In the continuing saga of the Corleone crime f...         [18, 80]  
3  The true story of how businessman Oskar Schind...  [18, 36, 10752]  
   id       name
0  28     Action
1  12  Adventure
2  16  Animation
3  35     Comedy


In [77]:
# Type of the genre table and its total cell count (rows x columns).
print(type(read_genres), read_genres.size, sep="\n")

<class 'pandas.core.frame.DataFrame'>
38


In [78]:
# Type of the movie table and its total cell count (rows x columns).
print(type(read_overview), read_overview.size, sep="\n")

<class 'pandas.core.frame.DataFrame'>
29940


In [79]:
# Rows whose genre_ids cell equals the literal text '[18, 80]'.
# The comparison is against a string, so it only matches while the column
# still holds the raw CSV text.
exact_match = read_overview['genre_ids'] == '[18, 80]'
print(read_overview[exact_match])

                                           title  \
0                       The Shawshank Redemption   
1                                  The Godfather   
2                          The Godfather Part II   
16                                    GoodFellas   
23                                   City of God   
28                   Once Upon a Time in America   
241                     The Young and the Damned   
319                          Catch Me If You Can   
334                                Trainspotting   
481                           Dancer in the Dark   
482                                 A Bronx Tale   
623                                      Polisse   
1002                                     Brother   
1046                           American Gangster   
1076                              Scarlet Street   
1233                                  Breathless   
1317                                  The Insult   
1360                Once Upon a Time in Anatolia   
1366        

In [80]:
# Parse the stringified lists from the CSV (e.g. '[18, 80]') into real Python lists.
# ast.literal_eval raises ValueError when handed an already-parsed list, so
# values that are no longer strings are passed through unchanged; this keeps
# the cell safe to re-run without reloading the CSV first.
read_overview['genre_ids'] = read_overview['genre_ids'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [81]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
read_overview.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9980 entries, 0 to 9979
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   title      9980 non-null   object
 1   overview   9980 non-null   object
 2   genre_ids  9980 non-null   object
dtypes: object(3)
memory usage: 234.0+ KB
None


In [82]:
def _has_genre_18_or_80(genre_ids):
    """Return True when genre id 18 or 80 appears in ``genre_ids``."""
    return 18 in genre_ids or 80 in genre_ids


# Movies tagged with at least one of the two genre ids.
print(read_overview[read_overview['genre_ids'].apply(_has_genre_18_or_80)])

                         title  \
0     The Shawshank Redemption   
1                The Godfather   
2        The Godfather Part II   
3             Schindler's List   
4                 12 Angry Men   
...                        ...   
9944                Melissa P.   
9954        Dragon Wars: D-War   
9969                    Swiped   
9976               The Fanatic   
9977                    SPF-18   

                                               overview  \
0     Imprisoned in the 1940s for the double murder ...   
1     Spanning the years 1945 to 1955, a chronicle o...   
2     In the continuing saga of the Corleone crime f...   
3     The true story of how businessman Oskar Schind...   
4     The defense and the prosecution have rested an...   
...                                                 ...   
9944  An adolescent girl, living with her mother and...   
9954  Based on the Korean legend, unknown creatures ...   
9969  James, a college freshman and computer genius,...   
9

In [83]:
# First four entries of the genre_ids column.
print(read_overview['genre_ids'].iloc[:4])

0           [18, 80]
1           [18, 80]
2           [18, 80]
3    [18, 36, 10752]
Name: genre_ids, dtype: object


In [84]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
read_genres.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19 entries, 0 to 18
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      19 non-null     int64 
 1   name    19 non-null     object
dtypes: int64(1), object(1)
memory usage: 432.0+ bytes
None


In [85]:
# Turn the genre table into an {id: name} lookup.
# This cell rebinds `read_genres` from a DataFrame to a dict, so the conversion
# is only performed while the name still refers to the DataFrame; re-running
# the cell afterwards is a no-op instead of raising KeyError on the dict.
if isinstance(read_genres, pd.DataFrame):
    read_genres = dict(zip(read_genres['id'], read_genres['name']))

In [86]:
# Display the current contents of `read_genres`.
print(read_genres)

{28: 'Action', 12: 'Adventure', 16: 'Animation', 35: 'Comedy', 80: 'Crime', 99: 'Documentary', 18: 'Drama', 10751: 'Family', 14: 'Fantasy', 36: 'History', 27: 'Horror', 10402: 'Music', 9648: 'Mystery', 10749: 'Romance', 878: 'Science Fiction', 10770: 'TV Movie', 53: 'Thriller', 10752: 'War', 37: 'Western'}


In [87]:
# Show the Python type currently bound to `read_genres`.
print(type(read_genres))

<class 'dict'>


In [88]:
# Translate each movie's genre ids into genre names through the `read_genres`
# lookup; ids that are missing from the lookup become 'Unknown'.
# Entries that are already strings are kept as they are, so re-running this
# cell does not turn every previously translated name into 'Unknown'.
# (The loop variable is no longer called `id`, which shadowed the builtin.)
read_overview['genre_ids'] = read_overview['genre_ids'].apply(
    lambda genre_ids: [
        genre if isinstance(genre, str) else read_genres.get(genre, 'Unknown')
        for genre in genre_ids
    ]
)

In [89]:
# Peek at the first four movie rows after the genre column was rewritten.
print(read_overview.iloc[:4])

                      title  \
0  The Shawshank Redemption   
1             The Godfather   
2     The Godfather Part II   
3          Schindler's List   

                                            overview              genre_ids  
0  Imprisoned in the 1940s for the double murder ...         [Drama, Crime]  
1  Spanning the years 1945 to 1955, a chronicle o...         [Drama, Crime]  
2  In the continuing saga of the Corleone crime f...         [Drama, Crime]  
3  The true story of how businessman Oskar Schind...  [Drama, History, War]  


In [141]:
# Model input: the overview text column of the movie table.
features = read_overview.loc[:, 'overview']
print(features.shape)

(9980,)


In [142]:
# Model target: the per-movie list of genre labels.
target = read_overview.loc[:, 'genre_ids']
print(target.shape)

(9980,)


In [143]:
# Token-count (bag-of-words) text encoder, created with default settings.
vectorizer = CountVectorizer()

In [144]:
# Learn the vocabulary and encode every overview as a row of token counts.
# The raw text is read straight from `read_overview` rather than from
# `features`: this cell rebinds `features` to the encoded matrix, so feeding
# `features` back in would make the cell fail when re-run on its own output.
features = vectorizer.fit_transform(read_overview['overview'])

In [145]:
# Encoder that turns each list of labels into a row of 0/1 indicators.
mlb = MultiLabelBinarizer()

In [146]:
# Build the multi-label indicator matrix: one 0/1 column per distinct genre label.
# The label lists are taken from `read_overview` directly: this cell rebinds
# `target` to the binarized array, and re-running it on that array would
# silently refit the binarizer on 0/1 values instead of the genre labels.
target = mlb.fit_transform(read_overview['genre_ids'])

In [147]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the vectorizer was fitted on all overviews before this split,
# so its vocabulary also covers the held-out rows — consider fitting it on the
# training portion only.
split_parts = train_test_split(features, target, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [150]:
# Multi-label setup: one-vs-rest wrapper that fits a separate MultinomialNB
# per label column. OneVsRestClassifier is imported in the notebook's import
# cell together with the other scikit-learn names.
model = OneVsRestClassifier(MultinomialNB())

In [151]:
# Train the model on the training split.
model.fit(x_train, y_train)

In [152]:
# Predict the labels for the held-out test rows.
y_predict = model.predict(x_test)

In [154]:
# Headings for the evaluation output printed by the following cells
# (spelling of "evaluation" corrected in the first heading).
print("Model evaluation: ")
print("Model accuracy (per label):")

Model evalution: 
Model accuracy (per label):


In [155]:
# Element-wise agreement between prediction and truth, averaged down the rows:
# one accuracy value per label column.
label_matches = (y_predict == y_test)
print(label_matches.mean(axis=0))

[0.84519038 0.85721443 0.92284569 0.78707415 0.86272545 0.72795591
 0.90681363 0.88927856 0.95440882 0.89228457 0.97294589 0.89529058
 0.84418838 0.91482966 0.98496994 0.77755511 0.97344689 0.98246493]


In [157]:
print("\nClassification report: ")
# Some labels are never predicted on the test rows, which makes their precision
# undefined. zero_division=0 reports those scores as 0.0 — the same value the
# default setting falls back to — without emitting an UndefinedMetricWarning.
print(
    classification_report(
        y_test,
        y_predict,
        target_names=mlb.classes_,
        zero_division=0,
    )
)


Classification report: 
                 precision    recall  f1-score   support

         Action       0.69      0.65      0.67       476
      Adventure       0.58      0.46      0.51       326
      Animation       0.74      0.30      0.43       191
         Comedy       0.72      0.68      0.70       721
          Crime       0.63      0.42      0.50       333
          Drama       0.71      0.67      0.69       905
         Family       0.69      0.36      0.47       233
        Fantasy       0.53      0.24      0.33       228
        History       0.33      0.06      0.10        86
         Horror       0.77      0.39      0.52       297
          Music       0.67      0.04      0.07        55
        Mystery       0.42      0.11      0.17       200
        Romance       0.61      0.38      0.47       360
Science Fiction       0.73      0.50      0.59       248
       TV Movie       0.00      0.00      0.00        29
       Thriller       0.61      0.57      0.59       563
     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [158]:
from sklearn.metrics import accuracy_score

In [161]:
# accuracy_score expects (y_true, y_pred); the arguments are passed in that
# order, matching the classification_report call earlier in the notebook.
# For multi-label indicator targets this is subset (exact-match) accuracy: a
# row only counts as correct when every label matches, which is why the value
# is much lower than the per-label accuracies.
print(accuracy_score(y_test, y_predict))

0.14428857715430862
