In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('stopwords')
nltk.download('punkt')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mpoli\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mpoli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
tf_data = pd.read_csv("/Users/mpoli/Desktop/MASTER/DIPLOMATIKI/course_venv/API/API_csvs/tmdb_data.csv")


In [3]:
tf_data.columns

Index(['Unnamed: 0', 'budget', 'homepage', 'id', 'imdb_id', 'revenue',
       'runtime', 'language', 'description', 'popularity', 'title', 'rating',
       'votes', 'year', 'genre1', 'genre2', 'genre3', 'genre4', 'genre5',
       'genre6', 'genre7', 'genre8'],
      dtype='object')

##### first we will merge the columns with the genres in order to apply the new column in the tf-idf method

In [4]:
tf_data.fillna('empty', inplace=True)
tf_data["genres"] = (
    tf_data[['genre1', 'genre2', 'genre3', 'genre4', 'genre5', 'genre6', 'genre7', 'genre8']]
        .apply(lambda x: x.str.split(',\s*'))
        .sum(axis=1).map(np.array)
        .apply(lambda x: ', '.join(np.unique(x[x != 'empty'])))        
        .replace('', 'empty')
    )
tf_data.genres

0                  Action, Adventure, ScienceFiction
1                    Action, Comedy, Crime, Thriller
2                         Adventure, Family, Fantasy
3         Action, Adventure, Fantasy, ScienceFiction
4                         Action, Adventure, Fantasy
                            ...                     
9867                          Drama, Family, Romance
9868                         Crime, Horror, Thriller
9869        Drama, Mystery, ScienceFiction, Thriller
9870                  Comedy, Horror, ScienceFiction
9871    Adventure, Animation, Crime, Family, Mystery
Name: genres, Length: 9872, dtype: object

##### Because this is a multi-label classification problem we will treat thie as a Binary Relevance problem

In [5]:
from sklearn.preprocessing import MultiLabelBinarizer

multilabel_binarizer = MultiLabelBinarizer()
multilabel_binarizer.fit(tf_data['genres'])

# transform target variable
y = multilabel_binarizer.transform(tf_data['genres'])

In [6]:
len(tf_data)

9872

##### description column transform into string type from object type

In [7]:
tf_data['description'] = tf_data['description'].astype("string")
tf_data.dtypes

Unnamed: 0       int64
budget           int64
homepage        object
id               int64
imdb_id         object
revenue          int64
runtime          int64
language        object
description     string
popularity     float64
title           object
rating         float64
votes            int64
year             int64
genre1          object
genre2          object
genre3          object
genre4          object
genre5          object
genre6          object
genre7          object
genre8          object
genres          object
dtype: object

#### Preproccesing the data

##### lowercase

In [8]:
tf_data.description

0       Set more than a decade after the events of the...
1       When a team of mercenaries breaks into a wealt...
2       Siblings Lucy, Edmund, Susan and Peter step th...
3       In the 22nd century, a paraplegic Marine is di...
4       Deep inside the mountain of Dovre, something g...
                              ...                        
9867    Louisa May Alcott's autobiographical account o...
9868    After a serial killer strangles several women ...
9869    At the dawn of the space-race, two radio-obses...
9870    In the town of Dillford, humans, vampires and ...
9871    A mysterious attacker has appeared and is assa...
Name: description, Length: 9872, dtype: string

In [9]:
tf_data['description'] = tf_data['description'].str.lower()
tf_data.description

0       set more than a decade after the events of the...
1       when a team of mercenaries breaks into a wealt...
2       siblings lucy, edmund, susan and peter step th...
3       in the 22nd century, a paraplegic marine is di...
4       deep inside the mountain of dovre, something g...
                              ...                        
9867    louisa may alcott's autobiographical account o...
9868    after a serial killer strangles several women ...
9869    at the dawn of the space-race, two radio-obses...
9870    in the town of dillford, humans, vampires and ...
9871    a mysterious attacker has appeared and is assa...
Name: description, Length: 9872, dtype: string

##### stopwords

In [10]:
#for the libraries
#https://www.geeksforgeeks.org/removing-stop-words-nltk-python/

In [11]:
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [12]:
stop_words = set(stopwords.words('english'))

In [13]:
#https://stackoverflow.com/questions/29523254/python-remove-stop-words-from-pandas-dataframe
tf_data['description'] = tf_data['description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
tf_data.description

0       set decade events first film, learn story sull...
1       team mercenaries breaks wealthy family compoun...
2       siblings lucy, edmund, susan peter step magica...
3       22nd century, paraplegic marine dispatched moo...
4       deep inside mountain dovre, something gigantic...
                              ...                        
9867    louisa may alcott's autobiographical account l...
9868    serial killer strangles several women necktie,...
9869    dawn space-race, two radio-obsessed teens disc...
9870    town dillford, humans, vampires zombies living...
9871    mysterious attacker appeared assaulting people...
Name: description, Length: 9872, dtype: object

##### punctuation

In [14]:
#το κομμα να το βγάλω?

In [15]:
symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n—”“–"
for i in symbols:
    tf_data.description = tf_data.description.str.replace(i," ")
tf_data.description

  tf_data.description = tf_data.description.str.replace(i," ")


0       set decade events first film, learn story sull...
1       team mercenaries breaks wealthy family compoun...
2       siblings lucy, edmund, susan peter step magica...
3       22nd century, paraplegic marine dispatched moo...
4       deep inside mountain dovre, something gigantic...
                              ...                        
9867    louisa may alcott's autobiographical account l...
9868    serial killer strangles several women necktie,...
9869    dawn space race, two radio obsessed teens disc...
9870    town dillford, humans, vampires zombies living...
9871    mysterious attacker appeared assaulting people...
Name: description, Length: 9872, dtype: object

In [16]:
tf_data.to_csv('tf_data.csv')

##### apostrophe

In [17]:
tf_data.description = tf_data.description.str.replace("’"," ")
tf_data.description = tf_data.description.str.replace("‘"," ")
tf_data.description = tf_data.description.str.replace("'"," ")
tf_data.description = tf_data.description.str.replace(",","")
tf_data.description

0       set decade events first film learn story sully...
1       team mercenaries breaks wealthy family compoun...
2       siblings lucy edmund susan peter step magical ...
3       22nd century paraplegic marine dispatched moon...
4       deep inside mountain dovre something gigantic ...
                              ...                        
9867    louisa may alcott s autobiographical account l...
9868    serial killer strangles several women necktie ...
9869    dawn space race two radio obsessed teens disco...
9870    town dillford humans vampires zombies living p...
9871    mysterious attacker appeared assaulting people...
Name: description, Length: 9872, dtype: object

##### stopwords again 

In [18]:
stop_words = set(stopwords.words('english'))
tf_data['description'] = tf_data['description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
tf_data.description

0       set decade events first film learn story sully...
1       team mercenaries breaks wealthy family compoun...
2       siblings lucy edmund susan peter step magical ...
3       22nd century paraplegic marine dispatched moon...
4       deep inside mountain dovre something gigantic ...
                              ...                        
9867    louisa may alcott autobiographical account lif...
9868    serial killer strangles several women necktie ...
9869    dawn space race two radio obsessed teens disco...
9870    town dillford humans vampires zombies living p...
9871    mysterious attacker appeared assaulting people...
Name: description, Length: 9872, dtype: object

##### single characters

In [19]:
tf_data['description'] = tf_data['description'].apply(lambda x: ' '.join([word for word in x.split() if len(word) >1]))
tf_data.description

0       set decade events first film learn story sully...
1       team mercenaries breaks wealthy family compoun...
2       siblings lucy edmund susan peter step magical ...
3       22nd century paraplegic marine dispatched moon...
4       deep inside mountain dovre something gigantic ...
                              ...                        
9867    louisa may alcott autobiographical account lif...
9868    serial killer strangles several women necktie ...
9869    dawn space race two radio obsessed teens disco...
9870    town dillford humans vampires zombies living p...
9871    mysterious attacker appeared assaulting people...
Name: description, Length: 9872, dtype: object

##### convert numbers

In [20]:
import num2words

In [21]:
#https://stackoverflow.com/questions/56733012/converting-number-in-sentences-to-word-in-python 
#is the for exaple 11th etc a problem?
tf_data['description'] = tf_data['description'].apply(lambda x: ' '.join([num2words.num2words(i) if i.isdigit() else i for i in x.split()]))
tf_data['description'] 

0       set decade events first film learn story sully...
1       team mercenaries breaks wealthy family compoun...
2       siblings lucy edmund susan peter step magical ...
3       22nd century paraplegic marine dispatched moon...
4       deep inside mountain dovre something gigantic ...
                              ...                        
9867    louisa may alcott autobiographical account lif...
9868    serial killer strangles several women necktie ...
9869    dawn space race two radio obsessed teens disco...
9870    town dillford humans vampires zombies living p...
9871    mysterious attacker appeared assaulting people...
Name: description, Length: 9872, dtype: object

##### punctuation and stopwords again

In [22]:
symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n—”“–"
for i in symbols:
    tf_data.description = tf_data.description.str.replace(i," ")
tf_data.description

  tf_data.description = tf_data.description.str.replace(i," ")


0       set decade events first film learn story sully...
1       team mercenaries breaks wealthy family compoun...
2       siblings lucy edmund susan peter step magical ...
3       22nd century paraplegic marine dispatched moon...
4       deep inside mountain dovre something gigantic ...
                              ...                        
9867    louisa may alcott autobiographical account lif...
9868    serial killer strangles several women necktie ...
9869    dawn space race two radio obsessed teens disco...
9870    town dillford humans vampires zombies living p...
9871    mysterious attacker appeared assaulting people...
Name: description, Length: 9872, dtype: object

In [23]:
tf_data['description'] = tf_data['description'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

##### stemming

In [24]:
from nltk.stem import PorterStemmer

In [25]:
##with Porter Stemmer
#https://stackoverflow.com/questions/37443138/python-stemming-with-pandas-dataframe
#https://www.projectpro.io/recipes/use-porter-stemmer
ps = PorterStemmer()
tf_data['description'] = tf_data['description'].apply(lambda x: ' '.join([ps.stem(y) for y in x.split()]))
tf_data.description

0       set decad event first film learn stori sulli f...
1       team mercenari break wealthi famili compound c...
2       sibl luci edmund susan peter step magic wardro...
3       22nd centuri parapleg marin dispatch moon pand...
4       deep insid mountain dovr someth gigant awaken ...
                              ...                        
9867    louisa may alcott autobiograph account life th...
9868    serial killer strangl sever women neckti londo...
9869    dawn space race two radio obsess teen discov s...
9870    town dillford human vampir zombi live peac ali...
9871    mysteri attack appear assault peopl whose name...
Name: description, Length: 9872, dtype: object

#### Apply TF-IDF to extract the features

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [27]:
#?TfidfVectorizer
#?train_test_split

##### apply the algorithm of tf-idf

In [28]:
#https://medium.com/@kunalgupta4595/predicting-movie-genres-based-on-plot-summaries-bae646e70e04
X= tf_data['description']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                   test_size = 0.3,
                                   random_state = 42)

tfidf = TfidfVectorizer() #ngram_range=(2,3)
X_train_tfidf = tfidf.fit_transform(X_train) 
X_test_tfidf = tfidf.transform(X_test)
labels = tf_data.genres
X_train_tfidf.shape

(6910, 16991)

In [29]:
#each of 4958 descriptions is represented by 13711 features, 
#representing the tf-idf score for different unigrams and bigrams.

In [30]:
#https://towardsdatascience.com/multi-class-text-classification-with-scikit-learn-12f1e60e0a9f
#edo isos na kano auto me genres se numbers alla ola mazi se ena? ksexorista kai meta ola mazi?

#from sklearn.feature_selection import chi2
#import numpy as np
#N = 2
#for description, genres in sorted(tf_data.genres.items()):
#  features_chi2 = chi2(X_train_tfidf, labels == genres)
# indices = np.argsort(features_chi2[0])
# feature_names = np.array(tfidf.get_feature_names())[indices]
# bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
# trigrams = [v for v in feature_names if len(v.split(' ')) == 3]
# print("# '{}':".format(description))
# print("  . Most correlated unigrams:\n. {}".format('\n. '.join(bigrams[-N:])))
# print("  . Most correlated bigrams:\n. {}".format('\n. '.join(trigrams[-N:])))

##### apply MLP

In [31]:
from sklearn.neural_network import MLPClassifier

In [32]:
#https://slogix.in/source-code/python/deep-learning-samples/how-to-build-spam-detector-using-multi-layer-perceptron-in-python/
clf = MLPClassifier(activation='relu',
                    solver='adam',
                    max_iter=200,
                    hidden_layer_sizes=100,
                    random_state=42,
                    learning_rate='constant',
                    learning_rate_init=0.001)

In [34]:
#fit the model
clf.fit(X_train_tfidf, y_train)
# Predicting the Test set results
y_pred = clf.predict(X_test_tfidf)
y_pred[3]



array([1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 0, 0])

##### inverse_transform( ) function along with the MultiLabelBinarizer( ) object to convert the predicted arrays into movie genre tags

In [40]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred, average="micro")

0.7407368737030318

In [None]:
#We get a decent F1 score of 0.7407368737030318. 
#These predictions were made based on a threshold value of 0.5, 
#which means that the probabilities greater than or equal to 0.5 were converted to 1’s and the rest to 0’s.

In [None]:
#Let’s try to change this threshold value and see if that improves our model’s score:

In [41]:
# predict probabilities
y_pred_prob = clf.predict_proba(X_test_tfidf)

In [42]:
t = 0.3 # threshold value
y_pred_new = (y_pred_prob >= t).astype(int)
f1_score(y_test, y_pred_new, average="micro")

0.7476846565679632

In [43]:
from sklearn import metrics
print("Classification report\n",metrics.classification_report(y_test, y_pred))
print("Accuracy of the model : ",metrics.accuracy_score(y_test, y_pred)*100)

Classification report
               precision    recall  f1-score   support

           0       0.86      0.91      0.89      2492
           1       0.86      0.91      0.89      2492
           2       0.73      0.66      0.69      1239
           3       0.63      0.60      0.61      1232
           4       0.60      0.56      0.58      1255
           5       0.70      0.62      0.65       967
           6       0.57      0.43      0.49       544
           7       0.41      0.24      0.30       394
           8       0.59      0.45      0.51       470
           9       0.70      0.45      0.55       368
          10       0.48      0.45      0.47       788
          11       0.25      0.02      0.03        60
          12       0.78      0.36      0.49       131
          13       0.78      0.80      0.79      2039
          14       0.66      0.68      0.67      1516
          15       0.64      0.59      0.62      1313
          16       0.86      0.92      0.89      2504
    

  _warn_prf(average, modifier, msg_start, len(result))
