In [2]:
import re

import numpy as np
import pandas as pd
from keras.utils import pad_sequences
from keras_preprocessing.text import Tokenizer
from nltk.corpus import stopwords
import string
import tensorflow as tf
from sklearn.model_selection import train_test_split

media = pd.read_json('../data/media-200.json', lines=True)

# transpose genres into columns

def extract_names(row):
    if row is None:
        return []
    return sorted([obj['name'].lower() for obj in row])

max_names = media['genres'].apply(lambda x: len(extract_names(x))).max()
column_names = [f'genre_{i+1}' for i in range(max_names)]

media[column_names] = pd.DataFrame(media['genres'].apply(lambda x: extract_names(x)).tolist())

media[['summary', 'title']] = media[['summary', 'title']].apply(lambda x: x.apply(lambda y: y.lower()))

media = media.drop(columns=['id', 'genres', 'createdAt', 'updatedAt'])

# remove punctuation

media['summary'] = media['summary'].apply(lambda x: re.sub('[^\w\s]','', x))

# remove stopwords

stops = stopwords.words('german')

media['summary'] = media['summary'].apply(
    lambda x: ' '.join([word for word in x.lower().translate(x.maketrans('', '', string.punctuation)).split() if word not in stops])
)

media.fillna('', inplace=True)

media.head(n=200)

2023-05-28 01:18:45.250665: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Unnamed: 0,foreignId,type,provider,title,summary,runtime,rating,releaseDate,genre_1,genre_2,genre_3,genre_4
0,773655,movie,tmdb,death to 2020,macher black mirror blicken hochkarätigen star...,71,65,2020-12-27T00:00:00Z,komödie,,,
1,598331,movie,tmdb,monster-liga,stadt stoker beliebten lokalmatador monsterwre...,95,69,2021-12-15T00:00:00Z,animation,familie,,
2,598268,movie,tmdb,the whaler boy,leshka 15jähriger inuitwaljäger sibirischen pa...,93,61,2020-10-08T00:00:00Z,drama,,,
3,655,movie,tmdb,"paris, texas",vier jahre verschwinden taucht tot geglaubte t...,138,81,1984-08-23T00:00:00Z,drama,,,
4,657,movie,tmdb,james bond 007 - liebesgrüße aus moskau,james bond istanbul russische dechiffriermasch...,115,71,1963-10-10T00:00:00Z,abenteuer,action,thriller,
...,...,...,...,...,...,...,...,...,...,...,...,...
195,45,tv,tmdb,top gear,spektakulärsten autos welt testfahrten vulkan ...,60,74,2002-10-20T00:00:00Z,dokumentarfilm,komödie,talk,
196,86878,tv,tmdb,klinik am südring - youtube,klinik südring bieten echte ärzte schwestern e...,0,0,2017-11-01T00:00:00Z,drama,,,
197,197032,tv,tmdb,good times island,zweiteiliges penpaper deutschen senders rocket...,118,0,2017-09-23T00:00:00Z,komödie,mystery,,
198,86943,tv,tmdb,girls from ipanema,hausfrau 1950er jahren geht rio de janeiro man...,60,69,2019-03-22T00:00:00Z,drama,,,


In [3]:
# add vote to movies
# vote all "historie" movies "positive"

#  1 = positive
#  0 = neutral
# -1 = negative

def vote(row):
    for i in range(max_names):
        if row[f'genre_{i+1}'] == 'historie':
            return 1

    return 0

media['vote'] = media.apply(lambda x: vote(x), axis=1)

media.head(n=100)

Unnamed: 0,foreignId,type,provider,title,summary,runtime,rating,releaseDate,genre_1,genre_2,genre_3,genre_4,vote
0,773655,movie,tmdb,death to 2020,macher black mirror blicken hochkarätigen star...,71,65,2020-12-27T00:00:00Z,komödie,,,,0
1,598331,movie,tmdb,monster-liga,stadt stoker beliebten lokalmatador monsterwre...,95,69,2021-12-15T00:00:00Z,animation,familie,,,0
2,598268,movie,tmdb,the whaler boy,leshka 15jähriger inuitwaljäger sibirischen pa...,93,61,2020-10-08T00:00:00Z,drama,,,,0
3,655,movie,tmdb,"paris, texas",vier jahre verschwinden taucht tot geglaubte t...,138,81,1984-08-23T00:00:00Z,drama,,,,0
4,657,movie,tmdb,james bond 007 - liebesgrüße aus moskau,james bond istanbul russische dechiffriermasch...,115,71,1963-10-10T00:00:00Z,abenteuer,action,thriller,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,425199,movie,tmdb,la voleuse de saint-lubin,alleinerziehende mutter zweier kinder stiehlt ...,78,0,2001-11-09T00:00:00Z,drama,,,,0
96,425202,movie,tmdb,verdammt in alle eitelkeit,jüdin theresienstadt überlebte ganzes wiedergu...,80,0,2000-02-19T00:00:00Z,drama,komödie,,,0
97,425206,movie,tmdb,vor der hochzeit,dokumentarfilm 23jährige türkin berlin eltern ...,87,0,1997-02-15T00:00:00Z,dokumentarfilm,,,,0
98,425209,movie,tmdb,playmobil: römer & ägypter,ersten mal junge legionär quintus römischen fl...,13,0,2017-01-09T00:00:00Z,animation,,,,0


In [4]:
genre_tok = Tokenizer(oov_token='<UnknownGenre>')

for i in range(max_names):
    genre_tok.fit_on_texts(media[f'genre_{i+1}'])

print(genre_tok.word_index)

{'<UnknownGenre>': 1, 'drama': 2, 'komödie': 3, 'action': 4, 'thriller': 5, 'abenteuer': 6, 'fantasy': 7, 'animation': 8, 'krimi': 9, 'dokumentarfilm': 10, 'adventure': 11, 'mystery': 12, 'sci': 13, 'fi': 14, 'familie': 15, 'liebesfilm': 16, 'science': 17, 'fiction': 18, 'reality': 19, 'horror': 20, 'talk': 21, 'historie': 22, 'musik': 23, 'kids': 24, 'kriegsfilm': 25, 'western': 26, 'war': 27, 'politics': 28, 'news': 29, 'tv': 30, 'film': 31}


In [5]:
summary_tok = Tokenizer(oov_token='<OOV>')

summary_tok.fit_on_texts(media['summary'])

summary_total_words = len(summary_tok.word_index) + 1

print(summary_tok.word_index)


{'<OOV>': 1, 'leben': 2, 'welt': 3, 'dabei': 4, 'familie': 5, 'junge': 6, 'jahre': 7, 'frau': 8, 'immer': 9, 'beiden': 10, 'stadt': 11, 'jahr': 12, 'harry': 13, 'macht': 14, 'kommt': 15, 'mann': 16, 'eltern': 17, 'bond': 18, 'beginnt': 19, 'serie': 20, 'trifft': 21, 'james': 22, 'zwei': 23, 'wurde': 24, 'jungen': 25, 'schon': 26, 'ganz': 27, 'jedoch': 28, 'zusammen': 29, 'menschen': 30, 'suche': 31, 'führen': 32, 'deren': 33, 'zeit': 34, 'abenteuer': 35, 'liebe': 36, 'mädchen': 37, 'mehr': 38, 'zurück': 39, 'finden': 40, 'neuen': 41, 'hilfe': 42, 'drei': 43, 'gerade': 44, 'lange': 45, 'stellen': 46, 'new': 47, 'geschichte': 48, 'planeten': 49, 'versucht': 50, 'lebt': 51, 'vier': 52, 'bruder': 53, 'haus': 54, 'gemeinsam': 55, 'geht': 56, 'neue': 57, 'handelt': 58, 'bald': 59, 'kennen': 60, 'gelingt': 61, 'jahren': 62, 'erde': 63, 'frauen': 64, 'insel': 65, 'deutsche': 66, 'stellt': 67, 'vater': 68, 'treffen': 69, 'tod': 70, 'entführt': 71, 'judah': 72, 'nachdem': 73, 'lernt': 74, 'freun

In [10]:
x_train, x_test, y_train, y_test = train_test_split(media[['genre_1', 'genre_2']], media[['vote']])

print(y_test)

     vote
146     0
168     0
192     0
69      0
158     0
65      0
82      0
106     0
78      0
125     0
66      0
165     0
85      0
83      0
12      1
93      0
187     0
97      0
100     0
33      0
193     0
68      1
22      0
107     0
130     0
24      0
104     0
194     0
162     0
118     0
196     0
140     0
137     0
79      0
80      0
174     0
139     0
38      0
49      0
183     0
84      0
172     0
41      0
108     0
151     0
13      0
154     0
96      0
134     0
176     0
