diff --git a/app/.gitignore b/app/.gitignore index 70c2045..58afa5a 100644 --- a/app/.gitignore +++ b/app/.gitignore @@ -11,7 +11,7 @@ node_modules dist dist-ssr *.local -.pnpm-store +.pnpm # Editor directories and files .vscode/* diff --git a/app/package.json b/app/package.json index 27021bd..8cbe531 100644 --- a/app/package.json +++ b/app/package.json @@ -11,6 +11,7 @@ "storybook": "storybook dev --port 6006 --host 0.0.0.0", "build-storybook": "storybook build" }, + "packageManager": "pnpm@8.6.10", "dependencies": { "@phosphor-icons/vue": "2.0.1", "axios": "^1.4.0", diff --git a/app/src/ApiClientProvider.vue b/app/src/ApiClientProvider.vue index ea9eb1e..98fb7a0 100644 --- a/app/src/ApiClientProvider.vue +++ b/app/src/ApiClientProvider.vue @@ -13,7 +13,7 @@ import { RouteName } from './router'; const router = useRouter(); const apiClient = await useApiClient().apiClient; -(await apiClient) +apiClient .on('unauthorized', () => { if (router.currentRoute.value.name !== 'login') { router.push({ name: RouteName.LOGIN }); diff --git a/docker-compose.test.yml b/docker-compose.test.yml deleted file mode 100644 index 15c662e..0000000 --- a/docker-compose.test.yml +++ /dev/null @@ -1,43 +0,0 @@ -services: - - db: - image: postgres:15.1-alpine3.17 - environment: - POSTGRES_USER: "root" - POSTGRES_PASSWORD: "root" - POSTGRES_DB: "movie_match" - ports: - - "5432:5432" - volumes: - - "db_data:/var/lib/postgresql/data" - networks: - - db - - server: - build: - context: "./server" - volumes: - - "./server/config.development.yaml:/opt/movie-match/config.yaml:ro" - - "server_data_posters:/opt/movie-match/posters" - ports: - - "6445:6445" - depends_on: - - db - networks: - - db - - app: - build: - context: "./app" - environment: - MOVIEMATCH_API_SERVER_BASE_URL: "http://192.168.127.22:6445/" - ports: - - "8080:80" - -networks: - db: - -volumes: - db_data: - server_data_posters: - diff --git a/docker-compose.yml b/docker-compose.yml index 8d5f072..e2355dc 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -18,8 +18,8 @@ services: dockerfile: "docker/dev/api/Dockerfile" volumes: - "./server:/app:ro" + - "./server/data/posters:/var/lib/movie-match/posters" - "api_go_data:/go" - - "api_poster_data:/var/lib/movie-match/posters" ports: - "6445:6445" depends_on: @@ -42,4 +42,3 @@ networks: volumes: db_data: api_go_data: - api_poster_data: diff --git a/docker/dev/app/Dockerfile b/docker/dev/app/Dockerfile index fe7c9e0..d581cda 100644 --- a/docker/dev/app/Dockerfile +++ b/docker/dev/app/Dockerfile @@ -1,11 +1,12 @@ -FROM node:18.16.0-alpine3.17 +FROM node:18.16.0-alpine3.17 AS base -RUN npm install --location=global pnpm@8.5.1 +ENV PNPM_HOME="/app/.pnpm" +ENV PATH="$PNPM_HOME:$PATH" + +RUN corepack enable COPY docker/dev/app/wrapper.sh /wrapper.sh WORKDIR /app -USER node - CMD [ "/wrapper.sh" ] diff --git a/docker/dev/app/wrapper.sh b/docker/dev/app/wrapper.sh index 1b109a9..d73e18e 100755 --- a/docker/dev/app/wrapper.sh +++ b/docker/dev/app/wrapper.sh @@ -1,3 +1,3 @@ #!/bin/sh -echo 'y' | pnpm install --frozen-lockfile && exec pnpm dev +pnpm install --frozen-lockfile && exec pnpm dev diff --git a/recommender/prep/all2json.ts b/recommender/prep/all2json.ts index 5a21337..d99d61a 100644 --- a/recommender/prep/all2json.ts +++ b/recommender/prep/all2json.ts @@ -9,8 +9,8 @@ const enum MediaType { const run = async () => { const client = new Client({ host: 'localhost', - user: 'pgres', - password: 'pgres', + user: 'root', + password: 'root', database: 'movie_match', }); diff --git a/recommender/prep/package.json b/recommender/prep/package.json index e294049..d2b447b 100644 --- a/recommender/prep/package.json +++ b/recommender/prep/package.json @@ -4,7 +4,9 @@ "description": "Prep scripts for recommender", "scripts": { "voted": "ts-node vote2json.ts", - "all": "ts-node all2json.ts" + "all": "ts-node all2json.ts", + "full": "ts-node vote2json.ts && ts-node all2json.ts", + "wordcount": "ts-node wordcount.ts" }, "keywords": [], "author": "", diff --git a/recommender/prep/vote2json.ts b/recommender/prep/vote2json.ts index 71b30cd..5bf87e3 100644 --- a/recommender/prep/vote2json.ts +++ b/recommender/prep/vote2json.ts @@ -21,8 +21,8 @@ const enum MediaType { const run = async () => { const client = new Client({ host: 'localhost', - user: 'pgres', - password: 'pgres', + user: 'root', + password: 'root', database: 'movie_match', }); diff --git a/recommender/scripts/predict_media_votes.py b/recommender/scripts/predict_media_votes.py new file mode 100644 index 0000000..b5a245e --- /dev/null +++ b/recommender/scripts/predict_media_votes.py @@ -0,0 +1,45 @@ +import pandas as pd +from sklearn.neural_network import MLPRegressor +from sklearn.pipeline import make_pipeline +from sklearn.preprocessing import StandardScaler, MinMaxScaler + +from scripts.processing import process_media_flat + +print("processing media ...") + +media_voted_raw = pd.read_json('../data/media_all_voted.json') +media_voted = process_media_flat(media_voted_raw) + +pipe = make_pipeline( + StandardScaler(), + MLPRegressor( + max_iter=800, + verbose=True, + random_state=42, + learning_rate_init=0.0001 + ), +) + +x = media_voted.drop(columns=['voteType']) +y = media_voted['voteType'].astype(float) + +pipe.fit(x, y) + +# vote all media + +media_all_raw = pd.read_json('../data/media_all.json') +media_all = process_media_flat(media_all_raw) + +pred_all = pipe.predict(media_all) + +scaler = MinMaxScaler(feature_range=(-1, 1)) +scaled_pred = scaler.fit_transform(pred_all.reshape(-1, 1)).flatten() + +predicted_media = media_all_raw.copy() + +predicted_media['voteType'] = scaled_pred +predicted_media['mediaId'] = media_all_raw['mediaId'] + + +# predicted_media = predicted_media.loc[~predicted_media['mediaId'].isin(media_voted_raw['mediaId'])] +predicted_media = predicted_media.filter(regex='^(?!.*_y$)') diff --git a/recommender/scripts/predict_media_votes_tf.py b/recommender/scripts/predict_media_votes_tf.py new file mode 100644 index 0000000..fe8e6e6 --- /dev/null +++ b/recommender/scripts/predict_media_votes_tf.py @@ -0,0 +1,142 @@ +# this does not work at all + +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from keras_preprocessing.sequence import pad_sequences +from keras_preprocessing.text import Tokenizer +from tensorflow.python.keras import Input +from tensorflow.python.keras.layers import Embedding, Flatten, Concatenate, Dense +from tensorflow.python.keras.models import Model + +from scripts.processing import sanitize_text + +print("processing media ...") + +media_voted_raw = pd.read_json('../data/media_all_voted.json') + +media = sanitize_text(media_voted_raw) + +mediaType_input = Input(shape=(1,)) +mediaRating_input = Input(shape=(1,)) + +genre_input = Input(shape=(4,)) + +summary_input = Input(shape=(100,)) +title_input = Input(shape=(5,)) + +num_genres = 27 +max_words = 1000 + +embedding_dim = 8 + +genre_emb = Embedding(input_dim=num_genres * 4, output_dim=embedding_dim)(genre_input) + +summary_emb = Embedding(input_dim=max_words, output_dim=embedding_dim)(summary_input) +title_emb = Embedding(input_dim=max_words, output_dim=embedding_dim)(title_input) + +genre_flat = Flatten()(genre_emb) +summary_flat = Flatten()(summary_emb) +title_flat = Flatten()(title_emb) + +mediaType_data = np.array(media['mediaType']) + +genre0_data = np.array(media['genre0']) +genre1_data = np.array(media['genre1']) +genre2_data = np.array(media['genre2']) +genre3_data = np.array(media['genre3']) + +mediaRating_data = np.array(media['mediaRating']) + +summary_data = np.array(media['mediaSummary']) +title_data = np.array(media['mediaTitle']) + +target = np.array(media['voteType']).astype(float) # The target values between -1 and 1 + +# tokenize + +tokenizer_general = Tokenizer(num_words=max_words, oov_token='UNK') +tokenizer_general.fit_on_texts(np.concatenate([summary_data, title_data])) + +summary_seq = tokenizer_general.texts_to_sequences(summary_data) +title_seq = tokenizer_general.texts_to_sequences(title_data) + +tokenizer_genres = Tokenizer(num_words=num_genres, oov_token='UNK') +tokenizer_genres.fit_on_texts(np.concatenate([genre0_data, genre1_data, genre2_data, genre3_data])) + +genre0_seq = tokenizer_genres.texts_to_sequences(genre0_data) +genre1_seq = tokenizer_genres.texts_to_sequences(genre1_data) +genre2_seq = tokenizer_genres.texts_to_sequences(genre2_data) +genre3_seq = tokenizer_genres.texts_to_sequences(genre3_data) + +summary_data_padded = pad_sequences(summary_seq, maxlen=100, padding='post') +title_data_padded = pad_sequences(title_seq, maxlen=5, padding='post') + +genre0_data_padded = pad_sequences(genre0_seq, maxlen=1, padding='post') +genre1_data_padded = pad_sequences(genre1_seq, maxlen=1, padding='post') +genre2_data_padded = pad_sequences(genre2_seq, maxlen=1, padding='post') +genre3_data_padded = pad_sequences(genre3_seq, maxlen=1, padding='post') + +genre_data_combined = np.column_stack([genre0_data_padded, genre1_data_padded, genre2_data_padded, genre3_data_padded]) + +concatenated = Concatenate()([ + mediaType_input, + genre_flat, + mediaRating_input, + summary_flat, + title_flat +]) + +# Define the dense neural network +dense_layer_1 = Dense(128, activation='relu')(concatenated) +dense_layer_2 = Dense(64, activation='relu')(dense_layer_1) +output = Dense(1, activation='sigmoid')(dense_layer_2) # Sigmoid activation for output between -1 and 1 + +# Define the model with all inputs and the output +model = Model( + inputs=[ + mediaType_input, + genre_input, + mediaRating_input, + summary_input, + title_input + ], + outputs=output +) + +# Compile the model +model.compile(optimizer='adam', loss='mean_squared_error') + +# Print the summary of the model +model.summary() + +print("mediaType_data shape:", mediaType_data.shape) +print("genre_data_combined shape:", genre_data_combined.shape) +print("mediaRating_data shape:", mediaRating_data.shape) +print("summary_data_padded shape:", summary_data_padded.shape) +print("title_data_padded shape:", title_data_padded.shape) +print("target shape:", target.shape) + +input_data = [ + mediaType_data, + genre_data_combined, + mediaRating_data, + summary_data_padded, + title_data_padded +] + +# Train the model +history = model.fit( + x=input_data, + y=target, + validation_data=(input_data, target), + epochs=50, + steps_per_epoch=3000, + validation_steps=3000 +) + +plt.plot(history.history['acc'], label='Training Loss') +plt.xlabel('Epoch') +plt.ylabel('Loss') +plt.legend() +plt.show() diff --git a/recommender/scripts/recommend_media.py b/recommender/scripts/processing.py similarity index 65% rename from recommender/scripts/recommend_media.py rename to recommender/scripts/processing.py index 92fe885..2583911 100644 --- a/recommender/scripts/recommend_media.py +++ b/recommender/scripts/processing.py @@ -1,24 +1,27 @@ import re import string -import pandas as pd from keras_preprocessing.sequence import pad_sequences from keras_preprocessing.text import Tokenizer from nltk.corpus import stopwords -from sklearn.naive_bayes import GaussianNB -def process_media(media): - # remove unneeded columns - # not sure how to process mediaReleaseDate for now - maybe use unix seconds since 1970 +def sanitize_text(media): + # fill empty values - media = media.drop(columns=['mediaId', 'mediaReleaseDate']) + media = media.fillna('') # remove punctuation media['mediaSummary'] = media['mediaSummary'].apply(lambda x: re.sub('[^\w\s]', '', x)) + media['mediaTitle'] = media['mediaTitle'].apply(lambda x: re.sub('[^\w\s]', '', x)) - # remove stopwords + media['genre0'] = media['genre0'].apply(lambda x: re.sub('\s', '', x)) + media['genre1'] = media['genre1'].apply(lambda x: re.sub('\s', '', x)) + media['genre2'] = media['genre2'].apply(lambda x: re.sub('\s', '', x)) + media['genre3'] = media['genre3'].apply(lambda x: re.sub('\s', '', x)) + + # remove stop words stops = stopwords.words('german') @@ -32,9 +35,18 @@ def process_media(media): word not in stops]) ) - # fill null with empty strings + return media + - media.fillna('', inplace=True) +def process_media_flat(media): + # remove unneeded columns + # not sure how to process mediaReleaseDate for now - maybe use unix seconds since 1970 + + media = media.drop(columns=['mediaId', 'mediaReleaseDate']) + + # remove stopwords + + media = sanitize_text(media) # fit tokenizer for genres @@ -67,7 +79,7 @@ def process_media(media): # map prosa to tokens - summary_word_count = 20 + summary_word_count = 60 summary_seqs = pad_sequences(prosa_tokenizer.texts_to_sequences(media['mediaSummary']), maxlen=summary_word_count, padding='post') @@ -84,42 +96,3 @@ def process_media(media): media[f'title{i}'] = title_seqs[:, i] return media.drop(columns=['mediaSummary', 'mediaTitle']) - - -def main(): - print("hello world!") - - media_voted_raw = pd.read_json('../data/media_all_voted.json') - media_voted = process_media(media_voted_raw) - - # x_train, x_test, y_train, y_test = train_test_split(media, media['voteType'], test_size=.25, random_state=0) - # cm = confusion_matrix(y_test, y_pred) - - classifier = GaussianNB() - - x = media_voted.drop(columns=['voteType']) - y = media_voted['voteType'] - - classifier.fit(x, y) - - # vote all media - - media_all_raw = pd.read_json('../data/media_all.json') - media_all = process_media(media_all_raw) - - pred_all = classifier.predict(media_all) - - media_all_raw['voteType'] = pred_all - - recommended_media = media_all_raw[media_all_raw['voteType'] == 1] - - merged_df = recommended_media.merge(media_voted_raw, on='mediaId', how='left', indicator=True) - - recommended_media = merged_df[merged_df['_merge'] == 'left_only'] - recommended_media.drop(columns=['_merge'], inplace=True) - - media_voted.head() - - -if __name__ == "__main__": - main() diff --git a/recommender/scripts/wordcount.py b/recommender/scripts/wordcount.py new file mode 100644 index 0000000..f4c12cb --- /dev/null +++ b/recommender/scripts/wordcount.py @@ -0,0 +1,25 @@ +import matplotlib.pyplot as plt +import pandas as pd + +from scripts.processing import sanitize_text + +media_raw = pd.read_json('../data/media_all.json') + +media = sanitize_text(media_raw) + +media['title_word_count'] = media['mediaTitle'].apply( + lambda x: len(x.split(sep=' ')) +) + +media['summary_word_count'] = media['mediaSummary'].apply( + lambda x: len(x.split(sep=' ')) +) + +plt.figure() +media.boxplot(column='title_word_count', whis=[5, 95]) +plt.show() + +plt.figure() +media.boxplot(column='summary_word_count', whis=[5, 95]) +plt.show() + diff --git a/server/.gitignore b/server/.gitignore index 680b980..6ce0993 100644 --- a/server/.gitignore +++ b/server/.gitignore @@ -1,3 +1,6 @@ .idea/ config*.yml config*.yaml + +data/**/* +!data/.gitkeep diff --git a/server/data/.gitkeep b/server/data/.gitkeep new file mode 100644 index 0000000..e69de29