Skip to content

Commit

Permalink
implement recommender in python
Browse files Browse the repository at this point in the history
  • Loading branch information
nitwhiz committed Jul 30, 2023
1 parent e02825f commit 76b887c
Show file tree
Hide file tree
Showing 16 changed files with 254 additions and 106 deletions.
2 changes: 1 addition & 1 deletion app/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ node_modules
dist
dist-ssr
*.local
.pnpm-store
.pnpm

# Editor directories and files
.vscode/*
Expand Down
1 change: 1 addition & 0 deletions app/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"storybook": "storybook dev --port 6006 --host 0.0.0.0",
"build-storybook": "storybook build"
},
"packageManager": "pnpm@8.6.10",
"dependencies": {
"@phosphor-icons/vue": "2.0.1",
"axios": "^1.4.0",
Expand Down
2 changes: 1 addition & 1 deletion app/src/ApiClientProvider.vue
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ import { RouteName } from './router';
const router = useRouter();
const apiClient = await useApiClient().apiClient;
(await apiClient)
apiClient
.on('unauthorized', () => {
if (router.currentRoute.value.name !== 'login') {
router.push({ name: RouteName.LOGIN });
Expand Down
43 changes: 0 additions & 43 deletions docker-compose.test.yml

This file was deleted.

3 changes: 1 addition & 2 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ services:
dockerfile: "docker/dev/api/Dockerfile"
volumes:
- "./server:/app:ro"
- "./server/data/posters:/var/lib/movie-match/posters"
- "api_go_data:/go"
- "api_poster_data:/var/lib/movie-match/posters"
ports:
- "6445:6445"
depends_on:
Expand All @@ -42,4 +42,3 @@ networks:
volumes:
db_data:
api_go_data:
api_poster_data:
9 changes: 5 additions & 4 deletions docker/dev/app/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
FROM node:18.16.0-alpine3.17
FROM node:18.16.0-alpine3.17 AS base

RUN npm install --location=global pnpm@8.5.1
ENV PNPM_HOME="/app/.pnpm"
ENV PATH="$PNPM_HOME:$PATH"

RUN corepack enable

COPY docker/dev/app/wrapper.sh /wrapper.sh

WORKDIR /app

USER node

CMD [ "/wrapper.sh" ]
2 changes: 1 addition & 1 deletion docker/dev/app/wrapper.sh
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
#!/bin/sh

echo 'y' | pnpm install --frozen-lockfile && exec pnpm dev
pnpm install --frozen-lockfile && exec pnpm dev
4 changes: 2 additions & 2 deletions recommender/prep/all2json.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@ const enum MediaType {
const run = async () => {
const client = new Client({
host: 'localhost',
user: 'pgres',
password: 'pgres',
user: 'root',
password: 'root',
database: 'movie_match',
});

Expand Down
4 changes: 3 additions & 1 deletion recommender/prep/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@
"description": "Prep scripts for recommender",
"scripts": {
"voted": "ts-node vote2json.ts",
"all": "ts-node all2json.ts"
"all": "ts-node all2json.ts",
"full": "ts-node vote2json.ts && ts-node all2json.ts",
"wordcount": "ts-node wordcount.ts"
},
"keywords": [],
"author": "",
Expand Down
4 changes: 2 additions & 2 deletions recommender/prep/vote2json.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,8 @@ const enum MediaType {
const run = async () => {
const client = new Client({
host: 'localhost',
user: 'pgres',
password: 'pgres',
user: 'root',
password: 'root',
database: 'movie_match',
});

Expand Down
45 changes: 45 additions & 0 deletions recommender/scripts/predict_media_votes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
import pandas as pd
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from scripts.processing import process_media_flat

# Train an MLP regressor on the media the user has already voted on, then
# predict a vote score for the entire catalogue and rescale the predictions
# into the vote range [-1, 1].

print("processing media ...")

media_voted_raw = pd.read_json('../data/media_all_voted.json')
media_voted = process_media_flat(media_voted_raw)

# StandardScaler normalizes the features before they reach the MLP;
# random_state pins weight initialization so runs are reproducible.
pipe = make_pipeline(
    StandardScaler(),
    MLPRegressor(
        max_iter=800,
        verbose=True,
        random_state=42,
        learning_rate_init=0.0001
    ),
)

x = media_voted.drop(columns=['voteType'])
y = media_voted['voteType'].astype(float)

pipe.fit(x, y)

# Predict a vote for every media entry (voted and unvoted alike).

media_all_raw = pd.read_json('../data/media_all.json')
media_all = process_media_flat(media_all_raw)

pred_all = pipe.predict(media_all)

# Raw regressor output is unbounded; squash the predictions into [-1, 1]
# so they are directly comparable to real votes.
scaler = MinMaxScaler(feature_range=(-1, 1))
scaled_pred = scaler.fit_transform(pred_all.reshape(-1, 1)).flatten()

predicted_media = media_all_raw.copy()
predicted_media['voteType'] = scaled_pred
# NOTE(review): the original also re-assigned 'mediaId' from media_all_raw,
# a no-op since predicted_media is already a copy of that frame — removed.

# Uncomment to exclude media the user has already voted on:
# predicted_media = predicted_media.loc[~predicted_media['mediaId'].isin(media_voted_raw['mediaId'])]

# Drop any leftover '*_y' merge-suffix columns that survived preprocessing.
predicted_media = predicted_media.filter(regex='^(?!.*_y$)')
142 changes: 142 additions & 0 deletions recommender/scripts/predict_media_votes_tf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
# NOTE(review): the original author marked this script "does not work at all".
# Two concrete defects are fixed below (output activation, history key); the
# overall training setup remains experimental.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras_preprocessing.sequence import pad_sequences
from keras_preprocessing.text import Tokenizer
from tensorflow.python.keras import Input
from tensorflow.python.keras.layers import Embedding, Flatten, Concatenate, Dense
from tensorflow.python.keras.models import Model

from scripts.processing import sanitize_text

print("processing media ...")

media_voted_raw = pd.read_json('../data/media_all_voted.json')

media = sanitize_text(media_voted_raw)

# Scalar inputs.
mediaType_input = Input(shape=(1,))
mediaRating_input = Input(shape=(1,))

# Up to four genre tokens per media entry.
genre_input = Input(shape=(4,))

# Tokenized, padded text inputs: 100 tokens of summary, 5 of title.
summary_input = Input(shape=(100,))
title_input = Input(shape=(5,))

num_genres = 27
max_words = 1000

embedding_dim = 8

genre_emb = Embedding(input_dim=num_genres * 4, output_dim=embedding_dim)(genre_input)

summary_emb = Embedding(input_dim=max_words, output_dim=embedding_dim)(summary_input)
title_emb = Embedding(input_dim=max_words, output_dim=embedding_dim)(title_input)

genre_flat = Flatten()(genre_emb)
summary_flat = Flatten()(summary_emb)
title_flat = Flatten()(title_emb)

mediaType_data = np.array(media['mediaType'])

genre0_data = np.array(media['genre0'])
genre1_data = np.array(media['genre1'])
genre2_data = np.array(media['genre2'])
genre3_data = np.array(media['genre3'])

mediaRating_data = np.array(media['mediaRating'])

summary_data = np.array(media['mediaSummary'])
title_data = np.array(media['mediaTitle'])

target = np.array(media['voteType']).astype(float)  # The target values between -1 and 1

# Tokenize: one shared vocabulary for summary + title text, a separate
# small vocabulary for the genre labels.

tokenizer_general = Tokenizer(num_words=max_words, oov_token='UNK')
tokenizer_general.fit_on_texts(np.concatenate([summary_data, title_data]))

summary_seq = tokenizer_general.texts_to_sequences(summary_data)
title_seq = tokenizer_general.texts_to_sequences(title_data)

tokenizer_genres = Tokenizer(num_words=num_genres, oov_token='UNK')
tokenizer_genres.fit_on_texts(np.concatenate([genre0_data, genre1_data, genre2_data, genre3_data]))

genre0_seq = tokenizer_genres.texts_to_sequences(genre0_data)
genre1_seq = tokenizer_genres.texts_to_sequences(genre1_data)
genre2_seq = tokenizer_genres.texts_to_sequences(genre2_data)
genre3_seq = tokenizer_genres.texts_to_sequences(genre3_data)

summary_data_padded = pad_sequences(summary_seq, maxlen=100, padding='post')
title_data_padded = pad_sequences(title_seq, maxlen=5, padding='post')

genre0_data_padded = pad_sequences(genre0_seq, maxlen=1, padding='post')
genre1_data_padded = pad_sequences(genre1_seq, maxlen=1, padding='post')
genre2_data_padded = pad_sequences(genre2_seq, maxlen=1, padding='post')
genre3_data_padded = pad_sequences(genre3_seq, maxlen=1, padding='post')

genre_data_combined = np.column_stack([genre0_data_padded, genre1_data_padded, genre2_data_padded, genre3_data_padded])

concatenated = Concatenate()([
    mediaType_input,
    genre_flat,
    mediaRating_input,
    summary_flat,
    title_flat
])

# Define the dense neural network.
dense_layer_1 = Dense(128, activation='relu')(concatenated)
dense_layer_2 = Dense(64, activation='relu')(dense_layer_1)
# FIX: the original used 'sigmoid', which outputs (0, 1) and can never reach
# negative vote targets; tanh outputs (-1, 1), matching the target range.
output = Dense(1, activation='tanh')(dense_layer_2)

# Define the model with all inputs and the output.
model = Model(
    inputs=[
        mediaType_input,
        genre_input,
        mediaRating_input,
        summary_input,
        title_input
    ],
    outputs=output
)

# Compile the model.
model.compile(optimizer='adam', loss='mean_squared_error')

# Print the summary of the model.
model.summary()

print("mediaType_data shape:", mediaType_data.shape)
print("genre_data_combined shape:", genre_data_combined.shape)
print("mediaRating_data shape:", mediaRating_data.shape)
print("summary_data_padded shape:", summary_data_padded.shape)
print("title_data_padded shape:", title_data_padded.shape)
print("target shape:", target.shape)

input_data = [
    mediaType_data,
    genre_data_combined,
    mediaRating_data,
    summary_data_padded,
    title_data_padded
]

# Train the model. NOTE(review): validation_data reuses the training set, so
# the validation numbers do not measure generalization.
history = model.fit(
    x=input_data,
    y=target,
    validation_data=(input_data, target),
    epochs=50,
    steps_per_epoch=3000,
    validation_steps=3000
)

# FIX: the model is compiled without metrics, so history.history only has
# 'loss'/'val_loss'; the original indexed 'acc', which raises a KeyError.
plt.plot(history.history['loss'], label='Training Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()
Loading

0 comments on commit 76b887c

Please sign in to comment.