# Data pipelines
Here I design and test the functions that receive the raw data and convert it into its final form (ready to be fed into the final model).

In [None]:
# import the libraries needed
import pandas as pd
import numpy as np
from datetime import datetime
import re
from sklearn.model_selection import train_test_split

In [None]:
# Convert the raw dataset into a preprocessed .csv
def raw_to_processed(raw_movies, raw_ratings):
    pro_movies = raw_movies.copy()

    # Extract the Year from the Title of the Movie (if its between parenthesis)
    pro_movies['Year'] = pro_movies['title'].apply(
        lambda x: int(x.split("(")[-1][:4].replace(")", "").strip()) # if there are 2 years (like 2006-2010), the first year is taken
            if "(" in x else np.nan)    # if theres a ( in the Name, set the year, else, a NA
            
    pro_movies['title'] = pro_movies['title'].apply(lambda x: re.sub("[\(\[].*?[\)\]]", "", x).strip())

    # copy raw ratings data:
    pro_ratings = raw_ratings.copy()

    # Add date column
    pro_ratings['Date'] = pro_ratings['timestamp'].apply(lambda x: datetime.fromtimestamp(x))
    
    # One-hot encoding of genres: 

    # First get a list of lists (each list is the list of genres for each movie)
    aux = [i.split('|') for i in pro_movies.genres.unique()]
    # Then create a set (unique array of elements) and remove the no genres listed
    vocab = list(set(i for k in aux for i in k))
    vocab.remove('(no genres listed)')
    print("Genres present in the dataset: ", vocab)

    # Now, create a column for each genre:
    for genre in vocab:
        pro_movies[genre] = pro_movies.genres.apply(lambda x: 1 if genre in x else 0)

    pro_ratings.to_csv("../data/processed/pro_ratings.csv", encoding='utf-8')
    pro_movies.to_csv("../data/processed/pro_movies.csv", encoding='utf-8')


In [None]:
# function that returns a tf dataset ready to be fed to the model
def processed_to_final(pro_ratings, pro_movies):
    df_final_beta = pro_ratings.copy()

    df_final_beta = df_final_beta.join(pro_movies.set_index('movieId'), on='movieId')
    df_final_beta.drop('genres', axis=1)

    train_df, test_df = train_test_split(df_final_beta, test_size=0.3)
    train_df.to_csv("../data/final/train.csv")
    test_df.to_csv("../data/final/test.csv")