In [23]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import glob
from unsplit import unsplit, get_genre # selbstgeschreibene funktion aus /data_exp_prep/unsplit.py
from sklearn.preprocessing import LabelEncoder
import random as rd

In [34]:
links = pd.read_csv('../Data/links.csv')
movies = pd.read_csv('../Data/movies.csv')
ratings = unsplit('../Data/ratings')
tags = pd.read_csv('../Data/tags.csv')

# Ratings prep

In [7]:
ratings.head(5)

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating,timestamp
0,22000000,137746,586,4.0,840219554
1,22000001,137746,587,5.0,840219554
2,22000002,137746,588,4.0,840219360
3,22000003,137746,589,5.0,840219511
4,22000004,137746,590,2.0,840219324


In [8]:
ratings.drop('timestamp', axis = 1, inplace =True) # timestamps werden nicht gebraucht

In [9]:
ratings.head(1)

Unnamed: 0.1,Unnamed: 0,userId,movieId,rating
0,22000000,137746,586,4.0


In [11]:
ratings.columns

Index(['Unnamed: 0', 'userId', 'movieId', 'rating'], dtype='object')

In [12]:
ratings.drop('Unnamed: 0', axis = 1, inplace = True) # index wird nur einmal gebraucht

In [13]:
ratings.head(1)

Unnamed: 0,userId,movieId,rating
0,137746,586,4.0


# Movies prep

In [39]:
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [45]:
movies_copy = movies.copy()

In [46]:
movies_copy['genres_list'] = movies_copy['genres'].apply(lambda x: x.split('|'))

In [47]:
movies_copy

Unnamed: 0,movieId,title,genres,genres_list
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[Adventure, Children, Fantasy]"
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]"
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[Comedy, Drama, Romance]"
4,5,Father of the Bride Part II (1995),Comedy,[Comedy]
...,...,...,...,...
87580,292731,The Monroy Affaire (2022),Drama,[Drama]
87581,292737,Shelter in Solitude (2023),Comedy|Drama,"[Comedy, Drama]"
87582,292753,Orca (2023),Drama,[Drama]
87583,292755,The Angry Breed (1968),Drama,[Drama]


In [48]:
movies_copy['genres_list'] = movies_copy['genres'].apply(lambda x: x.split('|'))
# join them back with '|' so we can use str.get_dummies
movies_copy['genres_str'] = movies_copy['genres_list'].apply(lambda x: '|'.join(x))

genre_dummies = movies_copy['genres_str'].str.get_dummies(sep='|')

In [50]:
movies_copy = movies_copy.join(genre_dummies)

In [51]:
movies_copy.head(5)

Unnamed: 0,movieId,title,genres,genres_list,genres_str,(no genres listed),Action,Adventure,Animation,Children,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]",Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[Adventure, Children, Fantasy]",Adventure|Children|Fantasy,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]",Comedy|Romance,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[Comedy, Drama, Romance]",Comedy|Drama|Romance,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),Comedy,[Comedy],Comedy,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [53]:
cols_to_drop = ['genres', 'genres_list', 'genres_str']
movies_copy.drop(columns = cols_to_drop, inplace = True)
movies_copy.head(5)

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [63]:
movies_copy['(no genres listed)'] = movies_copy['(no genres listed)'].rename('no genres', inplace = True)

In [67]:
movies_copy

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87580,292731,The Monroy Affaire (2022),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87581,292737,Shelter in Solitude (2023),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
87582,292753,Orca (2023),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
87583,292755,The Angry Breed (1968),0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
movies_copy[['title_only','year']] = movies_copy['title'].str.extract(r'^(.*)\s\((\d{4})\)$')


In [69]:
movies_copy.head(20)

Unnamed: 0,movieId,title,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,title_only,year
0,1,Toy Story (1995),0,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,Toy Story,1995
1,2,Jumanji (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,Jumanji,1995
2,3,Grumpier Old Men (1995),0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,Father of the Bride Part II,1995
5,6,Heat (1995),0,1,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,Heat,1995
6,7,Sabrina (1995),0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,Sabrina,1995
7,8,Tom and Huck (1995),0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,Tom and Huck,1995
8,9,Sudden Death (1995),0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,Sudden Death,1995
9,10,GoldenEye (1995),0,1,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,GoldenEye,1995


In [71]:
movies_copy.drop('title', inplace = True, axis = 1)

In [73]:
movies_copy

Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,title_only,year
0,1,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,Toy Story,1995
1,2,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,Jumanji,1995
2,3,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,Grumpier Old Men,1995
3,4,0,0,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,Waiting to Exhale,1995
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,Father of the Bride Part II,1995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87580,292731,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,The Monroy Affaire,2022
87581,292737,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,Shelter in Solitude,2023
87582,292753,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,Orca,2023
87583,292755,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,The Angry Breed,1968


In [74]:
movies_copy['year'].unique()

array(['1995', '1994', '1996', '1976', '1992', '1988', '1967', '1993',
       '1964', '1977', '1965', '1982', '1990', '1991', '1989', '1937',
       '1940', '1969', '1981', '1973', '1970', '1960', '1955', '1959',
       '1968', '1980', '1975', '1986', '1948', '1943', '1950', '1946',
       '1987', '1997', '1974', '1956', '1958', '1949', '1972', '1998',
       '1933', '1952', '1951', '1957', '1961', '1954', '1934', '1944',
       '1963', '1942', '1941', '1953', '1939', '1947', '1945', '1938',
       '1935', '1936', '1926', '1932', '1985', '1979', '1971', '1978',
       '1966', '1962', '1983', '1984', '1931', '1922', '1999', '1927',
       '1929', '1930', '1928', '1925', '1914', '2000', '1919', '1923',
       '1920', '1918', '1921', '2001', '1924', '2002', '2003', '1915',
       '2004', '1916', '1917', nan, '2005', '2006', '1902', '1903',
       '2007', '2008', '2009', '1912', '2010', '1913', '2011', '1898',
       '1899', '1894', '2012', '1910', '2013', '1896', '2014', '2015',
       '1

In [76]:
movies_copy['year'].isna().sum()

np.int64(772)

In [77]:
movies_copy['year'].describe()

count     86813
unique      142
top        2017
freq       3269
Name: year, dtype: object

In [78]:
mask = movies_copy['year'].isna()

movies_copy.loc[mask, 'year'] = np.random.randint(1950, 2026, size=mask.sum())

In [83]:
movies_copy.head(6)

Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,title_only,year
0,1,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,Toy Story,1995
1,2,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,Jumanji,1995
2,3,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,Grumpier Old Men,1995
3,4,0,0,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,Waiting to Exhale,1995
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,Father of the Bride Part II,1995
5,6,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,Heat,1995


In [84]:
movies = movies_copy

In [85]:
movies.head(5)

Unnamed: 0,movieId,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,...,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,title_only,year
0,1,0,0,1,1,1,1,0,0,0,...,0,0,0,0,0,0,0,0,Toy Story,1995
1,2,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,Jumanji,1995
2,3,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,Grumpier Old Men,1995
3,4,0,0,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,0,Waiting to Exhale,1995
4,5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,Father of the Bride Part II,1995
