# MovieLens pripremanje podataka

In [1]:
# Import libraries
import os
import pandas as pd
import warnings
# Suppress every warning for the rest of the session. NOTE(review): this also
# hides deprecation/parser warnings from pandas, which can mask real problems.
warnings.filterwarnings("ignore")

In [2]:
# Location of the raw input: the directory name followed by the three
# '.dat' file names that are joined onto it when the data is loaded.
MOVIELENS = "dat"
USER_DATA = "users.dat"
MOVIE_DATA = "movies.dat"
RATING_DATA = "ratings.dat"

In [3]:
# Lookup tables that turn the numeric age and occupation codes of a user
# into human-readable labels.
AGES = {
    1: "Under 18",
    18: "18-24",
    25: "25-34",
    35: "35-44",
    45: "45-49",
    50: "50-55",
    56: "56+",
}

# Occupation codes are the consecutive integers 0..20, so the labels are
# listed in code order and numbered with enumerate().
OCCUPATIONS = dict(enumerate([
    "other or not specified",   # 0
    "academic/educator",        # 1
    "artist",                   # 2
    "clerical/admin",           # 3
    "college/grad student",     # 4
    "customer service",         # 5
    "doctor/health care",       # 6
    "executive/managerial",     # 7
    "farmer",                   # 8
    "homemaker",                # 9
    "K-12 student",             # 10
    "lawyer",                   # 11
    "programmer",               # 12
    "retired",                  # 13
    "sales/marketing",          # 14
    "scientist",                # 15
    "self-employed",            # 16
    "technician/engineer",      # 17
    "tradesman/craftsman",      # 18
    "unemployed",               # 19
    "writer",                   # 20
]))

In [4]:
# Names of the CSV files the prepared data is written to
USERS_CSV = "users.csv"
MOVIES_CSV = "movies.csv"
RATINGS_CSV = "ratings.csv"

In [5]:
# Load the ratings. The file has no header row, so column names are given
# explicitly; the two-character '::' separator needs the python engine.
ratings = pd.read_csv(
    os.path.join(MOVIELENS, RATING_DATA),
    sep='::',
    engine='python',
    encoding='latin-1',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
)

# Largest user_id / movie_id that appears in the ratings. max() returns the
# same value with or without de-duplication, so the column is used directly.
max_userid = ratings['user_id'].max()
max_movieid = ratings['movie_id'].max()

# Report how many ratings were loaded
print(len(ratings), 'rating-a učitano.')

1000209 rating-a učitano.


In [6]:
# Save the ratings to RATINGS_CSV as a tab-separated, latin-1 encoded file.
# Only the four columns that exist on `ratings` are exported: the frame is
# never given 'user_emb_id' / 'movie_emb_id' columns, and asking to_csv for
# missing columns raises KeyError on current pandas.
ratings.to_csv(
    RATINGS_CSV,
    sep='\t',
    header=True,
    encoding='latin-1',
    columns=['user_id', 'movie_id', 'rating', 'timestamp'],
)
print('Snimljeno ', RATINGS_CSV)

Snimljeno  ratings.csv


In [7]:
# Preview the first rows of the ratings frame
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [8]:
# Load the users ('::'-separated, no header row, hence the explicit names).
users_path = os.path.join(MOVIELENS, USER_DATA)
user_columns = ['user_id', 'gender', 'age', 'occupation', 'zipcode']
users = pd.read_csv(users_path, sep='::', engine='python',
                    encoding='latin-1', names=user_columns)

# Attach readable labels for the numeric age and occupation codes; a code
# missing from the lookup table raises KeyError.
users['age_desc'] = users['age'].apply(AGES.__getitem__)
users['occ_desc'] = users['occupation'].apply(OCCUPATIONS.__getitem__)

# Number of user records loaded vs. the largest user_id seen in the ratings
print(len(users), 'opisa o', max_userid, 'korisnika učitano.')

6040 opisa o 6040 korisnika učitano.


In [9]:
# Export the users, including the derived label columns, to USERS_CSV as a
# tab-separated, latin-1 encoded file.
user_export_columns = ['user_id', 'gender', 'age', 'occupation', 'zipcode',
                       'age_desc', 'occ_desc']
users.to_csv(USERS_CSV, sep='\t', header=True, encoding='latin-1',
             columns=user_export_columns)
print('Spremljeno', USERS_CSV)

Spremljeno users.csv


In [10]:
# Preview the first rows of the users frame, including the label columns
users.head()

Unnamed: 0,user_id,gender,age,occupation,zipcode,age_desc,occ_desc
0,1,F,1,10,48067,Under 18,K-12 student
1,2,M,56,16,70072,56+,self-employed
2,3,M,25,15,55117,25-34,scientist
3,4,M,45,7,2460,45-49,executive/managerial
4,5,M,25,20,55455,25-34,writer


In [11]:
# Load the movie descriptions ('::'-separated, no header row).
movies_path = os.path.join(MOVIELENS, MOVIE_DATA)
movie_columns = ['movie_id', 'title', 'genres']
movies = pd.read_csv(movies_path, sep='::', engine='python',
                     encoding='latin-1', names=movie_columns)

# max_movieid is the largest movie_id found in the ratings, not a count, so
# it need not equal the number of movie rows loaded here.
print(len(movies), 'opisa o', max_movieid, 'filma učitano.')

3883 opisa o 3952 filma učitano.


In [12]:
# Export the movies to MOVIES_CSV as a tab-separated file.
# NOTE(review): no encoding is passed here, so pandas' default is used, while
# the users and ratings exports pass encoding='latin-1' - confirm that the
# difference is intended before reading this file back.
movie_export_columns = ['movie_id', 'title', 'genres']
movies.to_csv(MOVIES_CSV, sep='\t', header=True, columns=movie_export_columns)
print('Spremljeno u', MOVIES_CSV)

Spremljeno u movies.csv


In [13]:
# Preview the first rows of the movies frame
movies.head()

Unnamed: 0,movie_id,title,genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
