# Imports

In [127]:
from pathlib import Path

import pandas as pd

# Exploratory Data Analysis

Let us look at the data.

## Users Data

In [128]:
# u.user is read without a header row, so the column names are supplied here.
users_columns = ['user_id', 'age', 'sex', 'occupation', 'zip_code']

# Pipe-separated file; user_id (the first column) becomes the index.
df_users = pd.read_csv(
    '../data/raw/ml-100k/u.user',
    names=users_columns,
    header=None,
    index_col='user_id',
    sep='|',
    encoding='latin-1',
)

print(df_users.shape)
df_users.head()

(943, 4)


Unnamed: 0_level_0,age,sex,occupation,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043
3,23,M,writer,32067
4,24,M,technician,43537
5,33,F,other,15213


## Ratings Data

In [129]:
# u.data is read without a header row, so the column names are supplied here.
ratings_columns = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

# Tab-separated file; user_id (the first column) becomes the index, so the
# index is not unique: each user contributes one row per rated movie.
df_ratings = pd.read_csv(
    '../data/raw/ml-100k/u.data',
    names=ratings_columns,
    header=None,
    index_col='user_id',
    sep='\t',
    encoding='latin-1',
)

print(df_ratings.shape)
df_ratings.head()

(100000, 3)


Unnamed: 0_level_0,movie_id,rating,unix_timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
196,242,3,881250949
186,302,3,891717742
22,377,1,878887116
244,51,2,880606923
166,346,1,886397596


## Items Data

In [130]:
# u.item is read without a header row, so the column names are supplied here:
# five descriptive columns followed by one 0/1 indicator column per genre.
items_columns = [
    'movie_id', 'title', 'release_date', 'video_release_date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Pipe-separated file; movie_id (the first column) becomes the index.
df_items = pd.read_csv(
    '../data/raw/ml-100k/u.item',
    names=items_columns,
    header=None,
    index_col='movie_id',
    sep='|',
    encoding='latin-1',
)

print(df_items.shape)
df_items.head()

(1682, 23)


Unnamed: 0_level_0,title,release_date,video_release_date,IMDb URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


## Remove Poor and Unused Data

In [131]:
# Show the movies whose `unknown` genre flag is set.
df_items.loc[df_items['unknown'].eq(1)]

Unnamed: 0_level_0,title,release_date,video_release_date,IMDb URL,unknown,Action,Adventure,Animation,Children's,Comedy,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
267,unknown,,,,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1373,Good Morning (1971),4-Feb-1971,,http://us.imdb.com/M/title-exact?Good%20Mornin...,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [132]:
# Show the ratings that refer to the unknown-genre movies. The movie ids are
# derived from the `unknown` flag rather than hard-coded, so this cell stays
# correct if the raw data changes.
df_ratings[
    df_ratings['movie_id'].isin(df_items.loc[df_items['unknown'] == 1].index)
]

Unnamed: 0_level_0,movie_id,rating,unix_timestamp
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
130,267,5,875801239
5,267,4,875635064
268,267,3,875742077
181,1373,1,878962052
297,267,3,875409139
319,267,4,875707690
1,267,4,875692955
532,267,3,875441348
833,267,1,875655669
422,267,4,875655986


There are two movies with unknown genre. Only 10 of the 100000 ratings refer to them, so let us remove these movies and their ratings from the dataset.

Some columns will not be used, so let us also keep only the ones needed to build the user vectors.

In [133]:
# Remove unused columns.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop() would raise KeyError.
df_items = df_items.drop(
    columns=['release_date', 'video_release_date', 'IMDb URL'],
    errors='ignore',
)
df_ratings = df_ratings.drop(columns=['unix_timestamp'], errors='ignore')
df_users = df_users.drop(columns=['zip_code'], errors='ignore')

In [134]:
# Drop the movies flagged with the `unknown` genre, together with every rating
# that refers to them; the row counts are printed before and after.
print(f'before removal: {df_items.shape[0]} movies and {df_ratings.shape[0]} ratings')

is_unknown_genre = df_items['unknown'].eq(1)
unknown_genre_movies_ids = df_items.loc[is_unknown_genre].index

df_items = df_items.drop(index=unknown_genre_movies_ids)

rates_unknown_genre = df_ratings['movie_id'].isin(unknown_genre_movies_ids)
df_ratings = df_ratings.loc[~rates_unknown_genre]

print(f'after removal: {df_items.shape[0]} movies and {df_ratings.shape[0]} ratings')

before removal: 1682 movies and 100000 ratings
after removal: 1680 movies and 99990 ratings


In [135]:
# Shift the user_id index (the index of both frames) down by one so that it
# starts at 0 instead of 1.
# The shift is applied only while an index still starts at 1; without this
# guard every re-run of the cell would subtract 1 again.
if df_ratings.index.min() == 1:
    df_ratings.index = df_ratings.index - 1
if df_users.index.min() == 1:
    df_users.index = df_users.index - 1

Let us save the preprocessed data:

In [136]:
# Write the three preprocessed frames as CSV files (index included).
# to_csv does not create missing folders, so make sure the output directory
# exists first; otherwise the cell fails on a fresh checkout.
preprocessed_dir = Path('../data/interim/preprocessed')
preprocessed_dir.mkdir(parents=True, exist_ok=True)

df_items.to_csv(preprocessed_dir / 'items.csv')
df_ratings.to_csv(preprocessed_dir / 'ratings.csv')
df_users.to_csv(preprocessed_dir / 'users.csv')