In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# data paths
small_path = '/scratch/work/courses/DSGA1004-2021/movielens/ml-latest-small'
full_path = '/scratch/work/courses/DSGA1004-2021/movielens/ml-latest'

small:
- links.csv (movieId,imdbId,tmdbId)
- movies.csv (movieId,title,genres)
- ratings.csv (userId,movieId,rating,timestamp)
- tags.csv (userId,movieId,tag,timestamp)


full:
- links.csv
- movies.csv
- ratings.csv
- tags.csv
- genome-scores.csv
- genome-tags.csv

## ml-latest-small

In [3]:
# import ratings from small dataset
ratings_small = pd.read_csv(small_path + '/ratings.csv')
print(ratings_small.shape)
ratings_small.head()

(100836, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# stratify split
train_small, test_val_small = train_test_split(ratings_small, stratify=ratings_small['userId'], test_size=0.2)
test_small, val_small = train_test_split(test_val_small, stratify=test_val_small['userId'], test_size=0.5)

In [5]:
train_small.head

<bound method NDFrame.head of        userId  movieId  rating   timestamp
23635     160     4370     4.0  1065992012
99816     610     4270     3.0  1493845935
37109     249    85780     4.0  1368891309
74536     474     6042     3.0  1069685537
72065     464      653     2.5  1287400499
...       ...      ...     ...         ...
54542     358    81591     2.0  1339539299
47846     309     1198     4.0  1166068294
78757     489     2628     1.0  1333101402
20303     134       48     3.0   832841524
7944       54      318     4.0   830247358

[80668 rows x 4 columns]>

In [6]:
test_small.head

<bound method NDFrame.head of        userId  movieId  rating   timestamp
69527     448     6503     1.0  1105009312
66096     425     3948     3.5  1085490893
65298     419       47     4.0  1321659049
46041     305     1253     5.0  1460366849
51818     335      466     3.0  1261541271
...       ...      ...     ...         ...
10120      66     1862     1.0  1113190775
13189      84      527     5.0   857653594
63782     414     4818     2.0  1091716530
35897     244     1228     4.0   975075169
37746     256     1270     5.0  1447001868

[10084 rows x 4 columns]>

In [7]:
val_small.head

<bound method NDFrame.head of        userId  movieId  rating   timestamp
89689     580     8950     4.5  1167789905
48183     312     3701     3.0  1043176884
79331     492      619     3.0   863976722
77116     482     1037     3.5  1105396623
47612     307    48518     2.0  1189608059
...       ...      ...     ...         ...
12350      76    34405     5.0  1439165874
21675     140    43396     4.0  1166645387
77578     483     3785     3.0  1215896116
9020       62    86880     4.0  1523786545
63774     414     4776     4.0  1017669912

[10084 rows x 4 columns]>

In [8]:
# Check that stratification worked
print('Number of unique users in original dataset:', ratings_small['userId'].nunique())
print('Number of unique users in train dataset:', train_small['userId'].nunique())
print('Number of unique users in test dataset:', test_small['userId'].nunique())
print('Number of unique users in val dataset:', val_small['userId'].nunique())

Number of unique users in original dataset: 610
Number of unique users in train dataset: 610
Number of unique users in test dataset: 610
Number of unique users in val dataset: 610


In [20]:
# save to csv!
train_small.to_csv('ratings_small_train.csv')
test_small.to_csv('ratings_small_test.csv')
val_small.to_csv('ratings_small_val.csv')

## ml-latest

In [3]:
# repeat for full dataset
# Note: selected users have at least 1 movie (need to put these into training set before splitting the rest?)

# import ratings from full dataset
ratings_full = pd.read_csv(full_path + '/ratings.csv')
print(ratings_full.shape)
ratings_full.head()

(27753444, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,307,3.5,1256677221
1,1,481,3.5,1256677456
2,1,1091,1.5,1256677471
3,1,1257,4.5,1256677460
4,1,1449,4.5,1256677264


In [4]:
# count number of users with less than 3 movie ratings
user_counts = ratings_full['userId'].value_counts()

In [6]:
# for testing
user_counts[148]

48

In [5]:
low_rating_users = user_counts[user_counts < 3]

In [6]:
low_rating_users #9571 users

242684    2
25118     2
7072      2
42446     2
196252    2
         ..
6870      1
15905     1
121576    1
105187    1
62858     1
Name: userId, Length: 9571, dtype: int64

In [7]:
# total number of users
print(ratings_full['userId'].nunique())

# calculate percentage of users with less than 3 ratings
print(ratings_full['userId'].nunique() / len(low_rating_users))

283228
29.592310103437466


In [8]:
# get rows corresponding to these users
# try merging columns with dataset

# put low rating_users into dataframe
low_rating_userIds = pd.DataFrame(low_rating_users)

# set index as a column
low_rating_userIds.reset_index(inplace=True)

# rename column
low_rating_userIds.rename(columns={'userId':'ratings_count', 'index': 'userId'}, inplace=True)

# set index as a column
low_rating_userIds

Unnamed: 0,userId,ratings_count
0,242684,2
1,25118,2
2,7072,2
3,42446,2
4,196252,2
...,...,...
9566,6870,1
9567,15905,1
9568,121576,1
9569,105187,1


In [9]:
# merge with existing dataframe
merged_df = ratings_full.merge(low_rating_userIds, how='outer', on='userId')

# column ratings_count = NaN where ratings are 3 or greater

In [10]:
merged_df

Unnamed: 0,userId,movieId,rating,timestamp,ratings_count
0,1,307,3.5,1256677221,
1,1,481,3.5,1256677456,
2,1,1091,1.5,1256677471,
3,1,1257,4.5,1256677460,
4,1,1449,4.5,1256677264,
...,...,...,...,...,...
27753439,283228,8542,4.5,1379882795,
27753440,283228,8712,4.5,1379882751,
27753441,283228,34405,4.5,1379882889,
27753442,283228,44761,4.5,1354159524,


In [11]:
merged_df['ratings_count'].fillna(-1, inplace=True)
merged_df

Unnamed: 0,userId,movieId,rating,timestamp,ratings_count
0,1,307,3.5,1256677221,-1.0
1,1,481,3.5,1256677456,-1.0
2,1,1091,1.5,1256677471,-1.0
3,1,1257,4.5,1256677460,-1.0
4,1,1449,4.5,1256677264,-1.0
...,...,...,...,...,...
27753439,283228,8542,4.5,1379882795,-1.0
27753440,283228,8712,4.5,1379882751,-1.0
27753441,283228,34405,4.5,1379882889,-1.0
27753442,283228,44761,4.5,1354159524,-1.0


Try splitting randomly and see if training set has all 283228 users.

In [8]:
# split regularly (not stratified)
train_full, test_val_full = train_test_split(ratings_full, test_size=0.2)
#test_small, val_small = train_test_split(test_val_small, stratify=test_val_small['userId'], test_size=0.5)



In [9]:
train_full['userId'].nunique() # 281906 != 283228

281906

When we tried to split the full dataframe based on the number of counts, it caused the kernel to die when using both pandas and Dask. So we split the data using Spark instead. We will now import these 2 csv files, use sklearn to split the file of ratings for users who have 3 or more ratings, and append the training set to the users who have 2 or less ratings. 