In [1]:
import pandas as pd
import filtering

In [2]:
raw_movie_data = pd.read_csv("./data/movies.csv")

In [3]:
def movie_filter(input_data, year=0, genres=None):
    data = input_data.copy() 
    
    data['year'] = data['title'].str[-5:-1]
    data['year'] = pd.to_numeric(data['year'], downcast='integer', errors='coerce')
    data = data.dropna(axis=0)    
    
    data['genres'] = data['genres'].str.split('|')
    
    data = data[data['year']>=year]
    if genres==None:
        return data.reset_index(drop=True)
    mask = []
    for genre_ in data['genres']:
        mask.append(True if np.intersect1d(genre_, genres).size > 0 else False) 
            
    data = data[mask]
    
    return data.reset_index(drop=True)

In [4]:
movie_data = movie_filter(raw_movie_data)

In [5]:
movie_data.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story (1995),"[Adventure, Animation, Children, Comedy, Fantasy]",1995.0
1,2,Jumanji (1995),"[Adventure, Children, Fantasy]",1995.0
2,3,Grumpier Old Men (1995),"[Comedy, Romance]",1995.0
3,4,Waiting to Exhale (1995),"[Comedy, Drama, Romance]",1995.0
4,5,Father of the Bride Part II (1995),[Comedy],1995.0


In [6]:
len(movie_data)

61861

In [7]:
raw_rating_data = pd.read_csv("./data/ratings.csv")

In [8]:
raw_rating_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,1147880044
1,1,306,3.5,1147868817
2,1,307,5.0,1147868828
3,1,665,5.0,1147878820
4,1,899,3.5,1147868510


In [9]:
raw_rating_data.tail()

Unnamed: 0,userId,movieId,rating,timestamp
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434
25000094,162541,63876,5.0,1240952515


In [10]:
rating_data_newest = raw_rating_data.loc[24000000:25000094,:]

In [11]:
rating_data_newest.head()

Unnamed: 0,userId,movieId,rating,timestamp
24000000,155963,4936,4.0,1020605031
24000001,155963,4942,3.0,1020605865
24000002,155963,4969,4.0,1020249598
24000003,155963,4993,4.0,1020249667
24000004,155963,5012,4.0,1020605202


In [12]:
rating_data_newest.tail()

Unnamed: 0,userId,movieId,rating,timestamp
25000090,162541,50872,4.5,1240953372
25000091,162541,55768,2.5,1240951998
25000092,162541,56176,2.0,1240950697
25000093,162541,58559,4.0,1240953434
25000094,162541,63876,5.0,1240952515


In [13]:
raw_pivot = rating_data_newest.pivot(index = 'userId', columns='movieId', values='rating')

In [14]:
raw_pivot

movieId,1,2,3,4,5,6,7,8,9,10,...,208291,208465,208693,208695,208697,208699,208701,208737,208800,208893
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
155963,,,,,,,,,,,...,,,,,,,,,,
155964,4.0,,,,,,,,,,...,,,,,,,,,,
155965,3.0,,,,,,,2.0,,3.0,...,,,,,,,,,,
155966,,,,,,,,,,,...,,,,,,,,,,
155967,,,,,,,,,,,...,,,,,,,,,,
155968,3.0,,3.0,,3.0,3.0,2.0,,3.0,,...,,,,,,,,,,
155969,4.0,,,,,,,,,,...,,,,,,,,,,
155970,4.5,,,,3.0,,4.0,,,4.0,...,,,,,,,,,,
155971,,,,,,,,,,,...,,,,,,,,,,
155972,5.0,,2.0,,,,,,,,...,,,,,,,,,,


In [15]:
def rating_filter(input_data, threshold = 100):
    data = input_data.copy()
    #threshold개 이상 평가를 남긴 row만 남김
    x = data['userId'].value_counts()>=threshold
    y = set(x[x].index)
    data = data[data['userId'].isin(y)]
    
    return data.reset_index(drop=True)

In [16]:
rating_data_newest_cert = rating_filter(rating_data_newest, 100)

In [17]:
rating_data_newest_cert

Unnamed: 0,userId,movieId,rating,timestamp
0,155965,1,3.0,1014163756
1,155965,8,2.0,1014163384
2,155965,10,3.0,1014162573
3,155965,34,3.0,1014163947
4,155965,44,1.0,1014163005
5,155965,60,1.0,1013646717
6,155965,104,4.0,1014164118
7,155965,107,3.0,1014163427
8,155965,110,5.0,1014162481
9,155965,112,3.0,1014162513


In [18]:
raw_pivot = rating_data_newest_cert.pivot(index = 'userId', columns='movieId', values='rating')

In [19]:
raw_pivot

movieId,1,2,3,4,5,6,7,8,9,10,...,208291,208465,208693,208695,208697,208699,208701,208737,208800,208893
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
155965,3.0,,,,,,,2.0,,3.0,...,,,,,,,,,,
155968,3.0,,3.0,,3.0,3.0,2.0,,3.0,,...,,,,,,,,,,
155969,4.0,,,,,,,,,,...,,,,,,,,,,
155970,4.5,,,,3.0,,4.0,,,4.0,...,,,,,,,,,,
155972,5.0,,2.0,,,,,,,,...,,,,,,,,,,
155973,,,,,,,,,,,...,,,,,,,,,,
155974,5.0,,,,,,,,,,...,,,,,,,,,,
155975,4.0,,,,,,,,,,...,,,,,,,,,,
155976,3.0,,,,,,,,,,...,,,,,,,,,,
155977,,,,,,,,,,,...,,,,,,,,,,


In [20]:
rating_data_newest_cert.to_csv("./data/rating_cert.csv")

In [21]:
raw_rating_data = pd.read_csv("./data/rating_cert.csv")

In [22]:
raw_pivot = raw_rating_data.pivot(index = 'userId', columns='movieId', values='rating')

In [23]:
raw_pivot.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,208291,208465,208693,208695,208697,208699,208701,208737,208800,208893
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
155965,3.0,,,,,,,2.0,,3.0,...,,,,,,,,,,
155968,3.0,,3.0,,3.0,3.0,2.0,,3.0,,...,,,,,,,,,,
155969,4.0,,,,,,,,,,...,,,,,,,,,,
155970,4.5,,,,3.0,,4.0,,,4.0,...,,,,,,,,,,
155972,5.0,,2.0,,,,,,,,...,,,,,,,,,,


In [24]:
raw_pivot.tail()

movieId,1,2,3,4,5,6,7,8,9,10,...,208291,208465,208693,208695,208697,208699,208701,208737,208800,208893
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
162533,4.5,4.0,,,,4.5,,,,4.0,...,,,,,,,,,,
162534,4.0,,,,,,,,,,...,,,,,,,,,,
162537,,,,,,,,,,,...,,,,,,,,,,
162538,2.0,,,,,,,,,,...,,,,,,,,,,
162541,,,,,,,,,,,...,,,,,,,,,,


In [25]:
raw_pivot

movieId,1,2,3,4,5,6,7,8,9,10,...,208291,208465,208693,208695,208697,208699,208701,208737,208800,208893
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
155965,3.0,,,,,,,2.0,,3.0,...,,,,,,,,,,
155968,3.0,,3.0,,3.0,3.0,2.0,,3.0,,...,,,,,,,,,,
155969,4.0,,,,,,,,,,...,,,,,,,,,,
155970,4.5,,,,3.0,,4.0,,,4.0,...,,,,,,,,,,
155972,5.0,,2.0,,,,,,,,...,,,,,,,,,,
155973,,,,,,,,,,,...,,,,,,,,,,
155974,5.0,,,,,,,,,,...,,,,,,,,,,
155975,4.0,,,,,,,,,,...,,,,,,,,,,
155976,3.0,,,,,,,,,,...,,,,,,,,,,
155977,,,,,,,,,,,...,,,,,,,,,,


In [26]:
raw_pivot.columns

Int64Index([     1,      2,      3,      4,      5,      6,      7,      8,
                 9,     10,
            ...
            208291, 208465, 208693, 208695, 208697, 208699, 208701, 208737,
            208800, 208893],
           dtype='int64', name='movieId', length=22385)

In [27]:
st = set(raw_pivot.columns)

In [28]:
len(movie_data)

61861

In [29]:
movie_data = movie_data[movie_data['movieId'].isin(st)]

In [30]:
len(movie_data)

22273

In [31]:
mask = [raw_pivot.count(axis = 0).to_list() < 20]

TypeError: '<' not supported between instances of 'list' and 'int'

In [32]:
movie_count = raw_pivot.count(axis = 0)

In [33]:
movies_to_erase = []
for c in raw_pivot.columns.to_list():
    if movie_count[c]<20 : 
        movies_to_erase.append(c)

In [34]:
movies_to_erase

[37,
 38,
 49,
 51,
 53,
 56,
 59,
 67,
 75,
 77,
 83,
 84,
 90,
 96,
 98,
 106,
 108,
 114,
 119,
 120,
 121,
 124,
 127,
 128,
 129,
 130,
 131,
 136,
 137,
 138,
 146,
 148,
 152,
 167,
 178,
 184,
 189,
 192,
 197,
 200,
 201,
 202,
 210,
 212,
 219,
 220,
 226,
 228,
 243,
 244,
 245,
 251,
 254,
 263,
 264,
 283,
 284,
 285,
 286,
 287,
 295,
 297,
 298,
 301,
 304,
 310,
 311,
 320,
 324,
 325,
 331,
 336,
 341,
 359,
 375,
 385,
 386,
 389,
 391,
 392,
 394,
 396,
 397,
 398,
 400,
 402,
 404,
 406,
 408,
 409,
 411,
 430,
 439,
 443,
 447,
 449,
 462,
 463,
 467,
 470,
 476,
 478,
 486,
 488,
 495,
 496,
 499,
 503,
 525,
 526,
 530,
 559,
 561,
 563,
 564,
 566,
 567,
 570,
 571,
 572,
 573,
 576,
 579,
 582,
 583,
 591,
 598,
 600,
 601,
 602,
 614,
 615,
 617,
 618,
 619,
 621,
 626,
 630,
 632,
 633,
 634,
 635,
 636,
 638,
 641,
 645,
 649,
 651,
 652,
 657,
 658,
 659,
 660,
 664,
 666,
 675,
 676,
 681,
 682,
 685,
 687,
 690,
 696,
 698,
 701,
 702,
 703,
 705,
 706,
 

In [35]:
raw_pivot_temp = raw_pivot.copy()

In [36]:
raw_pivot_temp= raw_pivot_temp.drop(movies_to_erase, axis=1)

In [37]:
raw_pivot_temp

movieId,1,2,3,4,5,6,7,8,9,10,...,194951,195159,196889,196997,197175,197711,200818,201773,202429,204698
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
155965,3.0,,,,,,,2.0,,3.0,...,,,,,,,,,,
155968,3.0,,3.0,,3.0,3.0,2.0,,3.0,,...,,,,,,,,,,
155969,4.0,,,,,,,,,,...,,,,,,,,,,
155970,4.5,,,,3.0,,4.0,,,4.0,...,,,,,,,,,,
155972,5.0,,2.0,,,,,,,,...,,,,,,,,,,
155973,,,,,,,,,,,...,,,,,,,,,4.5,
155974,5.0,,,,,,,,,,...,,,,,,,,,,
155975,4.0,,,,,,,,,,...,,,,,,,,,,
155976,3.0,,,,,,,,,,...,,,,,,,,,,
155977,,,,,,,,,,,...,,,,,,,,,,


In [38]:
raw_pivot_temp.to_csv("./data/pivot.csv")