In [1]:
# Relative path of the sample data file that is loaded with pd.read_csv below.
data_fn = 'data/ml_sample_6000.dat'

In [2]:
import pandas as pd
import numpy as np

In [3]:
# The file is read with header=None, so the columns get integer labels 0..3.
# NOTE(review): the columns look like (user id, item id, rating, timestamp) -- confirm.
ml_df = pd.read_csv(data_fn, header=None)
print (ml_df.head())

        0     1    2           3
0  158052   910  4.0  1186302858
1  158052  3052  4.0  1186302806
2  158052  1269  0.5  1186302929
3  158052   913  4.5  1186302865
4  158052  1663  4.0  1186302906


### Get the item IDs and the features related to those item IDs
### QQQ: if adult movies are minority movies, should we recommend them?

Candidate features: adult, budget, genre, spoken language, popularity, production country (US / non-US), release date, runtime, revenue

In [4]:
# Only the metadata columns used for feature building are loaded. Everything is
# read as str so that each column can be cast explicitly in the cleaning cells.
meta_columns = [
    'imdb_id', 'adult', 'genres',
    'original_language', 'release_date',
    'revenue', 'runtime', 'popularity',
    'production_countries', 'spoken_languages',
]
ml_meta_df = pd.read_csv(
    './data/ML-26M/movies_metadata.csv',
    usecols=meta_columns,
    dtype=str,
)
ml_meta_df.head(2)

Unnamed: 0,adult,genres,imdb_id,original_language,popularity,production_countries,release_date,revenue,runtime,spoken_languages
0,False,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",tt0114709,en,21.946943,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]"
1,False,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",tt0113497,en,17.015539,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso..."


In [5]:
# The frame was read with dtype=str, so every column is expected to show up as object.
ml_meta_df.dtypes

adult                   object
genres                  object
imdb_id                 object
original_language       object
popularity              object
production_countries    object
release_date            object
revenue                 object
runtime                 object
spoken_languages        object
dtype: object

In [6]:
# Summary (count / unique / top / freq) for every column, not only numeric ones.
ml_meta_df.describe(include="all")

Unnamed: 0,adult,genres,imdb_id,original_language,popularity,production_countries,release_date,revenue,runtime,spoken_languages
count,45466,45466,45449,45455,45461.0,45463,45379,45460,45203.0,45460
unique,5,4069,45417,92,43758.0,2393,17336,6863,353.0,1931
top,False,"[{'id': 18, 'name': 'Drama'}]",tt1180333,en,0.0,"[{'iso_3166_1': 'US', 'name': 'United States o...",2008-01-01,0,90.0,"[{'iso_639_1': 'en', 'name': 'English'}]"
freq,45454,5000,3,32269,66.0,17851,136,38052,2556.0,22395


In [7]:
# Spot-check: show the metadata row(s) for a single imdb_id.
ml_meta_df[ml_meta_df['imdb_id'] == 'tt0000075']

Unnamed: 0,adult,genres,imdb_id,original_language,popularity,production_countries,release_date,revenue,runtime,spoken_languages
44646,False,[],tt0000075,fr,0.788113,"[{'iso_3166_1': 'FR', 'name': 'France'}]",1896-01-01,0,1.0,"[{'iso_639_1': 'xx', 'name': 'No Language'}]"


In [104]:
# Number of missing values per column; drives the imputation / row-dropping below.
ml_meta_df.isnull().sum()

adult                     0
genres                    0
imdb_id                  17
original_language        11
popularity                5
production_countries      3
release_date             87
revenue                   6
runtime                 263
spoken_languages          6
dtype: int64

### filling the missing values

popularity

In [105]:
# Clean the `popularity` column: drop one row, cast to float, mean-impute the gaps.

# Row 35587 is removed before the cast (presumably its popularity value is not
# numeric -- TODO confirm). errors='ignore' lets the cell be re-run after the row
# is already gone instead of failing with a KeyError.
# NOTE(review): this is a positional label from the freshly loaded file; do not
# re-run this cell after the index has been reset further down.
ml_meta_df.drop(index=35587, inplace=True, errors='ignore')

# casting it from str to float
ml_meta_df['popularity'] = ml_meta_df['popularity'].astype(float)

# Mean over the known values only (Series.mean skips NaN by default).
mean_pop = ml_meta_df['popularity'].mean()

# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated and does not modify
# the frame when pandas Copy-on-Write is active.
ml_meta_df['popularity'] = ml_meta_df['popularity'].fillna(mean_pop)

# how many missing values?
print('missing?', ml_meta_df['popularity'].isnull().sum())

print('# above mean', int((ml_meta_df['popularity'] >= mean_pop).sum()))
print('# below mean', int((ml_meta_df['popularity'] < mean_pop).sum()))

missing? 0
# above mean 13076
# below mean 32389


handling missing values in revenue

In [106]:
# Clean the `revenue` column: cast from str to float and mean-impute missing values.
ml_meta_df['revenue'] = ml_meta_df['revenue'].astype(float)

# Mean over the known values only (Series.mean skips NaN by default).
mean_rev = ml_meta_df['revenue'].mean()

# Assign the result back; fillna(inplace=True) on a column selection is a chained
# assignment that is deprecated and has no effect under pandas Copy-on-Write.
ml_meta_df['revenue'] = ml_meta_df['revenue'].fillna(mean_rev)
print('missing?', ml_meta_df['revenue'].isnull().sum())

print('# above mean', int((ml_meta_df['revenue'] >= mean_rev).sum()))
print('# below mean', int((ml_meta_df['revenue'] < mean_rev).sum()))

missing? 0
# above mean 4215
# below mean 41250


In [107]:
# Clean the `runtime` column: cast from str to float and mean-impute missing values.
ml_meta_df['runtime'] = ml_meta_df['runtime'].astype(float)

# Both statistics ignore NaN by default; the median is printed for comparison
# only -- the mean is what is used for filling and for the long/short split.
mean_rt = ml_meta_df['runtime'].mean()
median_rt = ml_meta_df['runtime'].median()

print (mean_rt)
print (median_rt)

# Assign the result back; fillna(inplace=True) on a column selection is a chained
# assignment that is deprecated and has no effect under pandas Copy-on-Write.
ml_meta_df['runtime'] = ml_meta_df['runtime'].fillna(mean_rt)
print('missing?', ml_meta_df['runtime'].isnull().sum())

print('# above mean', int((ml_meta_df['runtime'] >= mean_rt).sum()))
print('# below mean', int((ml_meta_df['runtime'] < mean_rt).sum()))

94.12819945578833
95.0
missing? 0
# above mean 23047
# below mean 22418


In [108]:
# Rows without a usable release date are deleted rather than imputed: that covers
# missing values and the two literal values '1' and '12'.
print('missing?', ml_meta_df['release_date'].isnull().sum())

release_col = ml_meta_df['release_date']
unusable = release_col.isnull() | release_col.isin(['1', '12'])
ml_meta_df.drop(index=ml_meta_df.index[unusable], inplace=True)

print('missing?', ml_meta_df['release_date'].isnull().sum())

missing? 87
missing? 0


In [109]:
# Column names of the metadata frame as a NumPy array.
ml_meta_df.columns.values

array(['adult', 'genres', 'imdb_id', 'original_language', 'popularity',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages'], dtype=object)

In [110]:
# (rows, columns) of the metadata frame at this point in the notebook.
ml_meta_df.shape

(45376, 10)

In [111]:
from datetime import datetime
import re

feature extraction

In [123]:
# Build two views of the per-movie categorical features:
#   * newDF      -- long-format (movieid, feature) rows, used to create item vectors
#   * newDF_dict -- imdb_id -> {feature group -> label or list of labels}
# Relies on mean_pop, mean_rev and mean_rt computed in the imputation cells above.

# Movies released after this year are labelled 'new', all others 'old'.
NEW_MOVIE_AFTER_YEAR = 1990


def _quoted_values(key, raw):
    """Return every single-quoted value stored under `key` in `raw`.

    `raw` is the string form of a list of dicts, e.g.
    "[{'id': 18, 'name': 'Drama'}]" with key 'name' gives ['Drama'].
    A non-string `raw` (such as NaN for a missing cell) gives an empty list
    instead of making re.findall raise a TypeError.
    """
    if not isinstance(raw, str):
        return []
    return re.findall(r"'" + re.escape(key) + r"':\s'(.*?)'", raw)


ml_meta_df.reset_index(drop=True, inplace=True)

# Records are collected in a plain list and turned into a DataFrame once at the
# end: DataFrame.append in a loop is quadratic and was removed in pandas 2.0.
feature_rows = []
newDF_dict = {}

for movie in ml_meta_df.itertuples(index=False):
    movie_id = movie.imdb_id

    # genres, with 'Adult' treated as an additional genre
    genres = _quoted_values('name', movie.genres)
    if movie.adult == 'True':
        genres.append('Adult')

    # binary labels: each numeric column is split at its mean
    popularity = 'pop' if movie.popularity > mean_pop else 'notpop'
    revenue = 'high' if movie.revenue > mean_rev else 'low'
    runtime = 'long' if movie.runtime > mean_rt else 'short'

    # movie year: old or new
    release_year = datetime.strptime(movie.release_date, '%Y-%m-%d').year
    year = 'new' if release_year > NEW_MOVIE_AFTER_YEAR else 'old'

    # original language is not used; spoken language is used instead because the
    # two features are correlated
    languages = _quoted_values('iso_639_1', movie.spoken_languages)

    # production countries: US, UK, etc.
    countries = _quoted_values('iso_3166_1', movie.production_countries)

    # dictionary of features where the movie id is the key
    # (a repeated imdb_id overwrites the earlier entry)
    newDF_dict[movie_id] = {
        'genres': genres,
        'popularity': popularity,
        'year': year,
        'revenue': revenue,
        'runtime': runtime,
        'language': languages,
        'production_countries': countries,
    }

    # every label becomes its own (movieid, feature) row
    labels = genres + [popularity, year, revenue, runtime] + languages + countries
    feature_rows.extend({'movieid': movie_id, 'feature': label} for label in labels)

newDF = pd.DataFrame(feature_rows, columns=['movieid', 'feature'])

# SAVE AS DICTIONARY (optional):
# np.save('./data/ML-26M/movieid_genre_dictionary.npy', newDF_dict)
# read_dictionary = np.load('./data/ML-26M/movieid_genre_dictionary.npy', allow_pickle='TRUE').item()

# SAVE AS DATAFRAME
newDF.to_csv('./data/ML-26M/movieid_feature_df.csv', index=None)

In [124]:
# Pivot the long (movieid, feature) table into one row per movie and one column
# per feature label; each cell holds how often that pair occurs.
movie_genre_matrix = pd.crosstab(
    index=newDF['movieid'],
    columns=newDF['feature'],
)
movie_genre_matrix.head()

feature,AE,AF,AL,AM,AN,AO,AQ,AR,AT,AU,...,uk,ur,uz,vi,wo,xh,xx,yi,zh,zu
movieid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0000001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
tt0000003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
tt0000005,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
tt0000008,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
tt0000010,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [125]:
# Write the movie-by-feature matrix to disk (the movieid index is kept as the first column).
movie_genre_matrix.to_csv('./data/ML-26M/movie_genre_matrix.csv')

# success!!

### now we want to calculate the tolerance weight?

Write a function that returns the number of distinct labels in a feature. Expected counts per feature:

- genres 21
- popularity 2
- year 2
- revenue 2
- runtime 2
- language 133
- production_countries 161

In [None]:
def label_count(a_feature, feature_dict=None):
    """Return the number of distinct labels observed for one feature group.

    For example, if the movies' 'genres' lists contain only Adult, Thriller and
    Drama, then label_count('genres') is 3.

    Parameters
    ----------
    a_feature : str
        Name of a feature group as stored in the per-movie dictionaries, e.g.
        'genres', 'popularity', 'year', 'revenue', 'runtime', 'language' or
        'production_countries'.
    feature_dict : dict, optional
        Mapping of movie id -> {feature group -> label or list of labels}.
        Defaults to the notebook-level `newDF_dict`.

    Returns
    -------
    int
        Number of distinct labels; 0 when no movie carries `a_feature`.
    """
    if feature_dict is None:
        feature_dict = newDF_dict

    labels = set()
    for movie_features in feature_dict.values():
        value = movie_features.get(a_feature, [])
        if isinstance(value, str):
            # single-valued groups (popularity, year, ...) store one plain string
            labels.add(value)
        else:
            # multi-valued groups (genres, language, ...) store a list of strings
            labels.update(value)

    n_labels = len(labels)
    return n_labels

In [None]:
# def tolerance_weight(userid):



# multiplying weights by this matrix