In [2]:
import pandas as pd
import numpy as np
import datetime as dt

**Read Datasets**

In [3]:
df_movies = pd.read_csv("archive/movies_metadata.csv")
df_ratings = pd.read_csv("archive/ratings_small.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [5]:
df_ratings.shape

(100004, 4)

In [4]:
df_movies.shape

(45466, 24)

In [5]:
df_movies.columns,df_movies.dtypes

(Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
        'imdb_id', 'original_language', 'original_title', 'overview',
        'popularity', 'poster_path', 'production_companies',
        'production_countries', 'release_date', 'revenue', 'runtime',
        'spoken_languages', 'status', 'tagline', 'title', 'video',
        'vote_average', 'vote_count'],
       dtype='object'),
 adult                     object
 belongs_to_collection     object
 budget                    object
 genres                    object
 homepage                  object
 id                        object
 imdb_id                   object
 original_language         object
 original_title            object
 overview                  object
 popularity                object
 poster_path               object
 production_companies      object
 production_countries      object
 release_date              object
 revenue                  float64
 runtime                  float64
 spoken_lan

In [6]:
#Clean the data
df_ratings = df_ratings.drop("timestamp",axis=1)
df_ratings["rating"] = df_ratings["rating"].astype("int")
df_movies = df_movies[df_movies["id"].str.isnumeric()]
df_movies["id"] = df_movies["id"].astype("int")

#How should we populate missing dates???
df_movies.release_date = df_movies.release_date.fillna('1989-12-31')
df_movies = df_movies.drop(df_movies[pd.to_datetime(df_movies.release_date,errors="coerce").isna()].index)
df_movies.release_date = pd.to_datetime(df_movies.release_date)


In [7]:
#Merge the two datasets using an inner to prevent orphaned ids??
df_movies = df_movies.merge(df_ratings,left_on="id",right_on="movieId",how="inner")

In [6]:
df_movies.shape

(45466, 24)

In [9]:
#Create the ratings.csv for librec-auto
df_ratings = df_movies[["userId","movieId","rating"]]
df_ratings.to_csv("ratings.csv",index=None,header=False)

In [10]:
#creating the protected feature bins
avg_vote = df_movies.vote_count.mean()
avg_rev = df_movies.revenue.mean()
avg_runtime = df_movies.runtime.mean()
cutoff_date = dt.datetime.strptime("1990","%Y")

In [11]:
#create dummies for each protected feature
df_pop = pd.get_dummies(df_movies.vote_count.transform( lambda x: "popular" if x >= avg_vote else "unpopular"))
df_date =  pd.get_dummies(df_movies.release_date.transform( lambda x: "old" if x >= cutoff_date else "new"))
df_rev =  pd.get_dummies(df_movies.revenue.transform( lambda x: "higher_revenue" if x >= avg_rev else "lower_revenue"))
df_len =  pd.get_dummies(df_movies.runtime.transform( lambda x: "longer" if x >= avg_runtime else "short"))

In [12]:
df_movies.id, df_date

(0          949
 1          949
 2          949
 3          949
 4          949
          ...  
 44989    64197
 44990    64197
 44991    64197
 44992    98604
 44993    49280
 Name: id, Length: 44994, dtype: int32,
        new  old
 0        0    1
 1        0    1
 2        0    1
 3        0    1
 4        0    1
 ...    ...  ...
 44989    0    1
 44990    0    1
 44991    0    1
 44992    0    1
 44993    1    0
 
 [44994 rows x 2 columns])

In [None]:
#create full item feature table
df_item_features = pd.concat([df_movies["id"],df_pop,df_date,df_rev,df_len], axis=1)
df_item_features = df_item_features.set_index("id").stack().reset_index()
df_item_features.columns = ["item_id","feature","value"]
df_item_features.shape 

In [17]:
df_item_features = df_item_features[df_item_features.value == 1]
df_item_features = df_item_features.drop_duplicates()
df_item_features.shape

(11320, 3)

In [99]:
#Remove duplicates
# df_item_features = df_item_features[~df_item_features.duplicated()]
# df_item_features[~df_item_features.duplicated()].shape,df_item_features[df_item_features.duplicated()].shape

((22640, 3), (0, 3))

In [18]:
#Single out specifc feature
df_item_features = df_item_features[(df_item_features["feature"]=='old') | (df_item_features["feature"]=='new') ]


In [19]:
df_item_features.to_csv("item-features.csv",header=False,index=None)