In [1]:
# import dependencies
import pandas as pd
import nltk

# will need to uncomment the following the first time you run
# nltk.download('punkt')

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [4]:
r_cols = ["user id", "item id", "rating", "timestamp"]
ratings = pd.read_csv('../ml-100k/u.data', sep='\t', names=r_cols,encoding='latin-1')
ratings.head()

Unnamed: 0,user id,item id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
u_cols = ["user id", "age", "gender", "occupation", "zip code"]
users = pd.read_csv('../ml-100k/u.user', sep='|', names=u_cols,encoding='latin-1')
users.head()

Unnamed: 0,user id,age,gender,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [11]:
i_cols = ["item id", "movie title", "release date", "video release date", "IMDb URL", "unknown", "action", "adventure", "animation", "children's", "comedy", "crime", "documentary", "drama", "fantasy", "film-noir", "horror", "musical", "mystery", "romance", "sci-fi", "thriller", "war", "western"]
items = pd.read_csv('../ml-100k/u.item', sep='|', names=i_cols,encoding='latin-1')
items.head()

Unnamed: 0,item id,movie title,release date,video release date,IMDb URL,unknown,action,adventure,animation,children's,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [33]:
# Make each user id a unique row?

# merge movies and ratings
df = pd.merge(ratings, movies, on="movieId")
df.sort_values("userId").head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
12117,1,2389,2.0,964983094,Psycho (1998),Crime|Horror|Thriller
12126,1,2395,5.0,964981093,Rushmore (1998),Comedy|Drama
12182,1,2406,4.0,964982310,Romancing the Stone (1984),Action|Adventure|Comedy|Romance
776,1,70,3.0,964982400,From Dusk Till Dawn (1996),Action|Comedy|Horror|Thriller


In [34]:
df['year'] = df['title'].str.extract('.*\((.*)\).*')
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995


In [35]:
# prep the year column to convert to int
df = df.replace(to_replace='2006–2007', value='2007')
df = df.fillna(0)
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
3,15,1,2.5,1510577970,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995
4,17,1,4.5,1305696483,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995


In [36]:
# split the genres out from the pipe delimeters and extract year from title into new column
df['genres'] = df['genres'].str.lower()
df['genres'] = df['genres'].str.split("|")

In [37]:
df['genres']=[" ".join(word) for word in df['genres'].values]

In [39]:
v = TfidfVectorizer()
x = v.fit_transform(df['genres'])
df['genresVect'] = list(x.toarray())
df.head()

Unnamed: 0,userId,movieId,rating,timestamp,title,genres,year,genresVect
0,1,1,4.0,964982703,Toy Story (1995),adventure animation children comedy fantasy,1995,"[0.0, 0.3638850174997462, 0.5497352121009116, ..."
1,5,1,4.0,847434962,Toy Story (1995),adventure animation children comedy fantasy,1995,"[0.0, 0.3638850174997462, 0.5497352121009116, ..."
2,7,1,4.5,1106635946,Toy Story (1995),adventure animation children comedy fantasy,1995,"[0.0, 0.3638850174997462, 0.5497352121009116, ..."
3,15,1,2.5,1510577970,Toy Story (1995),adventure animation children comedy fantasy,1995,"[0.0, 0.3638850174997462, 0.5497352121009116, ..."
4,17,1,4.5,1305696483,Toy Story (1995),adventure animation children comedy fantasy,1995,"[0.0, 0.3638850174997462, 0.5497352121009116, ..."
5,18,1,3.5,1455209816,Toy Story (1995),adventure animation children comedy fantasy,1995,"[0.0, 0.3638850174997462, 0.5497352121009116, ..."
6,19,1,4.0,965705637,Toy Story (1995),adventure animation children comedy fantasy,1995,"[0.0, 0.3638850174997462, 0.5497352121009116, ..."
7,21,1,3.5,1407618878,Toy Story (1995),adventure animation children comedy fantasy,1995,"[0.0, 0.3638850174997462, 0.5497352121009116, ..."
8,27,1,3.0,962685262,Toy Story (1995),adventure animation children comedy fantasy,1995,"[0.0, 0.3638850174997462, 0.5497352121009116, ..."
9,31,1,5.0,850466616,Toy Story (1995),adventure animation children comedy fantasy,1995,"[0.0, 0.3638850174997462, 0.5497352121009116, ..."


In [23]:
# convert the tags or the genres to numerical using hashing?

In [24]:
# convert to int
df['rating'] = df['rating'].astype(int) 
target = df["rating"]

In [25]:
df['userId'] = df['userId'].astype(int) 
df['movieId'] = df['movieId'].astype(int) 
df['year'] = df['year'].astype(int) 

In [41]:
# data = df[["userId", "movieId", "year", "genresVect"]]
data = df[["genresVect"]]
feature_names = data.columns
data.head()

Unnamed: 0,genresVect
0,"[0.0, 0.3638850174997462, 0.5497352121009116, ..."
1,"[0.0, 0.3638850174997462, 0.5497352121009116, ..."
2,"[0.0, 0.3638850174997462, 0.5497352121009116, ..."
3,"[0.0, 0.3638850174997462, 0.5497352121009116, ..."
4,"[0.0, 0.3638850174997462, 0.5497352121009116, ..."


In [42]:
X_train, X_test, y_train, y_test = train_test_split(data, target, random_state=42)

In [43]:
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

ValueError: setting an array element with a sequence.

In [None]:
importances = rf.feature_importances_
importances

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

In [None]:
sns.barplot(x=importances, y=feature_names)
plt.xlabel('Feature Importance Score')
plt.ylabel('Features')
plt.title("Features by Importance")
plt.legend()
plt.show()