In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from collections import Counter
import tensorflow as tf

import os
import pickle
import re
from tensorflow.python.ops import math_ops

from matplotlib import pyplot as plt

import seaborn as sns

%matplotlib inline

In [47]:
users_title = ['UserID', 'Gender', 'Age', 'JobID', 'Zip-code']
users = pd.read_table('./ml-1m/users.dat', sep='::', header=None, names=users_title, engine = 'python')
users = users.filter(regex='UserID|Gender|Age|JobID')
users_orig = users.values
#改变User数据中性别和年龄
gender_map = {'F':0, 'M':1}
users['Gender'] = users['Gender'].map(gender_map)

age_map = {item:index for index,item in enumerate([1,18,25,35,45,50,56])}
users['Age'] = users['Age'].map(age_map)

In [48]:
#读取Movie数据集
movies_title = ['MovieID', 'Title', 'Genres']
movies = pd.read_table('./ml-1m/movies.dat', sep='::', header=None, names=movies_title, engine = 'python')
movies_orig = movies.values

In [49]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [50]:
#将Title中的年份去掉
pattern = re.compile(r'^(.*)\((\d+)\)$')
title_map = {item:pattern.match(item).group(1) for index,item in enumerate(set(movies['Title']))}
movies['Title'] = movies['Title'].map(title_map)

In [51]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story,Animation|Children's|Comedy
1,2,Jumanji,Adventure|Children's|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama
4,5,Father of the Bride Part II,Comedy


In [52]:
max_title_map = {item:len(item.split()) for index,item in enumerate(set(movies['Title']))}

max(max_title_map.items(), key=lambda x: x[1])

('Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) ',
 15)

In [53]:
title_map

{'Somewhere in Time (1980)': 'Somewhere in Time ',
 'Drop Dead Gorgeous (1999)': 'Drop Dead Gorgeous ',
 "Devil's Brigade, The (1968)": "Devil's Brigade, The ",
 'Prick Up Your Ears (1987)': 'Prick Up Your Ears ',
 'Algiers (1938)': 'Algiers ',
 'Sliver (1993)': 'Sliver ',
 'Thing, The (1982)': 'Thing, The ',
 'Smiling Fish and Goat on Fire (1999)': 'Smiling Fish and Goat on Fire ',
 'A Chef in Love (1996)': 'A Chef in Love ',
 'Year of the Horse (1997)': 'Year of the Horse ',
 'Paralyzing Fear: The Story of Polio in America, A (1998)': 'Paralyzing Fear: The Story of Polio in America, A ',
 'Fear of a Black Hat (1993)': 'Fear of a Black Hat ',
 'Mercury Rising (1998)': 'Mercury Rising ',
 'Detroit 9000 (1973)': 'Detroit 9000 ',
 'Morning After, The (1986)': 'Morning After, The ',
 'Fifth Element, The (1997)': 'Fifth Element, The ',
 'Shall We Dance? (Shall We Dansu?) (1996)': 'Shall We Dance? (Shall We Dansu?) ',
 "Pot O' Gold (1941)": "Pot O' Gold ",
 'This World, Then the Fireworks (

In [54]:
#电影类型转数字字典
genres_set = set()
for item in movies['Genres'].str.split('|'):
    genres_set.update(item)

genres_set.add('<PAD>')
genres2int = {item:index for index,item in enumerate(genres_set)}

In [55]:
genres2int

{'<PAD>': 8,
 'Action': 2,
 'Adventure': 1,
 'Animation': 11,
 "Children's": 15,
 'Comedy': 6,
 'Crime': 12,
 'Documentary': 10,
 'Drama': 9,
 'Fantasy': 0,
 'Film-Noir': 5,
 'Horror': 4,
 'Musical': 14,
 'Mystery': 7,
 'Romance': 17,
 'Sci-Fi': 18,
 'Thriller': 3,
 'War': 16,
 'Western': 13}

In [56]:
#将电影类型转成等长数字列表，长度是18
genres_map = {item:[genres2int[row] for row in item.split('|')] for index,item in enumerate(set(movies['Genres']))}

for key in genres_map:
    for cnt in range(max(genres2int.values()) - len(genres_map[key])):
        genres_map[key].insert(len(genres_map[key]) + cnt,genres2int['<PAD>'])

movies['Genres'] = movies['Genres'].map(genres_map)

In [57]:
#电影Title转数字字典
title_set = set()
for val in movies['Title'].str.split():
    title_set.update(val)

title_set.add('<PAD>')
title2int = {val:ii for ii, val in enumerate(title_set)}

In [58]:
title2int

{'Check': 0,
 'May': 1,
 'Crocodile': 2,
 'Apart': 3,
 '60': 4,
 'pastorale,': 5,
 'Avengers,': 6,
 'de': 7,
 'k�ldum': 8,
 '(Et': 9,
 'Lian)': 10,
 'Mating': 11,
 'Total': 12,
 'Greenwich': 13,
 'Cuts': 14,
 '(Saimt': 15,
 'Trip': 16,
 'Valiant': 17,
 'Cold': 18,
 'Chungking': 19,
 'Harriet': 20,
 'Storm,': 21,
 'Whatever': 22,
 'Behavior': 23,
 'Stop': 24,
 "Cat's": 25,
 'Pajama': 26,
 'Strikes': 27,
 'Team': 28,
 'Case,': 29,
 'Extremities': 30,
 'Sacrifice': 31,
 'Shane': 32,
 "Farmer's": 33,
 'Morning': 34,
 'Douce': 35,
 'Uninvited': 36,
 'Odyssey,': 37,
 'Enterprise': 38,
 '(Rosso': 39,
 'Dazed': 40,
 'Roma': 41,
 "Pete's": 42,
 'Couple,': 43,
 'Cider': 44,
 'Angela': 45,
 'Dollhouse': 46,
 'Drugstore': 47,
 'Committed': 48,
 'Ross': 49,
 'Gun,': 50,
 '3-D': 51,
 'Making': 52,
 "McHale's": 53,
 'Grape': 54,
 'Two': 55,
 'Secreto)': 56,
 'Maybe,': 57,
 'Tombstone': 58,
 'Sexual': 59,
 'Honey,': 60,
 'Lifeboat': 61,
 'Steam:': 62,
 'Soft': 63,
 'Guess': 64,
 'Brother,': 65,
 'Chin

In [59]:
#将电影Title转成等长数字列表，长度是15
title_count = 15
title_map = {val:[title2int[row] for row in val.split()] for ii,val in enumerate(set(movies['Title']))}

for key in title_map:
    for cnt in range(title_count - len(title_map[key])):
        title_map[key].insert(len(title_map[key]) + cnt,title2int['<PAD>'])

movies['Title'] = movies['Title'].map(title_map)

In [60]:
movies['Title'][100]

[4115,
 4975,
 4982,
 4982,
 4982,
 4982,
 4982,
 4982,
 4982,
 4982,
 4982,
 4982,
 4982,
 4982,
 4982]

In [61]:
movies_orig[100]

array([102, 'Mr. Wrong (1996)', 'Comedy'], dtype=object)

In [62]:
list(title2int.keys())[list(title2int.values()).index(2870)]

'Mission'

In [63]:
list(title2int.keys())[list(title2int.values()).index(3789)]

'Arizona'

In [64]:
list(title2int.keys())[list(title2int.values()).index(3476)]

'Chips'

In [65]:
def load_data():
    """
    Load Dataset from File
    """
    # 读取User数据
    users_title = ['UserID', 'Gender', 'Age', 'JobID', 'Zip-code']
    users = pd.read_table('./ml-1m/users.dat', sep='::', header=None, names=users_title, engine = 'python')
    users = users.filter(regex='UserID|Gender|Age|JobID')
    users_orig = users.values
    # 改变User数据中性别和年龄
    gender_map = {'F':0, 'M':1}
    users['Gender'] = users['Gender'].map(gender_map)

    age_map = {item:index for index,item in enumerate([1,18,25,35,45,50,56])}
    users['Age'] = users['Age'].map(age_map)

    # 读取Movie数据集
    movies_title = ['MovieID', 'Title', 'Genres']
    movies = pd.read_table('./ml-1m/movies.dat', sep='::', header=None, names=movies_title, engine = 'python')
    movies_orig = movies.values
    # 将Title中的年份去掉
    pattern = re.compile(r'^(.*)\((\d+)\)$')

    title_map = {item:pattern.match(item).group(1) for index,item in enumerate(set(movies['Title']))}
    movies['Title'] = movies['Title'].map(title_map)

    # 电影类型转数字字典
    genres_set = set()
    for item in movies['Genres'].str.split('|'):
        genres_set.update(item)

    genres_set.add('<PAD>')
    genres2int = {item:index for index,item in enumerate(genres_set)}

    # 将电影类型转成等长数字列表，长度是18
    genres_map = {item:[genres2int[row] for row in item.split('|')] for index,item in enumerate(set(movies['Genres']))}

    for key in genres_map:
        for cnt in range(max(genres2int.values()) - len(genres_map[key])):
            genres_map[key].insert(len(genres_map[key]) + cnt,genres2int['<PAD>'])
    
    movies['Genres'] = movies['Genres'].map(genres_map)

    # 电影Title转数字字典
    title_set = set()
    for val in movies['Title'].str.split():
        title_set.update(val)
    
    title_set.add('<PAD>')
    title2int = {val:ii for ii, val in enumerate(title_set)}

    # 将电影Title转成等长数字列表，长度是15
    title_count = 15
    title_map = {item:[title2int[row] for row in item.split()] for index,item in enumerate(set(movies['Title']))}
    
    for key in title_map:
        for cnt in range(title_count - len(title_map[key])):
            title_map[key].insert(len(title_map[key]) + cnt,title2int['<PAD>'])
    
    movies['Title'] = movies['Title'].map(title_map)

    # 读取评分数据集
    ratings_title = ['UserID','MovieID', 'ratings', 'timestamps']
    ratings = pd.read_table('./ml-1m/ratings.dat', sep='::', header=None, names=ratings_title, engine = 'python')
    ratings = ratings.filter(regex='UserID|MovieID|ratings')

    # 合并三个表
    data = pd.merge(pd.merge(ratings, users), movies)
    
    # 将数据分成X和y两张表
    target_fields = ['ratings']
    features_pd, targets_pd = data.drop(target_fields, axis=1), data[target_fields]
    
    features = features_pd.values
    targets_values = targets_pd.values
    
    return title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig

In [66]:
title_count, title_set, genres2int, features, targets_values, ratings, users, movies, data, movies_orig, users_orig = load_data()

In [67]:
data.head()

Unnamed: 0,UserID,MovieID,ratings,Gender,Age,JobID,Title,Genres
0,1,1193,5,0,0,10,"[2077, 3571, 3400, 1316, 379, 3314, 4982, 4982...","[9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ..."
1,2,1193,5,1,6,16,"[2077, 3571, 3400, 1316, 379, 3314, 4982, 4982...","[9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ..."
2,12,1193,4,1,2,12,"[2077, 3571, 3400, 1316, 379, 3314, 4982, 4982...","[9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ..."
3,15,1193,4,1,2,7,"[2077, 3571, 3400, 1316, 379, 3314, 4982, 4982...","[9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ..."
4,17,1193,5,1,5,1,"[2077, 3571, 3400, 1316, 379, 3314, 4982, 4982...","[9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, ..."


In [68]:
features

array([[1, 1193, 0, ..., 10,
        [2077, 3571, 3400, 1316, 379, 3314, 4982, 4982, 4982, 4982, 4982, 4982, 4982, 4982, 4982],
        [9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]],
       [2, 1193, 1, ..., 16,
        [2077, 3571, 3400, 1316, 379, 3314, 4982, 4982, 4982, 4982, 4982, 4982, 4982, 4982, 4982],
        [9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]],
       [12, 1193, 1, ..., 12,
        [2077, 3571, 3400, 1316, 379, 3314, 4982, 4982, 4982, 4982, 4982, 4982, 4982, 4982, 4982],
        [9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]],
       ..., 
       [5780, 2845, 1, ..., 17,
        [4419, 2803, 4982, 4982, 4982, 4982, 4982, 4982, 4982, 4982, 4982, 4982, 4982, 4982, 4982],
        [9, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]],
       [5851, 3607, 0, ..., 20,
        [2077, 474, 2453, 4982, 4982, 4982, 4982, 4982, 4982, 4982, 4982, 4982, 4982, 4982, 4982],
        [6, 9, 13, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8]],
       [5938,

In [69]:
features.take(0,1)

array([1, 2, 12, ..., 5780, 5851, 5938], dtype=object)

In [None]:
# Count Plot (a.k.a. Bar Plot)
sns.countplot(x='Type 1', data=df, palette=None)
 
# Rotate x-labels
plt.xticks(rotation=-45)