In [5]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

In [6]:
# Load the raw ratings file; column names are supplied here, so the first
# line of the file is treated as data rather than as a header.
users = pd.read_csv(
    'data/train.csv',
    header=None,
    names=['item', 'user_id', 'rating'],
)

In [7]:
# Rows whose rating is the literal '?' form the test set; every other row
# (including rows with a missing rating) goes to train.
is_unrated = users['rating'] == '?'

# Take real copies instead of mutating filtered slices of `users` in place;
# the in-place drop on a slice is what triggers SettingWithCopyWarning.
train = users[~is_unrated].copy()
test = users[is_unrated].drop(['rating'], axis=1)

# Single-string print gives the same text on Python 2 and Python 3.
print('%s %s' % (train.shape, test.shape))
train.head()

(6311487, 3) (2103828, 2)


Unnamed: 0,item,user_id,rating
0,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A079789010EVSPIBCSWFO,6
1,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A13U02TNYRFNOI,6
2,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A1QYORNO0GY308,5
3,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A1SUU1QIRDZXJC,1
4,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A2ELH6CUC5Y8J4,3


In [6]:
# number of distinct user ids in the full (train + test) frame
np.unique(users['user_id']).size

9923

In [11]:
# number of distinct item ids in the full (train + test) frame
np.unique(users['item']).size

393522

### Let's Check Out the "Bad Data"

In [13]:
# "Bad" rows are the ones whose rating is missing altogether.
rating_is_missing = users['rating'].isnull()
bad_data = users.loc[rating_is_missing]
print('number of bad data %d' % len(bad_data))
bad_data.head()

bad data 17067


Unnamed: 0,item,user_id,rating
8285821,notfunny10,A1ENHC1UQ80L8X,
8285823,notfunny10,A1EU25HIF6NX3E,
8285832,notfunny10,A1JK07FS8SG422,
8285866,notfunny10,A1VGV9T3P70QK7,
8285869,notfunny10,A1XGSYVGKSVGLC,


In [15]:
# ten items that occur most often among the rows with a missing rating
bad_data['item'].value_counts().head(10)

notfunny61    482
notfunny54    441
notfunny27    402
notfunny64    399
notfunny44    395
notfunny56    384
notfunny63    372
notfunny52    372
notfunny28    366
notfunny51    365
Name: item, dtype: int64

In [4]:
# frequency of every distinct rating value in train (exposes malformed entries)
train['rating'].value_counts()

6     1571309
7     1177383
5      884021
4      685633
3      664803
8      613266
9      246762
2      219274
1      163605
10      68360
1;          1
1o          1
1\          1
4.          1
Name: rating, dtype: int64

### Clean Rating Column

In [5]:
# clean rating
# The rating column still holds strings at this point (it is compared against
# '?' when the split is made, and value_counts shows typos such as '1;').
# Remove the known malformed entries, then any row with a missing value.
malformed_ratings = ['1;', '1o', '1\\', '4.']
has_malformed_rating = train['rating'].isin(malformed_ratings)

# dropna() is used in its non-inplace form: the original passed the *string*
# 'True' for `inplace`, which is not a bool and is rejected by newer pandas.
# copy() makes the column assignment below act on an independent frame.
train = train[~has_malformed_rating].dropna().copy()
train['rating'] = train['rating'].astype(int)

### Clean Up `keywords-Aug2015.txt` to make it nicely formatted

In [7]:
# Get all feature names

# Every whitespace-separated token in the keywords file is read as key=value;
# collect the distinct keys in order of first appearance.
with open('data/keywords-Aug2015.txt', 'r') as myfile:
    # split() already treats newlines as separators. Stripping them first
    # (as the original did) glues the last token of one line onto the first
    # token of the next whenever a line has no trailing whitespace.
    tokens = myfile.read().split()

column_names = []
seen_names = set()  # constant-time membership test; the list keeps the order
for token in tokens:
    name = token.split('=')[0]
    if name not in seen_names:
        seen_names.add(name)
        column_names.append(name)

# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(column_names)

['hash', 'movie_rating', 'animals', 'cute', 'family', 'food', 'religion', 'witty', 'irony', 'dirty', 'language', 'nerdy', 'popculture', 'pun', 'technology', 'explicit(language)', 'fail', 'kids', 'money', 'school', 'work', 'dark', 'explicit(graphic)', 'health', 'relationship', 'drugs/alcohol', 'history', 'slapstick', 'math', 'political', 'racial', 'music', 'sports', 'science']


In [8]:
# Create new dataFrame

# Load the keywords file into a frame with a single named column, 'info'.
# Tab is the field separator, which is what read_table uses by default.
data = pd.read_csv(
    'data/keywords-Aug2015.txt',
    sep='\t',
    header=None,
    names=['info'],
)

def create_df(row, feature):
    """Return the value recorded for ``feature`` in one keywords line.

    ``row`` is a whitespace-separated string of ``key=value`` tokens. The
    first token whose key is exactly ``feature`` wins, and the text between
    its first and second ``=`` is returned as a string. When no token has
    that key, the integer ``0`` is returned.

    Keys are compared for equality rather than by substring, so asking for
    ``language`` does not pick up an ``explicit(language)=...`` token, and
    tokens that contain no ``=`` are skipped instead of raising
    ``IndexError``.
    """
    for token in row.split():
        parts = token.split('=')
        # parts[0] is the key; require an '=' so parts[1] is guaranteed to exist
        if len(parts) > 1 and parts[0] == feature:
            return parts[1]
    return 0
    
# Add one column per feature name by parsing that feature out of every raw line.
for feature_name in column_names:
    data[feature_name] = data['info'].apply(create_df, args=(feature_name,))

# The raw text column is no longer needed once every feature has its own column.
del data['info']
data.head()

Unnamed: 0,hash,movie_rating,animals,cute,family,food,religion,witty,irony,dirty,...,relationship,drugs/alcohol,history,slapstick,math,political,racial,music,sports,science
0,3523919183160026723,1.0,0.83,1.0,0.16,0.16,0.5,0.16,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,14837447423332555806,1.5,0.83,0.16,0.0,0.0,0.0,0.16,0.33,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1805930636053085369,2.0,0.0,0.0,0.0,0.0,0.0,0.16,0.0,0.5,...,0,0,0,0,0,0,0,0,0,0
3,a3a5fd509ad51ba1d681bff1e38b59eaf4e5c019,1.6,0.0,0.0,0.0,0.0,0.0,0.0,0.33,0.0,...,0,0,0,0,0,0,0,0,0,0
4,18000053268349632107,2.8,0.0,0.0,0.0,0.0,0.0,0.2,0.4,0.0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# (rows, columns) of the parsed keywords frame
data.shape

(73247, 34)

### Save New DataSets to CSV

In [10]:
# The keywords table is written without a header row and without the index.
data.to_csv('data/new_keywords.csv', header=False, index=False)

# The train and test splits keep their column names but drop the index.
train.to_csv('data/new_train.csv', index=False)
test.to_csv('data/new_test.csv', index=False)

### To Int Data - Needed for Spark

In [None]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

df = pd.read_csv('data/new_train.csv')

# The test ids are loaded only to build the encoders. Fitting on train alone
# (while the test split is encoded by its own, separately fitted encoder)
# gives the same user or item a different integer in the two output files.
test_ids = pd.read_csv('data/new_test.csv')

# One encoder per id column, fitted on the ids of BOTH splits. A single shared
# encoder would have its item mapping overwritten by the user fit.
item_encoder = LabelEncoder().fit(pd.concat([df['item'], test_ids['item']]))
user_encoder = LabelEncoder().fit(pd.concat([df['user_id'], test_ids['user_id']]))

df['item_int'] = item_encoder.transform(df['item'])
df['user_int'] = user_encoder.transform(df['user_id'])

# Selecting the three output columns also discards the original string ids.
df = df[['user_int', 'item_int', 'rating']]
df.to_csv('data/new_int_train.csv', index=False)

In [10]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# One encoder per id column, fitted on the ids of BOTH splits. Fitting on the
# test rows alone gives the same user or item a different integer here than in
# an encoding fitted on the training rows.
item_encoder = LabelEncoder().fit(pd.concat([train['item'], test['item']]))
user_encoder = LabelEncoder().fit(pd.concat([train['user_id'], test['user_id']]))

# Work on an independent copy so the new columns are not written into a slice.
test = test.copy()
test['item_int'] = item_encoder.transform(test['item'])
test['user_int'] = user_encoder.transform(test['user_id'])

# The original string ids are kept next to the integer codes in this file.
test.to_csv('data/new_test_int_and_original.csv', index=False)
test.head()

Unnamed: 0,item,user_id,item_int,user_int
9,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A3SLPTXAYO9RVW,0,7255
10,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,A3UY0YF90X0XLI,0,7434
13,00009a90ce7ec2b3e52bbffe7388dbc5850b3a6a,AAT1C3DBIYJRX,0,8009
16,000150213d9711862314abfa413efebcbe3339bf,A18T7E73TNGOKP,1,639
22,000150213d9711862314abfa413efebcbe3339bf,A23TDYCT5HS5XV,1,2828


In [9]:
# (rows, columns) of the test frame
test.shape

(2103828, 2)

In [None]:
# Scratch cell (never executed). 2103828 equals the test row count shown by test.shape.
2103828
# NOTE(review): presumably the training row count after cleaning — confirm.
6294717