In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder

import warnings
# NOTE(review): with no category given, this silences every warning for the
# rest of the session, including deprecation warnings from pandas.
warnings.filterwarnings('ignore')

# Directory the raw CSV files are read from (relative path).
DATA_DIR = '../data/raw/'

In [2]:
# Load the four raw CSV files from DATA_DIR.
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'))
# words.csv is decoded as ISO-8859-1 (Latin-1) instead of the default encoding.
words = pd.read_csv(os.path.join(DATA_DIR, 'words.csv'), encoding='ISO-8859-1')
users = pd.read_csv(os.path.join(DATA_DIR, 'users.csv'))

In [3]:
def get_missing_value_features(df):
    """List the columns of ``df`` that contain at least one null value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    list
        Column labels, in column order, for which some entry is null.
        Empty when no column has a null.
    """
    # One boolean per column: True when that column holds any null.
    has_null = df.isnull().any()
    return [column for column, flagged in has_null.items() if flagged]

In [4]:
# Fill missing values: every null in `words` becomes the sentinel -999.
# This runs before prepare_count_features, so the fillna('') inside that
# function finds no nulls left to replace.
words = words.fillna(-999)

In [5]:
def prepare_count_features(df, features=('HEARD_OF', 'OWN_ARTIST_MUSIC')):
    """Frequency-encode categorical columns.

    Each listed column is replaced by the number of rows that share that
    row's value. Missing entries are first turned into the empty string so
    they are counted together as one category.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``features``.
    features : iterable of str, optional
        Columns to encode. Defaults to ``HEARD_OF`` and ``OWN_ARTIST_MUSIC``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the listed columns replaced by counts. The
        frame passed in is left unmodified.

    Raises
    ------
    KeyError
        If a name in ``features`` is not a column of ``df``.
    """
    # Work on a copy so the caller's frame is not changed behind its back.
    encoded = df.copy()

    for feature in features:
        filled = encoded[feature].fillna('')
        # value_counts() is indexed by category, so mapping through it looks
        # up each row's frequency without a per-row Python callback.
        encoded[feature] = filled.map(filled.value_counts())

    return encoded

# Swap HEARD_OF and OWN_ARTIST_MUSIC for the frequency of each row's value.
# NOTE(review): not safe to re-run on its own - a second pass would count the
# counts. Reload `words` before running this cell again.
words = prepare_count_features(words)

### User Features

In [6]:
# Which user features have missing values?
features_with_missing_values = get_missing_value_features(users)
print('Features with missing values ', features_with_missing_values)

Features with missing values  ['AGE', 'WORKING', 'REGION', 'LIST_OWN', 'LIST_BACK', 'Q16', 'Q18', 'Q19']


In [7]:
def fill_missing_values(df, missing_features,
                        numeric_features=('AGE', 'Q16', 'Q18', 'Q19'),
                        numeric_fill=-999, categorical_fill=''):
    """Replace nulls in the given columns with sentinel values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``missing_features``.
    missing_features : iterable of str
        Columns whose nulls should be filled.
    numeric_features : iterable of str, optional
        Columns filled with ``numeric_fill``; every other column in
        ``missing_features`` is filled with ``categorical_fill``.
    numeric_fill : scalar, optional
        Sentinel marking a missing numeric value (default ``-999``).
    categorical_fill : scalar, optional
        Sentinel marking a missing categorical value (default ``''``).

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the requested columns filled. The frame passed
        in is left unmodified.

    Raises
    ------
    KeyError
        If a name in ``missing_features`` is not a column of ``df``.
    """
    numeric = set(numeric_features)
    # Work on a copy so the caller's frame is not changed behind its back.
    filled = df.copy()

    for feature in missing_features:
        fill_value = numeric_fill if feature in numeric else categorical_fill
        filled[feature] = filled[feature].fillna(fill_value)

    return filled

# Fill the gaps listed above: -999 for AGE/Q16/Q18/Q19, '' for the other columns.
users = fill_missing_values(users, features_with_missing_values)

In [8]:
def parse_music_pref(df, feature_name):
    """Extract the first integer found in each entry of a text column.

    Entries with no digits (including missing values) become ``0``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``feature_name``.
    feature_name : str
        Column to parse.

    Returns
    -------
    pandas.Series
        Integer series aligned with ``df``'s index. Every element is an int,
        rather than a mix of digit strings and an integer ``0``.
    """
    # Casting to str lets nulls and already-parsed numbers go through the
    # same path, so calling this again on its own output is harmless.
    answers = df[feature_name].astype(str)
    first_number = answers.str.extract(r'(\d+)', expand=False)
    # No match -> NaN -> 0; matches arrive as digit strings and become ints.
    return pd.to_numeric(first_number, errors='coerce').fillna(0).astype(int)

# Reduce the LIST_OWN / LIST_BACK answers to the first run of digits they
# contain (0 when there is none).
users['LIST_OWN'] = parse_music_pref(users, 'LIST_OWN')
users['LIST_BACK'] = parse_music_pref(users, 'LIST_BACK')

In [9]:
def encode_features(df, feature_names):
    """Label-encode the given columns of ``df`` and return the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``feature_names``. Its
        columns are overwritten with the encoded values.
    feature_names : iterable of str
        Columns to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with each listed column replaced by the output of a
        ``LabelEncoder`` fitted on that column.
    """
    for column in feature_names:
        # A fresh encoder per column, fitted only on that column's values.
        df[column] = LabelEncoder().fit_transform(df[column])

    return df

# Replace the categorical user columns with their label-encoded values.
users = encode_features(users, ['GENDER', 'WORKING', 'REGION', 'MUSIC'])

In [10]:
users.head()

Unnamed: 0,RESPID,GENDER,AGE,WORKING,REGION,MUSIC,LIST_OWN,LIST_BACK,Q1,Q2,...,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19
0,36927,0,60.0,7,6,2,1,0,49.0,50.0,...,50.0,50.0,71.0,52.0,71.0,9.0,7.0,72.0,49.0,26.0
1,3566,0,36.0,4,6,2,1,1,55.0,55.0,...,12.0,65.0,65.0,80.0,79.0,51.0,31.0,68.0,54.0,33.0
2,20054,0,52.0,1,2,0,1,0,11.0,50.0,...,50.0,94.0,51.0,74.0,66.0,27.0,46.0,73.0,8.0,31.0
3,41749,0,40.0,2,6,5,2,3,81.0,80.0,...,76.0,74.0,64.0,73.0,85.0,61.0,77.0,76.0,78.0,88.0
4,23108,0,16.0,5,3,5,3,6,76.0,79.0,...,56.0,13.0,82.0,79.0,68.0,71.0,-999.0,86.0,80.0,32.0


In [11]:
words.head()

Unnamed: 0,Artist,User,HEARD_OF,OWN_ARTIST_MUSIC,LIKE_ARTIST,Uninspired,Sophisticated,Aggressive,Edgy,Sociable,...,Unoriginal,Dated,Iconic,Unapproachable,Classic,Playful,Arrogant,Warm,Soulful,Unnamed: 87
0,47,45969,22878,84794,-999.0,-999.0,0.0,-999.0,0,0.0,...,-999.0,0.0,-999.0,-999.0,0.0,-999.0,-999.0,0,0.0,-999.0
1,35,29118,61892,84794,-999.0,0.0,-999.0,0.0,0,-999.0,...,0.0,0.0,-999.0,0.0,0.0,0.0,0.0,0,-999.0,-999.0
2,14,31544,22878,84794,-999.0,0.0,-999.0,0.0,0,-999.0,...,0.0,0.0,-999.0,0.0,0.0,0.0,0.0,0,-999.0,-999.0
3,23,18085,61892,84794,-999.0,-999.0,-999.0,0.0,0,-999.0,...,0.0,0.0,-999.0,0.0,0.0,0.0,0.0,0,-999.0,-999.0
4,23,18084,61892,84794,-999.0,-999.0,-999.0,0.0,0,-999.0,...,0.0,0.0,-999.0,0.0,0.0,0.0,0.0,0,-999.0,-999.0


### Feature Engineering

**Feature List**

**Features based on the user**

* GENDER
* AGE
* MUSIC
* REGION
* LIST_OWN
* LIST_BACK
* Responses to the survey questions (Q1–Q19)


**Features based on the artist**

* HEARD_OF
* OWN_ARTIST_MUSIC
* LIKE_ARTIST
* Characteristics of the songs sung by the artist (Edgy, Uninspired, etc.)

**Features based on the user–artist pair**

* Mean Artist Rating
* Min Artist Rating
* Max Artist Rating
* Median Artist Rating
* Mean User Rating
* Min User Rating
* Max User Rating
* Median User Rating
* Mean Rating given to Artist by a User
* Min Rating given to Artist by a User
* Max Rating given to Artist by a User
* Median Rating given to Artist by a User

In [2]:
# Preview the training set. `train` is created by the load cell near the top;
# in a fresh kernel that cell must run first or this raises NameError.
train.head()

NameError: name 'train' is not defined