## EMI Music Data Set

## Predict how much a user will like a new song

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%pylab inline

from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split, KFold, cross_val_score
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, f_regression
import xgboost as xgb

np.random.seed(221)

Populating the interactive namespace from numpy and matplotlib




In [2]:
# Load the four CSV inputs from the current working directory.
print("Reading Data....")
train, test, users = (pd.read_csv(name) for name in ("train.csv", "test.csv", "users.csv"))
# words.csv is read with an explicit ISO-8859-1 encoding instead of the default.
words = pd.read_csv("words.csv", encoding='ISO-8859-1')
print("Reading Data Completed")

Reading Data....
Reading Data Completed


In [3]:
def get_missing_value_features(df):
    """Return the names of the columns of ``df`` that contain at least one null.

    The names are returned as a plain list, in the column order of ``df``.
    """
    has_null = df.isnull().any()
    return [column for column, flag in has_null.items() if flag]

In [4]:
# Fill missing values: replace every NaN in `words` with the sentinel -999.
# NOTE(review): this runs before prepare_count_features, so HEARD_OF and
# OWN_ARTIST_MUSIC no longer contain NaN when that function calls fillna('').
words = words.fillna(-999)

In [5]:
def prepare_count_features(df, features=('HEARD_OF', 'OWN_ARTIST_MUSIC')):
    """Count-encode columns: replace each value by how often it occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode. It is not modified.
    features : iterable of str, optional
        Columns to encode. Defaults to 'HEARD_OF' and 'OWN_ARTIST_MUSIC'.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which every value of each listed column is replaced
        by the number of rows sharing that value. Nulls are treated as the
        empty string and counted like any other value.
    """
    # Work on a copy so the caller's frame survives and the cell can be re-run.
    encoded = df.copy()

    for feature in features:
        values = encoded[feature].fillna('')
        counts = values.value_counts().to_dict()
        # Every value is a key of `counts`, so the vectorised lookup is total.
        encoded[feature] = values.map(counts)

    return encoded

# Count-encode HEARD_OF and OWN_ARTIST_MUSIC in the words table.
# NOTE: re-running this cell encodes the already encoded columns again.
words = prepare_count_features(words)

### User Features

In [6]:
# Which columns of the users table contain missing values?
features_with_missing_values = get_missing_value_features(users)
print('Features with missing values ', features_with_missing_values)

Features with missing values  ['AGE', 'WORKING', 'REGION', 'LIST_OWN', 'LIST_BACK', 'Q16', 'Q18', 'Q19']


In [7]:
def fill_missing_values(df, missing_features):
    """Fill nulls in the listed columns of ``df`` in place and return ``df``.

    'AGE', 'Q16', 'Q18' and 'Q19' receive the numeric sentinel -999; every
    other listed column receives an empty string.
    """
    numeric_features = ('AGE', 'Q16', 'Q18', 'Q19')

    for feature in missing_features:
        # -999 marks a missing numeric value, '' a missing categorical one.
        placeholder = -999 if feature in numeric_features else ''
        df[feature] = df[feature].fillna(placeholder)

    return df

# Fill the user columns found above (numeric -> -999, everything else -> '').
users = fill_missing_values(users, features_with_missing_values)

In [8]:
def parse_music_pref(df, feature_name):
    """Extract the first run of digits from each entry of a string column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column.
    feature_name : str
        Name of a string column of ``df``.

    Returns
    -------
    pandas.Series
        Integer Series aligned with ``df``: the first number found in each
        entry, or 0 when the entry has no digits or is null.
    """
    # The earlier findall/lambda version returned int 0 mixed with digit strings
    # and raised on null entries; extract + to_numeric yields integers throughout.
    first_number = df[feature_name].str.extract(r'(\d+)', expand=False)
    return pd.to_numeric(first_number, errors='coerce').fillna(0).astype(int)

# Reduce the LIST_OWN / LIST_BACK strings to the first number they contain
# (0 when an entry has no digits).
users['LIST_OWN'] = parse_music_pref(users, 'LIST_OWN')
users['LIST_BACK'] = parse_music_pref(users, 'LIST_BACK')

In [9]:
def encode_features(df, feature_names):
    """Label-encode the listed columns of ``df`` in place and return ``df``.

    Each column is encoded independently with its own LabelEncoder fitted on
    that column's values.
    """
    for name in feature_names:
        # A fresh encoder per column, so codes are learned from that column alone.
        df[name] = LabelEncoder().fit_transform(df[name])

    return df

# Label-encode the categorical user columns.
users = encode_features(users, ['GENDER', 'WORKING', 'REGION', 'MUSIC'])

In [10]:
# Preview the users table after filling and encoding.
users.head()

Unnamed: 0,RESPID,GENDER,AGE,WORKING,REGION,MUSIC,LIST_OWN,LIST_BACK,Q1,Q2,...,Q10,Q11,Q12,Q13,Q14,Q15,Q16,Q17,Q18,Q19
0,36927,0,60.0,7,6,2,1,0,49.0,50.0,...,50.0,50.0,71.0,52.0,71.0,9.0,7.0,72.0,49.0,26.0
1,3566,0,36.0,4,6,2,1,1,55.0,55.0,...,12.0,65.0,65.0,80.0,79.0,51.0,31.0,68.0,54.0,33.0
2,20054,0,52.0,1,2,0,1,0,11.0,50.0,...,50.0,94.0,51.0,74.0,66.0,27.0,46.0,73.0,8.0,31.0
3,41749,0,40.0,2,6,5,2,3,81.0,80.0,...,76.0,74.0,64.0,73.0,85.0,61.0,77.0,76.0,78.0,88.0
4,23108,0,16.0,5,3,5,3,6,76.0,79.0,...,56.0,13.0,82.0,79.0,68.0,71.0,-999.0,86.0,80.0,32.0


In [11]:
# Preview the words table after filling and count-encoding.
words.head()

Unnamed: 0,Artist,User,HEARD_OF,OWN_ARTIST_MUSIC,LIKE_ARTIST,Uninspired,Sophisticated,Aggressive,Edgy,Sociable,...,Unoriginal,Dated,Iconic,Unapproachable,Classic,Playful,Arrogant,Warm,Soulful,Unnamed: 87
0,47,45969,22878,84794,-999.0,-999.0,0.0,-999.0,0,0.0,...,-999.0,0.0,-999.0,-999.0,0.0,-999.0,-999.0,0,0.0,-999.0
1,35,29118,61892,84794,-999.0,0.0,-999.0,0.0,0,-999.0,...,0.0,0.0,-999.0,0.0,0.0,0.0,0.0,0,-999.0,-999.0
2,14,31544,22878,84794,-999.0,0.0,-999.0,0.0,0,-999.0,...,0.0,0.0,-999.0,0.0,0.0,0.0,0.0,0,-999.0,-999.0
3,23,18085,61892,84794,-999.0,-999.0,-999.0,0.0,0,-999.0,...,0.0,0.0,-999.0,0.0,0.0,0.0,0.0,0,-999.0,-999.0
4,23,18084,61892,84794,-999.0,-999.0,-999.0,0.0,0,-999.0,...,0.0,0.0,-999.0,0.0,0.0,0.0,0.0,0,-999.0,-999.0


### Feature Engineering

**Feature List**

**Features based on the user**

* Gender
* Age
* Music
* REGION
* LIST_OWN
* LIST_BACK
* Response to different questions


**Features based on the artist**

* Heard_Of
* Own_artist_music
* Like Artist
* Characteristics of the songs sung by the artist ( Edgy, Uninspired etc. )

**Features based on the pair of user and artist**

* Mean Artist Rating
* Min Artist Rating
* Max Artist Rating
* Median Artist Rating
* Mean User Rating
* Min User Rating
* Max User Rating
* Median User Rating
* Mean Rating given to Artist by a User
* Min Rating given to Artist by a User
* Max Rating given to Artist by a User
* Median Rating given to Artist by a User

In [12]:
# How many distinct users appear in both the train and the test ratings?
print('Number of users that are present in both training and test set ', len(set(train.User) & set(test.User)))

Number of users that are present in both training and test set  44643


In [13]:
# How many distinct artists appear in both the train and the test ratings?
print('Number of artists that are present in both training and test set ', len(set(train.Artist) & set(test.Artist)))

Number of artists that are present in both training and test set  50


**Merge the ratings with the words and users data frames.**

In [14]:
# Attach the per-(Artist, User) words row and then the per-user profile to every
# rating row. Left joins keep each train/test row even when nothing matches.
train_merged = (
    train
    .merge(words, on=['Artist', 'User'], how='left')
    .merge(users, left_on='User', right_on='RESPID', how='left')
)

test_merged = (
    test
    .merge(words, on=['Artist', 'User'], how='left')
    .merge(users, left_on='User', right_on='RESPID', how='left')
)

In [15]:
# group by user
user_group = train.groupby('User')

# group by artist
artist_group = train.groupby('Artist')

# mean user rating based on the training set
mean_user_ratings = user_group['Rating'].mean().to_dict()

# min user rating based on the training set
min_user_ratings = user_group['Rating'].min().to_dict()

# max user rating based on the training set
max_user_ratings = user_group['Rating'].max().to_dict()

# median user rating based on the training set
median_user_ratings = user_group['Rating'].median().to_dict()


# mean artist rating based on the training set
mean_artist_ratings = artist_group['Rating'].mean().to_dict()

# min artist rating based on the training set
min_artist_ratings = artist_group['Rating'].min().to_dict()

# max artist rating based on the training set
max_artist_ratings = artist_group['Rating'].max().to_dict()

# median artist rating based on the training set
median_artist_ratings = artist_group['Rating'].median().to_dict()

train_merged['mean_user_rating'] = train_merged.User.map(lambda x: mean_user_ratings[x] if x in mean_user_ratings else -999)
test_merged['mean_user_rating'] = test_merged.User.map(lambda x: mean_user_ratings[x] if x in mean_user_ratings else -999)

train_merged['min_user_rating'] = train_merged.User.map(lambda x: min_user_ratings[x] if x in min_user_ratings else -999)
test_merged['min_user_rating'] = test_merged.User.map(lambda x: min_user_ratings[x] if x in min_user_ratings else -999)

train_merged['max_user_rating'] = train_merged.User.map(lambda x: max_user_ratings[x] if x in max_user_ratings else -999)
test_merged['max_user_rating'] = test_merged.User.map(lambda x: max_user_ratings[x] if x in max_user_ratings else -999)

train_merged['median_user_rating'] = train_merged.User.map(lambda x: median_user_ratings[x] if x in median_user_ratings else -999)
test_merged['median_user_rating'] = test_merged.User.map(lambda x: median_user_ratings[x] if x in median_user_ratings else -999)

train_merged['mean_artist_rating'] = train_merged.Artist.map(lambda x: mean_artist_ratings[x] if x in mean_artist_ratings else -999)
test_merged['mean_artist_rating'] = test_merged.Artist.map(lambda x: mean_artist_ratings[x] if x in mean_artist_ratings else -999)

train_merged['min_artist_rating'] = train_merged.Artist.map(lambda x: min_artist_ratings[x] if x in min_artist_ratings else -999)
test_merged['min_artist_rating'] = test_merged.Artist.map(lambda x: min_artist_ratings[x] if x in min_artist_ratings else -999)

train_merged['max_artist_rating'] = train_merged.Artist.map(lambda x: max_artist_ratings[x] if x in max_artist_ratings else -999)
test_merged['max_artist_rating'] = test_merged.Artist.map(lambda x: max_artist_ratings[x] if x in max_artist_ratings else -999)

train_merged['median_artist_rating'] = train_merged.Artist.map(lambda x: median_artist_ratings[x] if x in median_artist_ratings else -999)
test_merged['median_artist_rating'] = test_merged.Artist.map(lambda x: median_artist_ratings[x] if x in median_artist_ratings else -999)

In [16]:
# Rows without a match in the left joins carry NaN; replace every remaining NaN
# with the -999 sentinel.
train_merged = train_merged.fillna(-999)
test_merged = test_merged.fillna(-999)

In [17]:
# Cast the parsed listening-hours columns to integers in both frames.
for frame in (train_merged, test_merged):
    for column in ('LIST_OWN', 'LIST_BACK'):
        frame[column] = frame[column].astype(int)

In [18]:
# Model inputs: every merged column except the identifier columns
# (RESPID, Artist, User) and the target (Rating).
features = train_merged.columns.drop(['RESPID', 'Artist', 'User', 'Rating'])

X = train_merged[features]
y = train_merged.Rating

# The same feature columns, in the same order, for the test set.
final_test = test_merged[features]

**Split the data set.**

In [19]:
# Hold out 20% of the rows for a final check; the fixed random_state keeps the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1212)

In [20]:
# (rows, columns) of the training split and of the hold-out split.
X_train.shape, X_test.shape

((150952, 122), (37738, 122))

**Cross-validation scheme**

In [21]:
def rmse(y_true, y_preds):
    """Return the root mean squared error between ``y_true`` and ``y_preds``."""
    mse = mean_squared_error(y_true, y_preds)
    return np.sqrt(mse)

# Wrap rmse as a scikit-learn scorer. greater_is_better=False marks it as a
# loss, which is why the cross-validation scores printed below are negative.
rmse_scorer = make_scorer(rmse, greater_is_better=False)

In [22]:
# Pipeline: keep the 50 features that score best under the univariate
# f_regression test, then fit an XGBoost regressor with n_estimators=100.
select = SelectKBest(f_regression, k=50)
# Alternative estimator, currently disabled:
# est = RandomForestRegressor()
est = xgb.XGBRegressor(n_estimators=100)
pipeline = Pipeline([('select', select), ('est', est)])
# 3-fold shuffled CV over the training split with a fixed seed.
# NOTE(review): KFold(n, n_folds=...) is the signature of the legacy
# sklearn.cross_validation module imported at the top of the notebook;
# sklearn.model_selection.KFold takes n_splits instead. Confirm against the
# installed scikit-learn version before upgrading.
cv = KFold(len(y_train), n_folds=3, shuffle=True, random_state=1233)

# One score per fold, using rmse_scorer; n_jobs=-1 runs the folds in parallel.
cv_scores = cross_val_score(pipeline, X_train, y_train, scoring=rmse_scorer, cv=cv, n_jobs=-1)

  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)
  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [23]:
# Per-fold scores from rmse_scorer (negated RMSE, so closer to zero is better).
print('Cross Validation scores: ', cv_scores)

Cross Validation scores:  [-12.61690859 -12.54917852 -12.49805519]


In [None]:
# Refit the pipeline on the whole training split and score it on the 20% hold-out.
pipeline.fit(X_train, y_train)
y_preds = pipeline.predict(X_test)
print('RMSE on unseen examples: %f'%(rmse(y_test, y_preds)))

  corr /= row_norms(X.T)
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


RMSE on unseen examples: 12.609792


**Notes**

TODO: add the following (artist, user) pair features:

* Mean (artist, user) rating
* Min (artist, user) rating
* Max (artist, user) rating
* Median (artist, user) rating

In [1]:
# NOTE(review): this cell needs X_train, y_train and X_test from the split cell
# above. The recorded NameError comes from running it in a fresh kernel
# (execution count 1) without re-running the earlier cells.
# NOTE(review): the import belongs in the imports cell at the top of the notebook.
from sklearn.naive_bayes import GaussianNB
# Create a Gaussian Naive Bayes classifier.
# NOTE(review): y_train is the Rating column, so a classifier treats every
# distinct rating value as a separate class; confirm this is intended.
model = GaussianNB()

# Train the model on the training split.
model.fit(X_train, y_train)

# Predicted labels for the hold-out rows.
pred= model.predict(X_test)


NameError: name 'X_train' is not defined

In [2]:

# Percentage of exact matches between the predictions and the true hold-out
# ratings, over the first (up to) 1000 rows.
# The earlier loop compared against `actuallabel`, which no visible cell
# defines; the labels that belong to `pred` (predicted from X_test) are y_test.
n_eval = min(1000, len(pred), len(y_test))

# y_test keeps the shuffled row labels from train_test_split, so it is paired
# with `pred` by position rather than by index label.
actual = list(y_test)[:n_eval]
count = sum(1 for predicted, expected in zip(pred[:n_eval], actual) if predicted == expected)

if n_eval:
    print("accuracy =", (count / n_eval) * 100)
else:
    # Nothing to compare: avoid dividing by zero.
    print("accuracy =", float('nan'))

NameError: name 'pred' is not defined