In [None]:
# Reading the Train and Test Data
import pandas as pd
import matplotlib.pyplot as plt
% matplotlib inline
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')

In [None]:
# Splitting ts_listen into listened_date and listened_time
# The string is split once, at the first space: piece 0 becomes listened_date
# and piece 1 becomes listened_time. `n` is passed by keyword because pandas 2.x
# only accepts the pattern positionally in Series.str.split.
train_data = train_data.join(
    train_data['ts_listen']
    .str.split(' ', n=1, expand=True)
    .rename(columns={0: 'listened_date', 1: 'listened_time'})
)
test_data = test_data.join(
    test_data['ts_listen']
    .str.split(' ', n=1, expand=True)
    .rename(columns={0: 'listened_date', 1: 'listened_time'})
)

In [None]:
# Formatting release date
from datetime import datetime
# Re-render every release_date value from the compact YYYYMMDD form into
# 'YYYY-MM-DD' text, one list per data set.
train_date = [
    datetime.strptime(str(raw), '%Y%m%d').strftime('%Y-%m-%d')
    for raw in train_data['release_date']
]
test_date = [
    datetime.strptime(str(raw), '%Y%m%d').strftime('%Y-%m-%d')
    for raw in test_data['release_date']
]

# The lists are wrapped in a Series (default 0..n-1 index) before assignment,
# so values are attached to the frames by index label.
train_data['song_release'] = pd.Series(train_date)

song_release_date = pd.Series(test_date)
test_data['song_release'] = song_release_date

In [None]:
# Finding difference between release date and listen date
def date_diff(a, b):
    """Return the number of whole days from date ``b`` to date ``a``.

    Both arguments are 'YYYY-MM-DD' strings. The result is negative when
    ``a`` is earlier than ``b``.
    """
    date_format = "%Y-%m-%d"
    end = datetime.strptime(a, date_format)
    start = datetime.strptime(b, date_format)
    elapsed = end - start
    return elapsed.days
    
# Days elapsed between the song's release and the listen, computed row by row
# for both data sets (each frame is updated in place).
for frame in (train_data, test_data):
    frame['song_release_days'] = frame.apply(
        lambda row: date_diff(row['listened_date'], row['song_release']),
        axis=1,
    )

In [None]:
# Extracting hour from listened_time
def get_hour(a):
    """Return the hour (0-23) of an 'HH:MM:SS' time string."""
    parsed = datetime.strptime(a, "%H:%M:%S")
    return parsed.hour
    
# Hour of day of each listen, derived row by row from listened_time for both
# data sets (each frame is updated in place).
for frame in (train_data, test_data):
    frame['listened_hour'] = frame.apply(
        lambda row: get_hour(row['listened_time']),
        axis=1,
    )

In [None]:
# Extracting year from listened_date
def get_year(a):
    """Return the calendar year of a 'YYYY-MM-DD' date string as an int."""
    parsed = datetime.strptime(a, "%Y-%m-%d")
    return parsed.year
    
# Calendar year of each listen, derived row by row from listened_date for both
# data sets (each frame is updated in place).
for frame in (train_data, test_data):
    frame['listened_year'] = frame.apply(
        lambda row: get_year(row['listened_date']),
        axis=1,
    )

In [None]:
# Dropping the fields that are no longer required
# These raw columns have already been turned into derived features above.
# `axis` is passed by keyword: the positional form drop(labels, 1) is not
# accepted by pandas 2.x.
train_data = train_data.drop(['ts_listen','listened_time','release_date','listened_date'], axis=1)
test_data = test_data.drop(['ts_listen','listened_time','release_date','listened_date'], axis=1)

In [None]:
# Allocating User Count
# Number of training rows per user, kept as a one-column frame indexed by
# user_id. size() counts the rows of each group directly, so the grouping
# column does not have to be aggregated against itself.
user_count = train_data.groupby("user_id").size().to_frame("user_id")

def allot(id):
    """Return how many training rows belong to user ``id``.

    Users that do not occur in the training data get 0. The value is a plain
    int; indexing the frame with ``.loc[id]`` would instead hand back a
    one-element Series, which is not usable as a single column value.
    """
    return int(user_count["user_id"].get(id, 0))

# Series.map with a Series looks each user_id up in that Series' index; ids
# with no match come back as NaN and are replaced by 0.
train_data['user_count'] = train_data['user_id'].map(user_count["user_id"]).fillna(0).astype(int)
test_data['user_count'] = test_data['user_id'].map(user_count["user_id"]).fillna(0).astype(int)

In [None]:
# Allocating Unique Genre
# Number of distinct genre_id values per user in the training data, as a
# one-column frame (column 'genre_id') indexed by user_id.
unique_genre = train_data.groupby("user_id").agg({"genre_id": "nunique"})

def allot_genre(id):
    """Return the number of distinct genres user ``id`` has in the training data.

    Users that do not occur in the training data get 0. The value is a plain
    int; indexing the frame with ``.loc[id]`` would instead hand back a
    one-element Series, which is not usable as a single column value.
    """
    return int(unique_genre["genre_id"].get(id, 0))

# Series.map with a Series looks each user_id up in that Series' index; ids
# with no match come back as NaN and are replaced by 0.
train_data['unique_genre'] = train_data['user_id'].map(unique_genre["genre_id"]).fillna(0).astype(int)
test_data['unique_genre'] = test_data['user_id'].map(unique_genre["genre_id"]).fillna(0).astype(int)

In [None]:
# Finding each user's genre preference
# Step 1: count rows per (user_id, genre_id) pair; the count lands in media_id.
genre = (
    train_data[['genre_id', 'user_id', 'media_id']]
    .groupby(['user_id', 'genre_id'])
    .count()
    .reset_index()
)
# Step 2: order the pairs from most to least frequent.
genre_p = genre[['user_id', 'genre_id', 'media_id']].sort_values(
    by='media_id', ascending=False
)
# Step 3: keep the first (most frequent) row of every user.
user_genre_preference = genre_p.groupby(['user_id']).head(n=1)
# Step 4: order by user; reset_index() leaves a positional 0..n-1 index and
# moves the previous index into an 'index' column.
genre_preference_count = (
    user_genre_preference
    .sort_values(by='user_id')
    .reset_index()
)
#print(genre_preference_count)

In [None]:
# Allotting user's genre preference
# genre_preference_count carries a positional 0..n-1 index (it was built with
# reset_index()), so user ids must be matched against its user_id column and
# not against that index. A dict keyed by user_id does exactly that.
_genre_by_user = genre_preference_count.set_index('user_id')['genre_id'].to_dict()

def preferred_genre(id):
    """Return the genre_id stored for user ``id`` in genre_preference_count.

    Users without a row in that table get 0.
    """
    return _genre_by_user.get(id, 0)

train_data['preferred_genre'] = train_data['user_id'].map(preferred_genre)
test_data['preferred_genre'] = test_data['user_id'].map(preferred_genre)

In [None]:
# Finding each user's artist preference
# Step 1: count rows per (user_id, artist_id) pair; the count lands in media_id.
artist = (
    train_data[['artist_id', 'user_id', 'media_id']]
    .groupby(['user_id', 'artist_id'])
    .count()
    .reset_index()
)
# Step 2: order the pairs from most to least frequent.
artist_p = artist[['user_id', 'artist_id', 'media_id']].sort_values(
    by='media_id', ascending=False
)
# Step 3: keep the first (most frequent) row of every user.
user_artist_preference = artist_p.groupby(['user_id']).head(n=1)
# Step 4: order by user; reset_index() leaves a positional 0..n-1 index and
# moves the previous index into an 'index' column.
artist_preference_count = (
    user_artist_preference
    .sort_values(by='user_id')
    .reset_index()
)
#print(artist_preference_count)

In [None]:
# Allotting user's artist preference
# artist_preference_count carries a positional 0..n-1 index (it was built with
# reset_index()), so user ids must be matched against its user_id column and
# not against that index. A dict keyed by user_id does exactly that.
_artist_by_user = artist_preference_count.set_index('user_id')['artist_id'].to_dict()

def preferred_artist(id):
    """Return the artist_id stored for user ``id`` in artist_preference_count.

    Users without a row in that table get 0.
    """
    return _artist_by_user.get(id, 0)

train_data['preferred_artist'] = train_data['user_id'].map(preferred_artist)
test_data['preferred_artist'] = test_data['user_id'].map(preferred_artist)

In [None]:
# Finding each user's platform preference
# Step 1: count rows per (user_id, platform_family) pair; the count lands in media_id.
platform = (
    train_data[['platform_family', 'user_id', 'media_id']]
    .groupby(['user_id', 'platform_family'])
    .count()
    .reset_index()
)
# Step 2: order the pairs from most to least frequent.
platform_p = platform[['user_id', 'platform_family', 'media_id']].sort_values(
    by='media_id', ascending=False
)
# Step 3: keep the first (most frequent) row of every user.
user_platform_preference = platform_p.groupby(['user_id']).head(n=1)
# Step 4: order by user; reset_index() leaves a positional 0..n-1 index and
# moves the previous index into an 'index' column.
platform_preference_count = (
    user_platform_preference
    .sort_values(by='user_id')
    .reset_index()
)

In [None]:
# Allotting user's platform preference
# platform_preference_count carries a positional 0..n-1 index (it was built
# with reset_index()), so user ids must be matched against its user_id column
# and not against that index. A dict keyed by user_id does exactly that.
_platform_by_user = platform_preference_count.set_index('user_id')['platform_family'].to_dict()

def preferred_platform(id):
    """Return the platform_family stored for user ``id`` in platform_preference_count.

    Users without a row in that table get 0.
    """
    return _platform_by_user.get(id, 0)

train_data['preferred_platform'] = train_data['user_id'].map(preferred_platform)
test_data['preferred_platform'] = test_data['user_id'].map(preferred_platform)

In [None]:
# Per-user total of is_listened over the training rows.
# The result is a one-column frame (column 'is_listened') indexed by user_id.
listened_count = (
    train_data
    .groupby("user_id")
    .agg({"is_listened": pd.Series.sum})
)

In [None]:
# Allotting total songs listened by the user
def l_count(id):
    """Return the summed is_listened value of user ``id`` from listened_count.

    Users without a row in listened_count get 0. The value is a plain int;
    indexing the frame with ``.loc[id]`` would instead hand back a one-element
    Series, which is not usable as a single column value.
    """
    return int(listened_count['is_listened'].get(id, 0))

# Series.map with a Series looks each user_id up in that Series' index; ids
# with no match come back as NaN and are replaced by 0.
# NOTE(review): listened_count is summed from the training labels and then
# attached to those same training rows, so this feature contains the target
# of the row it describes - confirm that this is intended.
train_data['listened_count'] = train_data['user_id'].map(listened_count['is_listened']).fillna(0).astype(int)
test_data['listened_count'] = test_data['user_id'].map(listened_count['is_listened']).fillna(0).astype(int)
# The former third assignment targeted a frame named `cv` that is not defined
# anywhere in this notebook and ended in a stray character (a syntax error),
# so it has been removed.

In [None]:
# Preparing fields
# song_release holds 'YYYY-MM-DD' text, which the estimator cannot consume;
# the same information is already present numerically as song_release_days.
_text_columns = ['song_release']

# `axis` is passed by keyword: the positional form drop(labels, 1) is not
# accepted by pandas 2.x.
X_train = train_data.drop(['is_listened'] + _text_columns, axis=1)
Y_train = train_data['is_listened']
# drop() returns a new frame, so later additions to X_test do not touch test_data.
X_test = test_data.drop(_text_columns, axis=1)

In [None]:
# Applying model
from sklearn.ensemble import RandomForestClassifier
# random_state fixes the forest's internal randomness so that re-running the
# notebook reproduces the same model and predictions.
model = RandomForestClassifier(n_estimators=100, max_depth=30, random_state=0)
model.fit(X_train,Y_train)

# Predict on exactly the columns the model was fitted on, in the same order;
# any additional column present only in X_test is left out of the model input.
# NOTE: despite its name, log_prob holds the class probabilities returned by
# predict_proba (one column per class), not log-probabilities.
log_prob = model.predict_proba(X_test[X_train.columns])
print(log_prob)

In [None]:
# Columns written to the output file, in this order.
header = ["ID", "is_listened"]
# Column 1 of log_prob is stored as the is_listened value of every test row.
X_test['is_listened'] = log_prob[:,1]
# Only the columns named in `header` are exported; the frame's index is
# written as well (to_csv default).
X_test.to_csv('output.csv', columns = header)