In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the train and test splits from the relative `../input` directory.
train_df, test_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../input/predict-youtube-videos-likes/train.parquet',
        '../input/predict-youtube-videos-likes/test.parquet',
    )
)

In [3]:
# Preview the first rows of the training frame to see which columns are available.
train_df.head()

In [4]:
# Number of missing values in each column of the training frame.
train_df.isna().sum()

In [5]:
# Number of missing values in each column of the test frame.
test_df.isna().sum()

## Histogram of categoryIds

In [6]:
# Histogram of video counts per categoryId in the training set.
# The bin edges sit on half-integers so every bar is centred on exactly one
# integer id and lines up with the integer x-ticks. The id range is taken from
# train and test together so the same edges apply to both frames.
max_category_id = int(max(train_df.categoryId.max(), test_df.categoryId.max()))
category_bin_edges = np.arange(max_category_id + 2) - 0.5
n, bins, patches = plt.hist(train_df.categoryId, bins=category_bin_edges)
plt.xticks(range(max_category_id + 1), rotation=90)
plt.title('categoryId distribution (train)')
plt.xlabel('categoryId')
plt.ylabel('number of videos')
plt.show()

In [7]:
# Histogram of video counts per categoryId in the test set.
# The bin edges sit on half-integers so every bar is centred on exactly one
# integer id and lines up with the integer x-ticks. The id range is taken from
# train and test together so the same edges apply to both frames.
max_category_id = int(max(train_df.categoryId.max(), test_df.categoryId.max()))
category_bin_edges = np.arange(max_category_id + 2) - 0.5
n, bins, patches = plt.hist(test_df.categoryId, bins=category_bin_edges)
plt.xticks(range(max_category_id + 1), rotation=90)
plt.title('categoryId distribution (test)')
plt.xlabel('categoryId')
plt.ylabel('number of videos')
plt.show()

In [8]:
# Print every category id that occurs in the test set but never in the training set.
# The training ids are collected once up front instead of being recomputed
# for every test id.
train_category_ids = set(train_df.categoryId.unique())
for ids in test_df.categoryId.unique():
    if ids not in train_category_ids:
        print(ids)

In [9]:
# Print every category id that occurs in the training set but never in the test set.
# The test ids are collected once up front instead of being recomputed
# for every training id.
test_category_ids = set(test_df.categoryId.unique())
for ids in train_df.categoryId.unique():
    if ids not in test_category_ids:
        print(ids)

In [10]:
# Report which percentage of the training rows falls into each category,
# rounded to two decimals, in order of first appearance of the category id.
n_train_rows = train_df.shape[0]
for ids in train_df.categoryId.unique():
    rows_in_category = (train_df['categoryId'] == ids).sum()
    value = float("{:.2f}".format((rows_in_category * 100) / n_train_rows))
    print(ids , 'form ' , value , '% of train_df')

---
   *  All category ids in test_df are available in train_df
   * There is no sample from category ids 15 and 29 in test_df, but there are a few in train_df
   * train dataset is quite imbalanced
   
   ---

### Avg target in each categoryId

In [11]:
# Mean target per categoryId, computed only from training rows with ratings enabled.
# The target column is selected explicitly before aggregating; the previous
# `.mean('target')` passed the string as the positional `numeric_only` flag.
rated_train = train_df[train_df.ratings_disabled == 0]
train_averages_category = rated_train.groupby('categoryId')['target'].mean().to_dict()

# Attach the category mean to every training row. `map` yields NaN (instead of
# raising KeyError) for a category that has no ratings-enabled rows.
train_df['avg_in_category'] = train_df.categoryId.map(train_averages_category)

sns.catplot(x="categoryId", y="target", data=train_df, kind='box')

In [12]:
# Mean target per category (ratings-enabled rows only), highest first.
# The target column is selected explicitly rather than passing 'target' to
# `mean()`, where it would be interpreted as the `numeric_only` flag.
train_df[train_df.ratings_disabled == 0].groupby('categoryId')['target'].mean().sort_values(ascending=False)

---
CategoryId 29, 23, 10 with high target values

CategoryId 25,17 with low target values

---

In [13]:
# Test rows receive the category mean that was computed on the training data.
# A category id missing from `train_averages_category` raises KeyError.
test_df['avg_in_category'] = [train_averages_category[category_id] for category_id in test_df.categoryId]

### Channel Titles

In [14]:
# Same training-set category mean lookup as `avg_in_category`, stored under a
# second column name.
# NOTE(review): `target_avg_in_category` is not read by any later cell visible
# in this notebook — confirm whether this duplicate column is still needed.
test_df['target_avg_in_category'] = [train_averages_category[category_id] for category_id in test_df.categoryId]

In [15]:
# Channel titles that appear in both the test and the training set.
test_channel_titles = test_df['channelTitle']
train_channel_titles = train_df['channelTitle']
unique_test_titles = set(test_channel_titles)
unique_train_titles = set(train_channel_titles)
title_intersection = list(unique_test_titles & unique_train_titles)
len(title_intersection)

693 channels are common in both training and test set

---

In [16]:
# Mean target per channel, computed only from training rows with ratings enabled.
# The target column is selected explicitly; the previous `.mean('target')`
# passed the string as the positional `numeric_only` flag.
train_averages_channel = (
    train_df[train_df.ratings_disabled == 0]
    .groupby('channelTitle')['target']
    .mean()
    .to_dict()
)

# Training rows: channels without any ratings-enabled video get 0.
# NOTE(review): each training row's own target contributes to its channel mean,
# so this feature is computed differently for train and test rows.
train_df['target_average_channel'] = [train_averages_channel.get(a, 0) for a in train_df.channelTitle]

# Test rows: use the training channel mean when the channel was seen in training;
# otherwise fall back to the training mean of the video's category (not 0).
test_df['target_average_channel'] = [
    train_averages_channel[a] if a in train_averages_channel else train_averages_category[b]
    for a, b in zip(test_df.channelTitle, test_df.categoryId)
]

In [17]:
# The ten channels with the lowest mean target (ratings-enabled rows only).
# The target column is selected explicitly rather than passing 'target' to
# `mean()`, where it would be interpreted as the `numeric_only` flag.
train_df[train_df.ratings_disabled == 0].groupby('channelTitle')['target'].mean().sort_values(ascending=True).head(10)

In [18]:
# Index labels of the ratings-enabled training rows, ordered from the highest
# to the lowest target. Despite its name, `sorted_df` is an Index, not a frame.
sorted_df = train_df[train_df.ratings_disabled == 0]['target'].sort_values(ascending=False).index

# `sorted_df` contains labels, so the lookup must be label-based (`.loc`).
# Positional `.iloc` only gives the same rows while the index happens to be
# the default 0..n-1 range.
titles_for_top_targets = train_df.loc[sorted_df, 'title']
titles_for_top_targets.head(15)

It seems that the videos with the highest targets tend to have Korean words in their title or description.

In [19]:
# Installs the koNLP package into the environment via a shell command.
# NOTE(review): the only import of koNLP in the following cell is commented out,
# so this install looks unnecessary — confirm before removing. If it is kept,
# `%pip` is the form that targets the running kernel's environment.
!pip install koNLP

In [20]:
# from koNLP import is_hangul


#Below is the is_hangul implementation using only regex from the notebook :
import regex
def is_hangul(value):
    """Tell whether `value` contains at least one Hangul character.

    Args:
        value: Text to scan; handed unchanged to `regex.search`.

    Returns:
        True if the Hangul script property matches anywhere in `value`,
        otherwise False.
    """
    hangul_match = regex.search(r'\p{IsHangul}', value)
    return hangul_match is not None

# Flag rows whose title or description contains Hangul (1) or not (0).
# Descriptions are passed through str() first, so non-string values are
# scanned as their string form.
for frame in (train_df, test_df):
    hangul_in_title = frame.title.apply(is_hangul).astype(int)
    hangul_in_description = frame.description.apply(lambda text: is_hangul(str(text))).astype(int)
    frame['isKorean'] = hangul_in_title | hangul_in_description


In [21]:
from collections import Counter

# Titles of the 1000 ratings-enabled training videos with the highest target.
# `sorted_df` holds index labels, so the rows are looked up with `.loc`;
# positional `.iloc` is only equivalent for a default 0..n-1 index.
titles_in_top_targets = train_df.loc[sorted_df, 'title'].head(1000).to_list()

# Whitespace-separated words of those titles and how often each one occurs.
words_in_top_targets = [word for title in titles_in_top_targets for word in title.split()]
count_words_in_top_targets = Counter(words_in_top_targets)

# Word -> count, ordered from the most to the least frequent word.
top_words_top_targets = dict(count_words_in_top_targets.most_common())

In [22]:
from itertools import islice

def take(n, iterable):
    """Collect at most the first `n` items of `iterable` into a list.

    Args:
        n: Maximum number of items to collect.
        iterable: Any iterable; a dict contributes its keys.

    Returns:
        A list with the first `n` items, or fewer if `iterable` is shorter.
    """
    leading_items = islice(iterable, n)
    return [item for item in leading_items]

# Show the 30 most frequent words: iterating the dict yields its keys (the words),
# and the dict was built in descending order of count.
take(30 , top_words_top_targets)

The most frequent non-stop-words in the titles of the top targets include: __Trailer, Teaser, Official, BTS (also written in Korean), and other Korean words__

In [23]:
# 0/1 flag per keyword: 1 when the lower-cased title contains the keyword as a
# substring. The same flags are created on the training and the test frame.
title_keyword_flags = {
    'isTeaser': 'teaser',
    'isBTS': 'bts',
    'isTrailer': 'trailer',
    'isOfficial': 'official',
}
for frame in (train_df, test_df):
    for flag_column, keyword in title_keyword_flags.items():
        frame[flag_column] = frame.title.apply(lambda x: keyword in x.lower()).astype(int)


In [24]:
# Weekday name -> integer in the Monday=0 ... Sunday=6 convention.
# This mapping is not referenced by the rest of this cell.
days_of_week={'Monday':0, 'Tuesday':1, 'Wednesday':2, 'Thursday':3, 'Friday':4, 'Saturday':5, 'Sunday':6}

for frame in (train_df, test_df):
    # Weekday on which the video was published.
    frame['day_published'] = frame['publishedAt'].dt.dayofweek

    # Parse the trending date as a UTC timestamp before taking its weekday.
    frame['trending_date'] = pd.to_datetime(frame['trending_date'], utc=True)
    frame['day_trending'] = frame['trending_date'].dt.dayofweek

In [25]:
import seaborn as sns
# Number of training videos published on each weekday (0 = Monday ... 6 = Sunday).
# The column is passed as `x=` explicitly: seaborn 0.12+ no longer treats a
# positional data vector as the x variable.
sns.countplot(x=train_df['day_published'])

Friday (4) and Sunday (6) have the highest number of published videos.

In [26]:
# Number of training videos trending on each weekday (0 = Monday ... 6 = Sunday).
# The column is passed as `x=` explicitly: seaborn 0.12+ no longer treats a
# positional data vector as the x variable.
sns.countplot(x=train_df['day_trending'])

In [27]:
# Distribution of the target per trending weekday, with outliers hidden.
# x and y are passed by keyword: seaborn 0.12+ rejects positional x/y vectors.
sns.boxplot(x=train_df['day_trending'], y=train_df['target'], showfliers=False)

In [28]:
# Line plot of the mean target for each trending weekday.
train_df.groupby('day_trending')['target'].mean().plot()

It seems videos trending on Friday have a higher average target and those trending on Thursday have a lower average target. The difference is small, though, so I don't expect it to have much of an impact.

In [29]:
# 0/1 flags for videos that trended on a Friday (weekday 4) or a Thursday (weekday 3).
for frame in (train_df, test_df):
    frame['is_Friday_Trending'] = frame['day_trending'].eq(4).astype('int64')
    frame['is_Thursday_Trending'] = frame['day_trending'].eq(3).astype('int64')

In [30]:
# Distribution of the target per publishing weekday, with outliers hidden.
# x and y are passed by keyword: seaborn 0.12+ rejects positional x/y vectors.
sns.boxplot(x=train_df['day_published'], y=train_df['target'], showfliers=False)

In [31]:
# Line plot of the mean target for each publishing weekday.
train_df.groupby('day_published')['target'].mean().plot()

It seems videos published on Friday have a higher average target and those published on Sunday have a lower average target. The difference is small, though, so I don't expect it to have much of an impact.

In [32]:
# 0/1 flags for videos that were published on a Friday (weekday 4) or a Sunday (weekday 6).
for frame in (train_df, test_df):
    frame['is_Friday_published'] = frame['day_published'].eq(4).astype('int64')
    frame['is_Sunday_published'] = frame['day_published'].eq(6).astype('int64')

Let's check the effect of comments being disabled:

In [33]:
# Store the comments_disabled flag as an integer in BOTH frames. The column is
# later used as a model feature, so train and test should carry the same dtype.
train_df['comments_disabled'] = train_df['comments_disabled'].astype(int)
test_df['comments_disabled'] = test_df['comments_disabled'].astype(int)

# Mean target for videos with comments enabled (0) versus disabled (1).
train_df.groupby('comments_disabled')['target'].mean()

Significant difference in mean of target! Seems those with comments enabled might end up with higher targets!

In [34]:
# Mean target for each value of the has_thumbnail column.
train_df.groupby('has_thumbnail')['target'].mean()

Whether or not a video has a thumbnail doesn't seem to have much effect.

Tags!
---

In [35]:
# Split every '|'-separated tag string of the training set into single tags
# and count how often each tag occurs.
all_tags = train_df['tags'].to_list()
words_tags = []
for tag_string in all_tags:
    words_tags.extend(tag_string.split('|'))
count_tags = Counter(words_tags)

# Keep only the tags that occur at least 150 times.
top_tags = {}
for tag_name, occurrences in count_tags.items():
    if occurrences >= 150:
        top_tags[tag_name] = occurrences

def get_mean_target_for_tag(tag):
    """Return the mean `target` of the training rows whose tags include `tag`.

    A row counts as a match when `tag` equals one of the '|'-separated entries
    of its `tags` string (exact match, not substring).

    The rows are selected with a boolean mask, so the global `train_df` is left
    untouched; no temporary helper column is added or dropped.

    Args:
        tag: The tag to look for.

    Returns:
        The mean target of the matching rows; NaN when no row carries the tag.
    """
    has_tag = train_df.tags.apply(lambda x: tag in x.split('|'))
    return train_df.loc[has_tag, 'target'].mean()

# Mean target for every frequent tag. Each value is a one-element list, which
# is the shape the later `target_means[tag][0]` lookups read.
target_means = {tag: [get_mean_target_for_tag(tag)] for tag in top_tags}


In [36]:
def get_target_mean_for_all(tags):
    """Average the per-tag mean targets over all tags of one video.

    Args:
        tags: A '|'-separated tag string.

    Returns:
        The arithmetic mean of `target_means[tag][0]` over the tags in `tags`.
        A tag without an entry in `target_means` contributes the value stored
        for the '[None]' key instead.

    Raises:
        KeyError: If an unknown tag is met and `target_means` has no '[None]' entry.
    """
    tag_list = tags.split('|')
    running_total = 0
    for tag in tag_list:
        lookup_key = tag if tag in target_means else '[None]'
        running_total += target_means[lookup_key][0]
    return running_total / len(tag_list)
# Tag-based target average for every row of both frames; the per-tag means
# always come from `target_means`.
for frame in (train_df, test_df):
    frame['avg_target_of_tags'] = frame.tags.apply(get_target_mean_for_all)

In [37]:
# Whole calendar days between the publishing date and the trending date.
# Both timestamps are reduced to their wall-clock date (time zone stripped,
# time of day set to midnight) and subtracted as datetimes. This avoids going
# through Python `date` objects, which makes the subtraction an object-dtype
# operation.
for frame in (train_df, test_df):
    trending_day = frame['trending_date'].dt.tz_localize(None).dt.normalize()
    published_day = frame['publishedAt'].dt.tz_localize(None).dt.normalize()
    frame['days_old'] = (trending_day - published_day).dt.days



# Time between publishing and trending, as an integer number of seconds.
for frame in (train_df, test_df):
    time_until_trending = frame['trending_date'] - frame['publishedAt']
    frame['seconds_old'] = time_until_trending.dt.total_seconds().astype('int')

In [38]:
# 0/1 duration flags: `short_video` for durations of at most 180 seconds,
# `long_video` for durations of at least 70000 seconds.
for frame in (train_df, test_df):
    frame['short_video'] = (frame['duration_seconds'] <= 180).astype('int64')
    frame['long_video'] = (frame['duration_seconds'] >= 70000).astype('int64')

In [39]:
# Columns fed to the models. The title-keyword flags (isTeaser, isBTS,
# isOfficial, isTrailer) are not part of this list.
feature_columns = [
    'target_average_channel', 'avg_target_of_tags', 'avg_in_category', 'isKorean',
    'day_published', 'day_trending', 'comments_disabled', 'has_thumbnail',
    'days_old', 'seconds_old', 'is_Friday_Trending', 'is_Thursday_Trending',
    'is_Friday_published', 'is_Sunday_published',
    'duration_seconds', 'short_video', 'long_video',
]

# Only rows with comments enabled are used for training.
# NOTE(review): this filter makes the `comments_disabled` feature constant (0)
# in the training data, while the other cells filter on `ratings_disabled` —
# confirm that `comments_disabled` is the intended column here.
comments_enabled = train_df['comments_disabled'] == 0
features = train_df.loc[comments_enabled, feature_columns]
features.shape

In [40]:
# Target values of the rows with comments enabled.
targets = train_df.loc[train_df['comments_disabled'] == 0, 'target']

In [41]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    targets,
    test_size=0.20,
    random_state=42,
)


In [42]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error,mean_absolute_error


# From here on `xgb` is the regressor instance, not the xgboost module; the
# later cells rely on that name.
# Early stopping is configured on the estimator: `fit()` no longer accepts
# `early_stopping_rounds` in current xgboost releases (needs xgboost >= 1.6).
xgb = xgb.XGBRegressor(early_stopping_rounds=25)

# Later cells add a 'predictions' column to X_val. Dropping it here (when
# present) keeps this cell re-runnable with the original feature set.
val_features = X_val.drop(columns='predictions', errors='ignore')

# The last entry of eval_set (the validation split) drives early stopping.
xgb.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (val_features, y_val)],
    verbose=False,
)
predictions = xgb.predict(val_features)


In [43]:
from xgboost import plot_importance
# Plot the feature importances of the fitted XGBoost model
# (`xgb` is the regressor instance at this point, not the module).
plot_importance(xgb)

In [44]:
from sklearn.metrics import mean_squared_error,mean_absolute_error
# Keep the XGBoost validation predictions next to the validation features and
# report their mean absolute error against the true targets.
X_val['predictions'] = predictions
xgb_validation_mae = mean_absolute_error(y_val, X_val['predictions'])
xgb_validation_mae

In [45]:
import lightgbm as lgb
# From here on `lgb` is the fitted regressor instance, not the lightgbm module.
lgb = lgb.LGBMRegressor(n_estimators=1000, boosting_type='dart', learning_rate=0.2)
lgb.fit(X_train, y_train)

# Predict on the feature columns only. The 'predictions' column is dropped when
# present, so this cell also works if X_val does not carry that column yet.
val_features = X_val.drop(columns='predictions', errors='ignore')
X_val['predictions'] = lgb.predict(val_features)
mean_absolute_error(y_val, X_val['predictions'])

In [46]:
# Predict the test set with the XGBoost model, using the same feature columns
# in the same order as the training features.
test_feature_columns = [
    'target_average_channel', 'avg_target_of_tags', 'avg_in_category', 'isKorean',
    'day_published', 'day_trending', 'comments_disabled', 'has_thumbnail',
    'days_old', 'seconds_old', 'is_Friday_Trending', 'is_Thursday_Trending',
    'is_Friday_published', 'is_Sunday_published',
    'duration_seconds', 'short_video', 'long_video',
]
test_predictions = xgb.predict(test_df[test_feature_columns])

In [47]:
# Store the model predictions, then force the prediction to 0 for every video
# whose ratings are disabled.
test_df['target'] = test_predictions
test_df['target'] = [
    predicted if ratings_off == 0 else 0
    for predicted, ratings_off in zip(test_df['target'], test_df['ratings_disabled'])
]

# Write the id/target pairs as the submission file.
output = test_df[['id', 'target']]
output.to_csv('submission.csv', index=False)

