In [152]:
import numpy as np
import pandas as pd
from collections import Counter
import  matplotlib.pyplot as plt

In [109]:
# Load the India trending-video export, parsing both timestamp columns as datetimes.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where a strict version of that inference is already the default.
df = pd.read_csv(
    '../input/youtube-trending-video-dataset/IN_youtube_trending_data.csv',
    parse_dates=['publishedAt', 'trending_date'],
)
# Keep only the text before the first '|' of each title. The `.str[0]` accessor
# propagates a missing title as NaN, whereas indexing inside a lambda raises
# TypeError on a NaN entry. n=1 stops splitting after the first separator.
df['title'] = df['title'].str.split('|', n=1).str[0]
df.shape

In [110]:
# Preview the first three rows of the loaded frame.
df.head(3)

In [111]:
# Show the dtype pandas assigned to each column of df.
df.dtypes

In [112]:
# Order rows chronologically by trending date. A stable sort (mergesort) keeps rows
# that share a trending date in their existing relative order; the default quicksort
# gives no such guarantee, which would let `keep='last'` below retain an arbitrary
# row among duplicates.
df = df.sort_values('trending_date', kind='mergesort')
# Collapse repeated (trending_date, title) pairs to a single row -- the last one in
# the ordering above -- and renumber the index once at the end.
df = df.drop_duplicates(['trending_date', 'title'], keep='last').reset_index(drop=True)

In [113]:
# Lower-case keywords looked up inside each video's tag string; the cell that
# builds `dfs` creates one filtered frame per keyword.
tags = [
    'tamil',
    'kannada',
    'malayalam',
    'telugu',
    'hindi',
    'teaser',
    'trailer',
]

In [114]:
# Build one filtered frame per keyword: the rows whose tag string mentions it.
# The lower-cased tag column does not depend on the keyword, so compute it once
# instead of once per loop iteration.
tags_lower = df['tags'].str.lower()
dfs = {}
for tag in tags:
    # regex=False: the keyword is matched literally as a substring.
    # na=False: a row with a missing tag string counts as "no match" rather than
    # putting NaN into the boolean mask (which cannot be used for row selection).
    mask = tags_lower.str.contains(tag, regex=False, na=False)
    dfs[tag] = df.loc[mask].reset_index(drop=True)

In [115]:
# Number of rows in each per-keyword frame, keyed by keyword.
temp = {keyword: len(frame) for keyword, frame in dfs.items()}
# One-column frame ('count') indexed by keyword, drawn as a single pie.
counts = pd.Series(temp, name='count').rename_axis('language').to_frame()
counts.plot.pie(y='count', figsize=(10, 10), title='distribution of trending videos in different categories')

## Channels with most engagement

In [116]:
# Rows per channel, most frequent first; the 15 largest counts become one pie.
channel_counts = df['channelTitle'].value_counts()
channel_counts.iloc[:15].plot.pie(figsize=(10, 10), title='channels with most engagement')

### Channels with the most engagement in Tamil

In [117]:
# Same view as the overall channel pie, restricted to the frame of rows whose
# tag string contains 'tamil': rows per channel, 15 largest counts.
tamil_channel_counts = dfs['tamil']['channelTitle'].value_counts()
tamil_channel_counts.iloc[:15].plot.pie(figsize=(10, 10), title='channels with most engagement in tamil')

## Trending videos based on view count

In [118]:
# Rows of the 'tamil' frame whose channel is 'Vijay Television', reduced to the
# title and the two counters.
tamil_rows = dfs['tamil']
is_vijay_tv = tamil_rows['channelTitle'] == 'Vijay Television'
temp = tamil_rows.loc[is_vijay_tv, ['title', 'likes', 'view_count']]
# Sum likes and views per title, rank by likes then views (both descending) and
# draw the top 20 titles as one pie per counter.
# NOTE(review): if likes/view_count are running totals recorded once per trending
# day, summing them over days favours videos that trended longer -- confirm intent.
per_title = temp.groupby('title').sum()
ranked = per_title.sort_values(['likes', 'view_count'], ascending=False)
ranked.head(20).plot.pie(subplots=True, legend=False, figsize=(25, 10), title='Vijay tv analysis')

In [128]:
# Rank trailer-tagged videos by view count. After the descending sort, the first
# row met for each title is its highest-view row, so keep='first' retains every
# title's peak. (keep='last' would retain each title's lowest-view row and the
# "top 15" would be ranked by those minimums instead.)
trailer_peaks = (
    dfs['trailer'][['title', 'view_count', 'likes']]
    .sort_values('view_count', ascending=False)
    .drop_duplicates('title', keep='first')
    .head(15)
    .set_index('title')
)
# One pie per column (view_count and likes), wedges labelled by title.
trailer_peaks.plot.pie(subplots=True, figsize=(25, 20), legend=False, title='top trailers by view count')

In [129]:
# Rank teaser-tagged videos by view count. After the descending sort, the first
# row met for each title is its highest-view row, so keep='first' retains every
# title's peak. (keep='last' would retain each title's lowest-view row and the
# "top 15" would be ranked by those minimums instead.)
teaser_peaks = (
    dfs['teaser'][['title', 'view_count', 'likes']]
    .sort_values('view_count', ascending=False)
    .drop_duplicates('title', keep='first')
    .head(15)
    .set_index('title')
)
# One pie per column (view_count and likes), wedges labelled by title.
teaser_peaks.plot.pie(subplots=True, figsize=(25, 20), legend=False, title='top teasers by view count')

In [76]:
# Total likes, views and comments per channel. Exact repeats of the same
# (channel, likes, views, comments) row are dropped first so they are summed once.
temp = (
    df[['channelTitle', 'likes', 'view_count', 'comment_count']]
    .drop_duplicates(keep='last')
    .groupby('channelTitle')
    .sum()
    .reset_index()
)
# Treat a zero view total as missing: dividing by it would give inf (or NaN for
# 0/0), and an inf ratio would rank first when sorting impact_ratio descending.
# With NaN instead, such channels sort last in either direction.
total_views = temp['view_count'].replace(0, np.nan)
like_ratio = temp['likes'] / total_views
comment_ratio = temp['comment_count'] / total_views
# impact_ratio is whichever per-view ratio is larger for the channel.
temp['impact_ratio'] = np.where(like_ratio > comment_ratio, like_ratio, comment_ratio)

In [79]:
# Ten channels with the smallest impact_ratio, index renumbered from 0 for display.
lowest_impact = temp.sort_values(by='impact_ratio', ascending=True).iloc[:10]
lowest_impact.reset_index(drop=True)

In [78]:
# Twenty channels with the largest impact_ratio, index renumbered from 0 for display.
highest_impact = temp.sort_values(by='impact_ratio', ascending=False).iloc[:20]
highest_impact.reset_index(drop=True)

In [149]:
# The ten video ids that occur in the most rows of df.
most_listed_ids = df['video_id'].value_counts().index[:10]
# Every row belonging to those videos, reduced to the title and the two counters.
in_top_ids = df['video_id'].isin(most_listed_ids)
temp = df.loc[in_top_ids, ['title', 'view_count', 'likes']]

In [161]:
# 5x2 grid with one panel per distinct title in temp; zip stops at whichever runs
# out first, the titles or the ten panels.
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(25, 20))
titles = temp['title'].unique()
for ax, t in zip(axes.flat, titles):
    rows = temp[temp['title'] == t]
    # view_count and likes drawn as two lines against the frame's row index.
    rows[['view_count', 'likes']].plot.line(ax=ax)
    ax.set_title(t)
    # Hide the axis lines, ticks and labels; only the lines and title remain.
    ax.set_axis_off()