In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
# Notebook-wide pandas display settings: no cap on displayed columns or rows,
# never wrap a wide frame onto several lines, and print floats with five decimals.
display_options = {
    'display.max_columns': None,
    'display.max_rows': None,
    'display.expand_frame_repr': False,
    'display.float_format': lambda value: '%.5f' % value,
}
for option_name, option_value in display_options.items():
    pd.set_option(option_name, option_value)

In [3]:
# Read the US trending-videos CSV into the working frame and preview it.
us_videos_csv = "/kaggle/input/youtube-new/USvideos.csv"
df = pd.read_csv(us_videos_csv)
df.head()

In [4]:
# Processing the data
# trending_date is parsed as two-digit year, then day, then month, dot-separated.
trending_date_format = '%y.%d.%m'
df['trending_date'] = pd.to_datetime(df['trending_date'], format=trending_date_format)
df['trending_date'].head()

In [5]:
# publish_time is parsed as an ISO-like timestamp with fractional seconds;
# the trailing "Z" in the format is matched as a literal character.
publish_time_format = '%Y-%m-%dT%H:%M:%S.%fZ'
df['publish_time'] = pd.to_datetime(df['publish_time'], format=publish_time_format)
df['publish_time'].head()

In [6]:
# Inspect dtypes and non-null counts now that both date columns have been parsed.
df.info()

In [7]:
# Split the parsed publish timestamp into a calendar-date column (inserted at
# position 4) and a time-of-day column that replaces `publish_time`.
#
# The guard makes the cell safe to re-run: after the first run `publish_time`
# holds time objects (so `.dt` would fail) and `publish_date` already exists
# (so `insert` would raise), hence a second pass must be a no-op.
if 'publish_date' not in df.columns:
    df.insert(4, 'publish_date', df['publish_time'].dt.date)
    df['publish_time'] = df['publish_time'].dt.time

In [8]:
# Preview the separated publish date and time-of-day columns.
df[['publish_date', 'publish_time']].head()

In [9]:
# Processing data types
# Cast all count-like columns to integers in a single vectorised step.
type_int_list = ['views', 'likes', 'dislikes', 'comment_count']
df[type_int_list] = df[type_int_list].astype(int)

In [10]:
# Identifier columns are treated as text rather than numbers.
type_str_list = ['category_id']
df[type_str_list] = df[type_str_list].astype(str)

In [11]:
# Re-check dtypes after the integer and string casts.
df.info()

In [12]:
# Importing the category JSON file and building the id -> title lookup
import json

# Use the same absolute input location as the CSV load so the cell does not
# depend on the kernel's working directory; JSON is read explicitly as UTF-8.
with open('/kaggle/input/youtube-new/US_category_id.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Dictionary that maps `category_id` to `category` (the snippet title).
# NOTE(review): `category_id` is cast to str before being mapped, so the ids in
# this file are presumably strings as well — confirm against the JSON.
id_to_category = {item['id']: item['snippet']['title'] for item in data['items']}
id_to_category

In [13]:
# Translate each category id into its title. On the first run the column is
# inserted at position 4; on a re-run it is refreshed in place, because
# `DataFrame.insert` raises ValueError when the column already exists.
category_titles = df['category_id'].map(id_to_category)
if 'category' in df.columns:
    df['category'] = category_titles
else:
    df.insert(4, 'category', category_titles)
df[['category_id', 'category']].head()

In [14]:
# Preview the frame with the new category column in place.
df.head()

In [15]:
# Correlation analysis between variables
# Pairwise correlation of the four engagement metrics.
special_columns = ['views', 'likes', 'dislikes', 'comment_count']
engagement_metrics = df.loc[:, special_columns]
corr_matrix = engagement_metrics.corr()
corr_matrix

In [16]:
import matplotlib
import matplotlib.pyplot as plt
from matplotlib import cm

# Heatmap of the correlation matrix: one coloured cell per pair of metrics.
fig, ax = plt.subplots()
heatmap = ax.imshow(corr_matrix, interpolation='nearest', cmap=cm.coolwarm)

# Colorbar on the side, ticked only at the smallest and largest correlation.
cbar_min = corr_matrix.min().min()
cbar_max = corr_matrix.max().max()
cbar = fig.colorbar(heatmap, ticks=[cbar_min, cbar_max])

# Axis labels. `set_*ticklabels` only assigns text to whatever tick positions
# currently exist, so the positions are pinned to the cell centres 0..n-1 first.
# Padding the label list with empty strings to match the automatic locator is
# fragile: the labels shift or vanish whenever the locator picks other ticks.
labels = list(special_columns)
tick_positions = list(range(len(labels)))
ax.set_xticks(tick_positions)
ax.set_yticks(tick_positions)
ax.set_yticklabels(labels, minor=False)
ax.set_xticklabels(labels, minor=False)

plt.show()

In [17]:
# Sorting and Measurements of Video Title and Category
# Create a reduced dataset: identifiers plus the four engagement metrics.
summary_columns = [
    "video_id",
    "title",
    "category",
    "views",
    "likes",
    "dislikes",
    "comment_count",
]
df_new = df[summary_columns]

In [19]:
# Preview the reduced frame.
df_new.head()

In [20]:
# Summary statistics, transposed so that each described column becomes a row.
df_new.describe().T

In [21]:
# Same summary with the 5th/25th/50th/75th/95th percentiles, rounded to one decimal.
df_new.describe(percentiles=[.05,.25,.5,.75,.95]).round(1)

In [22]:
# Sorting by views: the 20 rows with the highest view counts.
# Rows are not de-duplicated here, so the same title may appear more than once.
df_new.sort_values("views",ascending=False).head(20)

In [23]:
# Number of rows per title, showing the ten most frequent titles.
df_new["title"].value_counts().head(10)

In [24]:
# Create a new dataset by grouping: one row per title, holding the maximum of
# each engagement metric seen for that title.
title_metrics = ("views", "likes", "dislikes", "comment_count")
max_per_metric = {metric: "max" for metric in title_metrics}
df_un_title_max = df_new.groupby("title").agg(max_per_metric)
df_un_title_max.sort_values("views", ascending=False).head(25)

In [29]:
#Scale Variables and Standardization
from sklearn.preprocessing import MinMaxScaler
# Min-max scale each metric to [0, 1] independently and store it next to the
# raw column as "<metric>_scaled". A fresh scaler is fitted per column.
for metric in ("views", "likes", "dislikes", "comment_count"):
    metric_frame = df_un_title_max[[metric]]
    scaler = MinMaxScaler(feature_range=(0, 1)).fit(metric_frame)
    df_un_title_max[metric + "_scaled"] = scaler.transform(metric_frame)

In [30]:
# Function of Weighted Sorting Score
# c  : name of the comment_count column
# v  : name of the views column
# l  : name of the likes column
# dl : name of the dislikes column

# Formula : v * c * (l / (l + dl))

def weighted_sorting(dataframe, c, v, l, dl):
    """Return a per-row score: views * comments * share of likes among reactions.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame that contains the four columns named by the other arguments.
    c, v, l, dl : str
        Column labels for comment count, views, likes and dislikes.

    Returns
    -------
    pandas.Series
        ``dataframe[v] * dataframe[c] * (dataframe[l] / (dataframe[l] + dataframe[dl]))``
        on the index of ``dataframe``. Rows whose likes + dislikes is exactly
        zero get a like share of 0 (hence a score of 0) instead of the NaN/inf
        produced by dividing by zero. Rows with missing inputs remain NaN.
    """
    reactions = dataframe[l] + dataframe[dl]
    # Min-max scaled inputs map each column's minimum to exactly 0, so a row can
    # legitimately have likes == dislikes == 0; treat "no reactions" as share 0.
    like_share = (dataframe[l] / reactions).mask(reactions == 0, 0)
    return dataframe[v] * dataframe[c] * like_share

In [31]:
# Score every title from its min-max-scaled metric columns.
df_un_title_max["weighted_sorting_score"] = weighted_sorting(df_un_title_max,'comment_count_scaled','views_scaled','likes_scaled','dislikes_scaled')

In [32]:
# Top 25 titles by weighted sorting score.
df_un_title_max.sort_values("weighted_sorting_score",ascending=False).head(25)

In [33]:
# NOTE(review): this cell is identical to the preceding one; one of the two can be removed.
df_un_title_max.sort_values("weighted_sorting_score",ascending=False).head(25)

In [34]:
# Keep only the scaled columns and the score; the raw metrics are dropped.
raw_title_metrics = ["views", "likes", "dislikes", "comment_count"]
df_un_title_max = df_un_title_max.drop(columns=raw_title_metrics)

In [35]:
# Top 20 titles by score, now showing only the scaled columns and the score.
df_un_title_max.sort_values("weighted_sorting_score",ascending=False).head(20)

In [36]:
# Author's observation: the "So Sorry." and "we broke up" titles appear in the
# score-sorted list although they were not in the earlier views-sorted list.
# Move the title index back into a regular column.
df_un_title_max1 = df_un_title_max.reset_index()

In [38]:
# Visualization for top video titles: horizontal bars for the 20 titles with
# the highest weighted sorting score, highest first.
import seaborn as sns
sns.set_theme()

top_title_order = (
    df_un_title_max1
    .sort_values('weighted_sorting_score', ascending=False)
    .title
    .head(20)
)

plt.figure(figsize=(20, 10))
sns.barplot(
    data=df_un_title_max1,
    x='weighted_sorting_score',
    y="title",
    order=top_title_order,
)

plt.xlabel("Weighted Sorting Score", size=15)
plt.ylabel("Video Title", size=15)
plt.title("Top 20 Video Titles on Youtube", size=18)

In [39]:
# Category sorting by weighted score: total each engagement metric per category.
category_metrics = ("views", "likes", "dislikes", "comment_count")
sum_per_metric = {metric: "sum" for metric in category_metrics}
df_cat = df.groupby(["category"]).agg(sum_per_metric)
df_cat.sort_values("views", ascending=False)

In [40]:
# Min-max scale each category total to [0, 1] independently and store it as
# "<metric>_scaled". A fresh scaler is fitted per column.
for metric in ("views", "likes", "dislikes", "comment_count"):
    metric_frame = df_cat[[metric]]
    scaler = MinMaxScaler(feature_range=(0, 1)).fit(metric_frame)
    df_cat[metric + "_scaled"] = scaler.transform(metric_frame)

In [41]:
# Score every category from its min-max-scaled totals.
df_cat["weighted_sorting_score"] = weighted_sorting(df_cat,'comment_count_scaled','views_scaled','likes_scaled','dislikes_scaled')

In [42]:
# Keep only the scaled columns and the score; the raw totals are dropped.
raw_category_metrics = ["views", "likes", "dislikes", "comment_count"]
df_cat = df_cat.drop(columns=raw_category_metrics)

In [43]:
# Categories ranked by weighted sorting score, highest first.
df_cat.sort_values("weighted_sorting_score",ascending=False)

In [44]:
# Move the category index back into a regular column so it can be used as a plotting variable.
df_cat1 = df_cat.reset_index()

In [45]:
# Vertical bars for the five categories with the highest weighted sorting
# score, highest first.
top_category_order = (
    df_cat1
    .sort_values('weighted_sorting_score', ascending=False)
    .category
    .head()
)

plt.figure(figsize=(10, 6))
sns.barplot(
    data=df_cat1,
    x='category',
    y="weighted_sorting_score",
    order=top_category_order,
)

plt.xlabel("Video Category", size=15)
plt.ylabel("Weighted Sorting Score", size=15)
plt.title("Top 5 Video Categories on Youtube", size=18)
plt.tight_layout()


# Conclusion:
* The music category is in 1st place on the YouTube platform.
* The entertainment category comes after the music category.