### Importing Required Libraries

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the US comments CSV into a DataFrame. on_bad_lines="skip" makes pandas
# drop rows it cannot parse instead of raising an error.
# NOTE(review): the path below is an absolute, machine-specific Windows path;
# the notebook will not run elsewhere without editing it.
comments = pd.read_csv(
   r"C:\Users\rocke\OneDrive\Documents\Youtube_Data/UScomments.csv",
    on_bad_lines="skip"
)


In [4]:
comments.head(5)


Unnamed: 0,video_id,comment_text,likes,replies
0,XpVt6Z1Gjjo,Logan Paul it's yo big day ‼️‼️‼️,4,0
1,XpVt6Z1Gjjo,I've been following you from the start of your...,3,0
2,XpVt6Z1Gjjo,Say hi to Kong and maverick for me,3,0
3,XpVt6Z1Gjjo,MY FAN . attendance,3,0
4,XpVt6Z1Gjjo,trending 😉,3,0


In [5]:
type(comments)

pandas.core.frame.DataFrame

### Find total missing values per column

In [6]:
comments.isnull().sum()

video_id         0
comment_text    26
likes            0
replies          0
dtype: int64

### Drop all rows that contain any missing (NaN) values from the comments DataFrame, and apply the change directly to the original DataFrame.

In [7]:
# inplace=True mutates `comments` itself and returns None, so re-running this
# cell is harmless but any earlier display of the frame is now stale.
comments.dropna(inplace=True)

#Rows with any missing data were removed.

In [8]:
comments.isnull().sum()

video_id        0
comment_text    0
likes           0
replies         0
dtype: int64

### 📦 Downloading VADER Lexicon for Sentiment Analysis & 🧠 Importing VADER Sentiment Analyzer and Accessing Comment Text

In [None]:
import nltk

In [None]:

nltk.download("vader_lexicon")

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [None]:
sia = SentimentIntensityAnalyzer()

In [None]:
comments["comment_text"]

In [None]:
sia.polarity_scores("MY FAN . attendance")['compound']

In [None]:
# VADER compound score for every comment, in row order. str() guards against
# any entry that is not already a string.
sentimen_scores = [
    sia.polarity_scores(str(comment))['compound']
    for comment in comments["comment_text"]
]

In [None]:
# First 10,000 rows of `comments` (positional slice).
# NOTE(review): `sample_df` is not referenced by any later cell visible here.
sample_df = comments[0:10000]

In [None]:
# sentimen_scores

In [None]:
comments["polarity"] = sentimen_scores

In [None]:
comments.head(5)

### Word cloud analysis of data 


### Comments with strongly positive sentiment (polarity between 0.8 and 1.0)

In [None]:
# Boolean mask: compound polarity within the closed interval [0.8, 1.0].
filter_pos = comments["polarity"].between(0.8, 1.0)

In [None]:
comments_positive = comments[filter_pos]

In [None]:
comments_positive.shape

### Comments with strongly negative sentiment (polarity between -1.0 and -0.8)

In [None]:
# Boolean mask: compound polarity within the closed interval [-1.0, -0.8].
filter_neg = comments["polarity"].between(-1.0, -0.8)

In [None]:
comments_negative = comments[filter_neg]

In [None]:
comments_negative.shape

In [None]:
#!pip install wordcloud

In [None]:
comments_positive["comment_text"]

In [None]:
type(comments_positive["comment_text"])

In [None]:
total_positive_comments = ' '.join(comments_positive["comment_text"])

In [None]:
total_positive_comments[0:1000]

In [None]:
#set(STOPWORDS)

In [None]:
from wordcloud import WordCloud, STOPWORDS

In [None]:
# Build the word cloud from the joined positive comments, excluding the
# stop words shipped with the wordcloud package (STOPWORDS).
wordcloud_positive = WordCloud(stopwords= set(STOPWORDS)).generate(total_positive_comments)

In [None]:
# Render the positive-sentiment word cloud as an image with the axes hidden.
plt.imshow(wordcloud_positive)
plt.axis("off")
# Show the figure explicitly so the cell does not echo the tuple returned by
# plt.axis(); this matches the later plotting cells that end with plt.show().
plt.show()

In [None]:
total_negative_comments = ' '.join(comments_negative["comment_text"])

In [None]:
# Build the word cloud from the joined negative comments, excluding the
# stop words shipped with the wordcloud package (STOPWORDS).
wordcloud_negative = WordCloud(stopwords= set(STOPWORDS)).generate(total_negative_comments)

In [None]:
# Render the negative-sentiment word cloud as an image with the axes hidden.
plt.imshow(wordcloud_negative)
plt.axis("off")
# Show the figure explicitly so the cell does not echo the tuple returned by
# plt.axis(); this matches the later plotting cells that end with plt.show().
plt.show()

### Perform Emoji Analysis

In [None]:
!pip install emoji

In [None]:
import emoji 

In [None]:
emoji.__version__

In [None]:
comments.head(5)

In [None]:
emoji_info = emoji.emoji_list("trending 😉")

In [None]:
emoji_info

In [None]:
[item["emoji"] for item in emoji_info]

In [None]:
comments["comment_text"]

In [None]:
# Flat list of every emoji occurrence across all comments, in comment order.
# emoji.emoji_list() returns one dict per match; only its "emoji" entry is kept.
all_emoji_found = [
    match["emoji"]
    for comment in comments["comment_text"]
    for match in emoji.emoji_list(comment)
]

In [None]:
all_emoji_found[0:10]

In [None]:
len(all_emoji_found)

In [None]:
from collections import Counter 

In [None]:
emoji_count_list_top10 = Counter(all_emoji_found).most_common(10)

In [None]:
emoji_count_list_top10

In [None]:
# Split the (emoji, count) pairs into two parallel lists for plotting.
# The loop variable is not called `emoji`, so it cannot be confused with the
# imported `emoji` module.
emojis = [symbol for symbol, _ in emoji_count_list_top10]
counts = [occurrences for _, occurrences in emoji_count_list_top10]

In [None]:
emojis

In [None]:
counts

In [None]:
!pip install plotly

In [None]:
import plotly.graph_objs as go
from plotly.offline import iplot

In [None]:
iplot([go.Bar(x = emojis , y = counts)])

### Total hours of games played

In [None]:
games_name = ["Dota 2","Fortnite","Squad","Brawl Stars"]

In [None]:
hours_played = [3000,1000,400,2000]

In [None]:
iplot([go.Bar(x = games_name, y = hours_played)])

# Collect the entire YouTube dataset

In [None]:
import os

In [None]:
files = os.listdir(r'C:\Users\rocke\Downloads\Youtube_Data_Analysis\Dataset\additional_data')

In [None]:
files

In [None]:
# Keep only files whose name ends with ".csv". A substring test ('.csv' in file)
# would also pick up names such as "data.csv.bak".
files_csv = [file for file in files if file.endswith('.csv')]

In [None]:
files_csv

In [None]:
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so it also hides diagnostics such as
# pandas' SettingWithCopyWarning; consider narrowing it to specific categories.
import warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
import pandas as pd

In [None]:
path = r'C:\Users\rocke\Downloads\Youtube_Data_Analysis\Dataset\additional_data'

# Read every CSV into its own frame first, then concatenate once. Calling
# pd.concat inside the loop re-copies all previously accumulated rows on each
# iteration, which is quadratic in the total amount of data.
frames = [
    pd.read_csv(os.path.join(path, file), encoding='iso-8859-1', on_bad_lines="skip")
    for file in files_csv
]

# pd.concat raises on an empty list, so fall back to an empty DataFrame, which
# is what the incremental version produced when no CSV files were found.
full_df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [None]:
full_df.shape

# Export data to CSV, JSON, databases, etc.

In [None]:
full_df.duplicated()

In [None]:
full_df = full_df.drop_duplicates()

In [None]:
full_df.shape

# Export to csv files

In [None]:
full_df[0:1000].to_csv(r'C:\Users\rocke\Downloads\Youtube\export_data/youtube_sample.csv', index=False)

# Export to json files

In [None]:
full_df[0:1000].to_json(r'C:\Users\rocke\Downloads\Youtube\export_data/youtube_sample.json')

# Export to database 

In [None]:
from sqlalchemy import create_engine

In [None]:
engine = create_engine(r'sqlite:///C:\Users\rocke\Downloads\Youtube\export_data/youtube_sample.sqlite')

In [None]:
# Write the first 1000 rows to the 'Users' table. if_exists='replace' keeps the
# cell idempotent: with 'append', each re-run of the notebook would add the same
# 1000 rows to the existing SQLite file again.
full_df[0:1000].to_sql('Users', con=engine, if_exists='replace', index=False)

# Analysing the most liked category

In [None]:
full_df.head(5)

In [None]:
full_df['category_id'].unique()

In [None]:
json_df = pd.read_json(r'C:\Users\rocke\Downloads\Youtube_Data_Analysis\Dataset\additional_data/US_category_id.json')

In [None]:
json_df['items'][0]

In [None]:
json_df['items'][1]

In [None]:
# Map each numeric category id to its human-readable title, taken from the
# 'items' entries of the category JSON.
cat_dict = {
    int(entry['id']): entry['snippet']['title']
    for entry in json_df['items'].values
}

In [None]:
cat_dict

In [None]:
full_df['category_name'] = full_df['category_id'].map(cat_dict)

In [None]:
full_df.head(4)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Distribution of likes per category; category names are rotated so they do
# not overlap on the x axis.
plt.figure(figsize=(12,8))
sns.boxplot(x='category_name', y='likes', data=full_df)
plt.xticks(rotation='vertical')
# Show the figure explicitly so the cell does not echo the value returned by
# plt.xticks(); this matches the like_rate boxplot cell, which ends the same way.
plt.show()

# Analyzing Youtube Audience Engagement

In [None]:
# Express each engagement metric as a percentage of the video's views.
for source_col, rate_col in (
    ('likes', 'like_rate'),
    ('dislikes', 'dislike_rate'),
    ('comment_count', 'comment_count_rate'),
):
    full_df[rate_col] = full_df[source_col]/full_df['views']*100

In [None]:
full_df.columns

In [None]:
full_df.head(5)

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(x='category_name', y='like_rate', data=full_df)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
sns.regplot(x='views', y='likes', data=full_df)
plt.show()

In [None]:
full_df.columns

In [None]:
full_df[['views','likes','dislikes']]

In [None]:
full_df[['views','likes','dislikes']].corr()

In [None]:
# Annotated heatmap of the pairwise correlations between views, likes and dislikes.
sns.heatmap(full_df[['views','likes','dislikes']].corr(),annot=True)
# Show the figure explicitly so the cell does not echo the Axes repr; this
# matches the other plotting cells that end with plt.show().
plt.show()

# Analyzing Trending Youtube Videos by Channel

In [None]:
full_df.head(6)

In [None]:
full_df['channel_title'].value_counts()

In [None]:
# Number of rows per channel, largest first, turned back into a two-column frame.
cdf = (
    full_df
    .groupby(['channel_title'])
    .size()
    .sort_values(ascending=False)
    .reset_index()
)

In [None]:
cdf

In [None]:
#cdf.rename(columns={0:'total_videos'})

In [None]:
cdf.columns=['channel_title', 'total_videos']

In [None]:
cdf 

In [None]:
import plotly.express as px 

In [None]:
fig = px.bar(cdf[:20], x='channel_title', y='total_videos',title="Top 20 Channels by Number of Videos")
fig.show()


# Does punctuation have an impact on views, likes and dislikes?

In [None]:
full_df['title'][0]

In [None]:
import string

In [None]:
string.punctuation

In [None]:
len([char for char in full_df['title'][0] if char in string.punctuation])

In [None]:
def punc_count(text):
    """Count the punctuation characters in ``text``.

    A character counts as punctuation when it appears in
    ``string.punctuation`` (ASCII punctuation only).

    Parameters
    ----------
    text : str
        The text to inspect. Non-string values (for example ``None`` or the
        float NaN that pandas uses for missing text) are treated as
        containing no punctuation.

    Returns
    -------
    int
        Number of punctuation characters found; 0 for empty or non-string input.
    """
    # Missing titles arrive as NaN (a float) when applied over a pandas column;
    # iterating over them would raise TypeError.
    if not isinstance(text, str):
        return 0
    # Count lazily instead of materialising a list only to take its length.
    return sum(1 for char in text if char in string.punctuation)

In [None]:
# Work on an independent copy of the first 10,000 rows. A new column is added
# to `sample` in a later cell; assigning to a plain slice of full_df triggers
# pandas' SettingWithCopyWarning and leaves it ambiguous which frame is modified.
sample = full_df[0:10000].copy()

In [None]:
sample['count_punc'] = sample['title'].apply(punc_count)

In [None]:
sample['count_punc'] 

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(x='count_punc', y='views', data=sample)
plt.title("Relationship Between Punctuation Frequency and Views")
plt.show()

In [None]:
plt.figure(figsize=(8,6))
sns.boxplot(x='count_punc', y='likes', data=sample)
plt.title("Relationship Between Punctuation Frequency and Likes")
plt.show()