## Project Overview : 

* Extracted data from a YouTube playlist using the YouTube API 🔍.
* Transformed, organized, and processed the dataset using Python's Pandas library 🐍.
* Cleaned the data to make it more organized 🧹.
* Visualized the processed data using Power BI 📊.
* Created custom reports and dashboards with ease using Power BI's user-friendly interface 📈.

## Tasks for YouTube web scraping : 

 >1. Get the playlist ID of Sandeep Maheshwari's YouTube channel playlist "Meet Your Favourite YouTuber!"
 >2. Get all the video IDs of the Sandeep Maheshwari (SM) playlist
 >3. Get the title, date, views, likes, comments and duration of each video
 >4. Clean the dataset for analysis.

 

In [None]:
# Installation  of google api library
pip install google-api-python-client

In [4]:
#import libraries : 

from googleapiclient.discovery import build
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [5]:
import os

# SECURITY: the API key must never be stored in the notebook itself.
# It is read from the YOUTUBE_API_KEY environment variable instead; any key
# that was ever hardcoded in a shared copy of this notebook should be revoked
# and regenerated.
api_key = os.environ.get("YOUTUBE_API_KEY")
if not api_key:
    raise RuntimeError(
        "YOUTUBE_API_KEY is not set; export your YouTube Data API key before running this cell."
    )

#We need to create the service. : Construct a Resource for interacting with an API.

youtube = build("youtube","v3",developerKey= api_key)

### Task 1.  Extracting video IDs from playlist ID 

In [6]:
# Playlist ID of the "Meet Your Favourite YouTuber!" playlist
playlist_id = "PLWc1yfTYfqNGR7ZTyxTjGv7xOJIGZKe8O"


#function to get all video id of playlist
def get_video_id_from_playlist(youtube,playlist_id) :
    """Return the title and video ID of every item in a playlist.

    Pages through ``youtube.playlistItems().list`` (50 items per request)
    until the response no longer carries a ``nextPageToken``.

    Parameters
    ----------
    youtube : API resource object exposing ``playlistItems().list(...).execute()``.
    playlist_id : ID of the playlist to read.

    Returns
    -------
    list of dict
        One ``{'title': ..., 'video_id': ...}`` dict per playlist item, in
        the order the API returned them.
    """
    playlist_data = []
    next_page_token = None

    while True:
        request_kwargs = dict(part="snippet,contentDetails",
                              maxResults=50,
                              playlistId=playlist_id)
        # The first request is sent without a pageToken; later requests
        # carry the token handed back by the previous page.
        if next_page_token is not None:
            request_kwargs["pageToken"] = next_page_token

        response = youtube.playlistItems().list(**request_kwargs).execute()

        playlist_data.extend(
            dict(title=item['snippet']['title'],
                 video_id=item['contentDetails']['videoId'])
            for item in response['items']
        )

        # No token on the response means this was the last page.
        next_page_token = response.get('nextPageToken')
        if next_page_token is None:
            break

    return playlist_data





# Fetch the title and video ID of every item in the playlist (up to 50 items per API request)
playlist_data = get_video_id_from_playlist(youtube,playlist_id)

In [7]:
# Preview the first 5 playlist entries
playlist_data[:5]

[{'title': 'Meet Raft Motors Founders | Episode 81',
  'video_id': 'ejXrmLMNXSg'},
 {'title': 'Meet Groww Founder Lalit Keshre | Episode 80',
  'video_id': 'ju49EpQazLA'},
 {'title': 'Meet Entrepreneur Pramod Raj Shukla | Episode 79',
  'video_id': 'ywzowpv1w7E'},
 {'title': 'Meet Shradha Sharma YourStory Founder | Episode 78',
  'video_id': '1aS8u6LQfvc'},
 {'title': 'Meet Ridhima Arora Namhya Foods Founder | Episode 77',
  'video_id': 'tJX1-YxU6AE'}]

In [8]:
# Turn the list of per-video dicts into a DataFrame (columns: title, video_id)
playlist_table = pd.DataFrame(data=playlist_data)

# Preview the first 5 rows of the dataset
playlist_table.head()

Unnamed: 0,title,video_id
0,Meet Raft Motors Founders | Episode 81,ejXrmLMNXSg
1,Meet Groww Founder Lalit Keshre | Episode 80,ju49EpQazLA
2,Meet Entrepreneur Pramod Raj Shukla | Episode 79,ywzowpv1w7E
3,Meet Shradha Sharma YourStory Founder | Episod...,1aS8u6LQfvc
4,Meet Ridhima Arora Namhya Foods Founder | Epis...,tJX1-YxU6AE


In [9]:
playlist_table.info() # 82 playlist entries in the recorded run (entries, not necessarily guests)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82 entries, 0 to 81
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   title     82 non-null     object
 1   video_id  82 non-null     object
dtypes: object(2)
memory usage: 1.4+ KB


### Task 2  Extracting video contents from each video ID

In [10]:
# Collect every video ID from the playlist table into a plain Python list
video_ids = playlist_table['video_id'].tolist()


#function to get videos details of each video ID
def get_video_details_from_video_id(youtube,video_ids) : 
    """Fetch title, publish date, duration and counters for each video ID.

    The IDs are sent to ``youtube.videos().list`` in batches of at most 50.
    Results are collected in a list local to this call, so calling the
    function more than once never returns rows from an earlier call.

    Parameters
    ----------
    youtube : API resource object exposing ``videos().list(...).execute()``.
    video_ids : list of str
        Video IDs to look up.

    Returns
    -------
    list of dict
        One dict per returned video with the keys ``video_title``,
        ``publish_date``, ``duration``, ``total_views``, ``total_likes`` and
        ``total_comment``. Values are passed through as the API returned
        them. A counter that is absent from the video's ``statistics`` is
        reported as ``None`` instead of raising ``KeyError``.
    """
    batch_size = 50  # max number of ids accepted per request
    all_video_data = []

    for start in range(0, len(video_ids), batch_size):
        response = youtube.videos().list(
            part="snippet,contentDetails,statistics",
            id=','.join(video_ids[start:start + batch_size]),
        ).execute()

        for item in response['items']:
            # NOTE(review): counters such as likeCount / commentCount are
            # reportedly left out when likes are hidden or comments are
            # disabled — confirm against the API docs. .get() keeps the
            # fetch from aborting on such a video.
            statistics = item['statistics']
            all_video_data.append(dict(
                video_title=item['snippet']['title'],
                publish_date=item['snippet']['publishedAt'],
                duration=item['contentDetails']['duration'],
                total_views=statistics.get('viewCount'),
                total_likes=statistics.get('likeCount'),
                total_comment=statistics.get('commentCount'),
            ))

    return all_video_data




# NOTE: despite the "_dict" suffix, this holds a list with one dict per video.
all_videos_data_dict = get_video_details_from_video_id(youtube,video_ids)


In [231]:
# Preview the first 5 video records
all_videos_data_dict[:5]

[{'video_title': 'Meet Raft Motors Founders | Episode 81',
  'publish_date': '2023-01-25T08:05:13Z',
  'duration': 'PT26M29S',
  'total_views': '318868',
  'total_likes': '19021',
  'total_comment': '2134'},
 {'video_title': 'Meet Groww Founder Lalit Keshre | Episode 80',
  'publish_date': '2023-01-19T10:38:33Z',
  'duration': 'PT16M45S',
  'total_views': '588282',
  'total_likes': '31804',
  'total_comment': '3814'},
 {'video_title': 'Meet Entrepreneur Pramod Raj Shukla | Episode 79',
  'publish_date': '2023-01-16T07:58:13Z',
  'duration': 'PT22M16S',
  'total_views': '514631',
  'total_likes': '27631',
  'total_comment': '4782'},
 {'video_title': 'Meet Shradha Sharma YourStory Founder | Episode 78',
  'publish_date': '2023-01-12T12:32:23Z',
  'duration': 'PT21M55S',
  'total_views': '585720',
  'total_likes': '32782',
  'total_comment': '2599'},
 {'video_title': 'Meet Ridhima Arora Namhya Foods Founder | Episode 77',
  'publish_date': '2023-01-10T13:19:51Z',
  'duration': 'PT17M20S',

In [11]:
# Convert the list of per-video dicts into a DataFrame
videos_table = pd.DataFrame(data=all_videos_data_dict)

print('Done !!! We sucessfully scrap all the playlist video details !! ')

# Show the first 5 rows of the data
videos_table.head(5)

Done !!! We sucessfully scrap all the playlist video details !! 


Unnamed: 0,video_title,publish_date,duration,total_views,total_likes,total_comment
0,Meet Raft Motors Founders | Episode 81,2023-01-25T08:05:13Z,PT26M29S,371269,20934,2787
1,Meet Groww Founder Lalit Keshre | Episode 80,2023-01-19T10:38:33Z,PT16M45S,605815,32431,3840
2,Meet Entrepreneur Pramod Raj Shukla | Episode 79,2023-01-16T07:58:13Z,PT22M16S,522231,27875,4796
3,Meet Shradha Sharma YourStory Founder | Episod...,2023-01-12T12:32:23Z,PT21M55S,597991,33351,2620
4,Meet Ridhima Arora Namhya Foods Founder | Epis...,2023-01-10T13:19:51Z,PT17M20S,730348,35226,2471


## Task 3 Data cleaning : 

### 1.  Arranging videos from earliest to latest : 

In [12]:
# In the recorded run the table starts with the newest episode, so flip the
# row order (earliest -> latest) and renumber the index from 0.
# NOTE(review): the trailing .iloc[1:] then drops the first row of the flipped
# table (the last entry returned for the playlist) — presumably a video that
# is not a numbered episode; confirm. This cell is not idempotent: running it
# again flips the table back and drops another row.
videos_table = videos_table.iloc[::-1].reset_index(drop=True).iloc[1:]

### 2.  Basics overview of data set : 


In [13]:
# Column overview; in the recorded run every column is still object (string) dtype here
videos_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 1 to 81
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   video_title    81 non-null     object
 1   publish_date   81 non-null     object
 2   duration       81 non-null     object
 3   total_views    81 non-null     object
 4   total_likes    81 non-null     object
 5   total_comment  81 non-null     object
dtypes: object(6)
memory usage: 3.9+ KB


In [14]:
videos_table.head() # snapshot before the dtype conversions and title cleaning

Unnamed: 0,video_title,publish_date,duration,total_views,total_likes,total_comment
1,Meet Aman Dhattarwal | Episode 1,2022-05-25T07:04:17Z,PT26M30S,4785815,329711,12007
2,Meet Himanshi Singh | Episode 2,2022-06-01T08:03:10Z,PT23M1S,4341495,256433,7162
3,Meet Muddassir Khan | Founder of YeBook | Epis...,2022-06-08T12:30:11Z,PT20M46S,1660859,102423,4789
4,Meet MBA Chai Wala | Prafull Billore | Episode 4,2022-06-13T14:30:13Z,PT22M33S,12260590,777808,17324
5,Meet Vivek Mittal | Fit Tuber | Episode 5,2022-06-18T11:58:06Z,PT20M57S,2882346,158908,5198


### 3. Dtype Conversions

### 3.1 Converting integer columns to int dtype

In [15]:

# The count columns hold numeric strings; cast all of them to 64-bit integers in one step
int_cols = ['total_views', 'total_likes','total_comment']
videos_table[int_cols] = videos_table[int_cols].astype('int64')
    

### 3.2  Converting publish date to a date object : 

In [16]:
videos_table['publish_date'] = (
    pd.to_datetime(videos_table['publish_date'])  # parse the timestamp strings
    .dt.date                                      # keep only the calendar date
)

### 3.3 Converting the duration column to hours-minutes-seconds

In [17]:
import re
from datetime import datetime

def convert_time(string) : 
    """Convert a YouTube duration string into a ``datetime.time``.

    The duration may be given with or without its leading ``PT``
    (e.g. ``"26M29S"`` or ``"PT26M29S"``). Each of the hour, minute and
    second parts is optional and counts as 0 when missing, so ``"1H2M3S"``,
    ``"5M"`` and ``"45S"`` are all accepted.

    Parameters
    ----------
    string : str
        Duration such as ``"26M29S"``.

    Returns
    -------
    datetime.time
        The duration expressed as a time of day (``00:26:29``).

    Raises
    ------
    ValueError
        If the string contains no duration part, has an unexpected format,
        or does not fit in a time of day (e.g. 24 hours or more).
    """
    match = re.fullmatch(r'(?:PT)?(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?', string)
    # Every group is optional, so an empty string or a bare "PT" would match
    # too; require at least one real component.
    if match is None or not any(match.groups()):
        raise ValueError("unsupported duration format: %r" % (string,))

    hours, minutes, seconds = (int(part) if part else 0 for part in match.groups())
    # Building a datetime range-checks each field (raising ValueError when a
    # value is out of range) before the time part is extracted.
    return datetime(1900, 1, 1, hours, minutes, seconds).time()
    
# Drop the first two characters (the "PT" prefix seen in the raw data) and parse the rest
videos_table['duration'] = videos_table['duration'].str.slice(2).map(convert_time)


### 4.Cleaning video_title 

In [18]:
# Guest name = the part of the title before the first '|', without the
# leading word "Meet". Only a leading, whole-word "Meet" is removed, so a
# name that itself contains "Meet" (e.g. "Meetali") is left intact.
videos_table['Name'] = (
    videos_table['video_title']
    .str.split('|', n=1).str[0]
    .str.replace(r'^\s*Meet\b', '', regex=True)
    .str.strip()
)

# Put the new Name column right after the title
videos_table = videos_table.reindex(columns=['video_title','Name','publish_date', 'duration', 'total_views', 'total_likes',
       'total_comment'])

In [19]:

print('Cleaning is complete now !!!')

# First 5 rows of the cleaned table
videos_table.head(5)

Cleaning is complete now !!!


Unnamed: 0,video_title,Name,publish_date,duration,total_views,total_likes,total_comment
1,Meet Aman Dhattarwal | Episode 1,Aman Dhattarwal,2022-05-25,00:26:30,4785815,329711,12007
2,Meet Himanshi Singh | Episode 2,Himanshi Singh,2022-06-01,00:23:01,4341495,256433,7162
3,Meet Muddassir Khan | Founder of YeBook | Epis...,Muddassir Khan,2022-06-08,00:20:46,1660859,102423,4789
4,Meet MBA Chai Wala | Prafull Billore | Episode 4,MBA Chai Wala,2022-06-13,00:22:33,12260590,777808,17324
5,Meet Vivek Mittal | Fit Tuber | Episode 5,Vivek Mittal,2022-06-18,00:20:57,2882346,158908,5198


### Saving the file to .csv : 

In [20]:
# Write the cleaned table to CSV in the working directory, without the index column
videos_table.to_csv('sm_playlist_videos.csv',index= False)

### Final dataset is ready for analysis : 

In [22]:
# Display the cleaned table (the notebook truncates the middle rows of a long frame)
videos_table

Unnamed: 0,video_title,Name,publish_date,duration,total_views,total_likes,total_comment
1,Meet Aman Dhattarwal | Episode 1,Aman Dhattarwal,2022-05-25,00:26:30,4785815,329711,12007
2,Meet Himanshi Singh | Episode 2,Himanshi Singh,2022-06-01,00:23:01,4341495,256433,7162
3,Meet Muddassir Khan | Founder of YeBook | Epis...,Muddassir Khan,2022-06-08,00:20:46,1660859,102423,4789
4,Meet MBA Chai Wala | Prafull Billore | Episode 4,MBA Chai Wala,2022-06-13,00:22:33,12260590,777808,17324
5,Meet Vivek Mittal | Fit Tuber | Episode 5,Vivek Mittal,2022-06-18,00:20:57,2882346,158908,5198
...,...,...,...,...,...,...,...
77,Meet Ridhima Arora Namhya Foods Founder | Epis...,Ridhima Arora Namhya Foods Founder,2023-01-10,00:17:20,730348,35226,2471
78,Meet Shradha Sharma YourStory Founder | Episod...,Shradha Sharma YourStory Founder,2023-01-12,00:21:55,597991,33351,2620
79,Meet Entrepreneur Pramod Raj Shukla | Episode 79,Entrepreneur Pramod Raj Shukla,2023-01-16,00:22:16,522231,27875,4796
80,Meet Groww Founder Lalit Keshre | Episode 80,Groww Founder Lalit Keshre,2023-01-19,00:16:45,605815,32431,3840
