In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
!pip install pytchat

Collecting pytchat
  Downloading pytchat-0.5.5-py3-none-any.whl.metadata (4.7 kB)
Collecting h2<5,>=3 (from httpx[http2]->pytchat)
  Downloading h2-4.1.0-py3-none-any.whl.metadata (3.6 kB)
Collecting hyperframe<7,>=6.0 (from h2<5,>=3->httpx[http2]->pytchat)
  Downloading hyperframe-6.0.1-py3-none-any.whl.metadata (2.7 kB)
Collecting hpack<5,>=4.0 (from h2<5,>=3->httpx[http2]->pytchat)
  Downloading hpack-4.0.0-py3-none-any.whl.metadata (2.5 kB)
Downloading pytchat-0.5.5-py3-none-any.whl (69 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.5/69.5 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading h2-4.1.0-py3-none-any.whl (57 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.5/57.5 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hpack-4.0.0-py3-none-any.whl (32 kB)
Downloading hyperframe-6.0.1-py3-none-any.whl (12 kB)
Installing collected packages: hyperframe, hpack, h2, pytchat
Successfully installed h2-

In [3]:
import pytchat
import time

In [4]:
import time

class ScraperConfig:
    """
    Collects the chat messages of one YouTube video through ``pytchat``.

    The object holds both the scraper setting (``patience``) and the scraped
    data. Every call to :meth:`scrape` appends to the same three lists, so use
    a fresh instance per video unless accumulation is wanted.

    Attributes:
        patience (int | float): Time budget in seconds for one ``scrape`` call.
            It is only enforced when ``scrape`` is called with
            ``enforce_patience=True``; by default it is ignored.
        datetimes (list): ``datetime`` attribute of every collected chat item.
        authors (list): Author name of every collected chat item.
        messages (list): Message text of every collected chat item.
    """

    def __init__(self, patience=10):
        """
        Creates an empty scraper with the given time budget.

        Args:
            patience (int | float, optional): Time budget in seconds used by
                ``scrape(..., enforce_patience=True)``. Defaults to 10.
        """
        self.patience = patience
        self.datetimes = []
        self.authors = []
        self.messages = []

    def scrape(self, vid, enforce_patience=False):
        """
        Reads the chat of a video and appends every item to the instance lists.

        Args:
            vid (str): YouTube video id, passed to ``pytchat.create``.
            enforce_patience (bool, optional): When True, stop as soon as
                ``self.patience`` seconds have elapsed. The check runs after
                each fetched batch, so a batch is never split. Defaults to
                False, which keeps reading until ``chat.is_alive()`` is False.

        Raises:
            Exception: Any error raised while creating the chat client or
                reading items is printed and then re-raised unchanged.
        """
        try:
            start_time = time.monotonic()
            chat = pytchat.create(video_id=vid)

            try:
                while chat.is_alive():
                    for c in chat.get().items:
                        self.datetimes.append(c.datetime)
                        self.authors.append(c.author.name)
                        self.messages.append(c.message)

                    # Emergency break: opt-in time limit for experiments.
                    if enforce_patience and time.monotonic() - start_time >= self.patience:
                        break
            finally:
                # Stop the client even when the loop exits early (time limit
                # or exception) so it does not stay alive in the background.
                chat.terminate()

        except Exception as e:
            print(f"Error during scraping: {e}")
            raise  # Re-raise the exception for potential handling outside the class

    def make_df(self):
        """
        Builds a DataFrame from everything collected so far.

        Returns:
            pandas.DataFrame: One row per chat item with the columns
            ``datetime``, ``author`` and ``message``. The frame has these
            columns and no rows when nothing has been scraped yet.
        """
        return pd.DataFrame({
            'datetime': self.datetimes,
            'author': self.authors,
            'message': self.messages,
        })

In [5]:
# Build one scraper for the single-video demo below and display its initial
# state: the datetimes list and the patience value passed in.
configs = ScraperConfig(patience=10) 
configs.datetimes, configs.patience

([], 10)

In [6]:
# Single-video demo: scrape one chat and preview the resulting frame.
# Caveat: `scrape` appends to the lists held by `configs`, so running this
# cell again without re-creating `configs` adds the same messages a second time.
configs.scrape(vid='4RZcdw-w2J0')
df = configs.make_df()
df.head()

Unnamed: 0,datetime,author,message
0,2024-06-30 04:55:16,ZAIN GAMING,iqoo Soule :rocket: :trophy:
1,2024-06-30 05:17:31,sangram,kya robbed bhai btayega
2,2024-06-30 05:28:58,Elamparthi,jokers :face_with_rolling_eyes:
3,2024-06-30 05:40:00,DARINDA GAMER,ajj Mera team jitega :upside_down_face::smilin...
4,2024-06-30 05:50:19,ƤŁΔ¥ βØ¥,this is Tamil chat :rolling_on_the_floor_laugh...


In [7]:
# x = 10
# df[x:x+10]

In [8]:
# Last rows of the single-video demo frame.
df.tail()

Unnamed: 0,datetime,author,message
9316,2024-06-30 13:55:54,Eren Giri,do you know top 10 team positions?
9317,2024-06-30 13:58:25,Redsign DEMI,Tl behind 4th position by 1 point
9318,2024-06-30 13:59:03,MOXLEY,3:crown: TAMIL TEAMS IN TOP10:red_heart::fire:
9319,2024-06-30 14:10:30,Nirmal,TT jokers.. from #2
9320,2024-06-30 14:13:03,Vivek Thangarasu,TT wins 8 lakhs money


In [9]:
# (rows, columns) of the single-video demo frame.
df.shape

(9321, 3)

## Start with Turkey (the variables below keep a `bd_` prefix)

In [10]:
# Load the per-video metadata table. NOTE: the variable keeps a `bd_` prefix
# although the file read here is the Turkey video list.
bd_vinfo = pd.read_csv('/kaggle/input/esports-data/video_information/turkey_video_info.csv')

# Report how many videos are listed.
print(f'length of the dataframe is {len(bd_vinfo)}')

# Preview the first rows as the cell output.
bd_vinfo.head(5)

length of the dataframe is 473


Unnamed: 0,video_id,title,view_count,like_count,comment_count,duration,date,time,duration_min
0,u8Sp6_DeXWA,YAYINCILAR ARASI ATLANTİS TURNUVASI 🔱 | PUBG M...,62898,1928,67,PT3H9M13S,2024-07-30,21:21:03,189.216667
1,NoNQnZO6JeY,[TR] 2024 PMWC x EWC Ana Turnuva 3. Gün | PUBG...,50123,1103,11,PT6H3M40S,2024-07-28,16:49:28,363.666667
2,Ho7NgSElv64,ECE SEÇKİN İLE EV PARTİ YAYINI | PUBG MOBILE,274848,177,12,PT36M21S,2024-07-28,13:58:46,36.35
3,XtGhVPvE2w8,[TR] 2024 PMWC x EWC Ana Turnuva 2. Gün | PUBG...,51514,1455,5,PT5H48M52S,2024-07-27,16:25:48,348.866667
4,xpzpdDMDXiY,[TR] 2024 PMWC x EWC Hayatta Kalma Aşaması 2....,49589,944,12,PT5H29M59S,2024-07-24,16:18:44,329.983333


## Note: the `patience` value can be ignored — it is only meant for cutting a scrape short after a time threshold while experimenting.

In [11]:
# demo_list = ['Vruf1fDg4WA', 'xUSlfWh7Pak', '7FjuKffwKrs']

# One scraper per row of the video table, each with a 4-hour patience value.
# A comprehension builds the list in one step and leaves no loop index behind
# in the notebook namespace.
sconfigs = [ScraperConfig(patience=4*3600) for _ in range(len(bd_vinfo))]

print(len(sconfigs))
sconfigs[1].patience

473


14400

In [12]:
# Plain Python list of the video ids, in table order; preview the first four.
video_ids = bd_vinfo['video_id'].tolist()
video_ids[:4]

['u8Sp6_DeXWA', 'NoNQnZO6JeY', 'Ho7NgSElv64', 'XtGhVPvE2w8']

In [13]:
# Number of chat rows collected per video, in `video_ids` order.
track = []

# One fresh ScraperConfig per video: `scrape` appends to the instance lists,
# so re-creating the objects here keeps this cell safe to re-run.
sconfigs = [ScraperConfig(patience=4*3600) for _ in range(len(bd_vinfo))]

# main df
columns = ['datetime', 'author', 'message', 'video_id']
main_df = pd.DataFrame(columns=columns)
print(main_df)

total_iterations = len(bd_vinfo['video_id'])

# Gather the per-video frames and concatenate once after the loop. Calling
# pd.concat inside the loop copies every row gathered so far on each pass and
# starts from an empty frame, which recent pandas versions warn about.
frames = []
for index, vid in tqdm(enumerate(video_ids), total=total_iterations):
    sconfigs[index].scrape(vid=vid)
    temp_df = sconfigs[index].make_df()
    temp_df['video_id'] = vid
    track.append(len(temp_df))
    frames.append(temp_df)

# With no videos at all, `main_df` stays the empty frame created above.
if frames:
    main_df = pd.concat(frames, ignore_index=True)

Empty DataFrame
Columns: [datetime, author, message, video_id]
Index: []


100%|██████████| 473/473 [1:42:50<00:00, 13.05s/it]


In [14]:
# Row counts of the first four videos, then the size of the combined frame.
print(track[:4])
main_df.shape

[4752, 1639, 257, 1710]


(2101365, 4)

In [15]:
# Same check as the previous cell (row counts of the first four videos and the
# combined shape); kept as recorded.
print(track[:4])
main_df.shape

[4752, 1639, 257, 1710]


(2101365, 4)

In [16]:
# Preview the first three rows of the combined frame.
main_df.head(3)

Unnamed: 0,datetime,author,message,video_id
0,2024-07-30 16:52:11,MUSTAFA KARADAĞ,vural abii,u8Sp6_DeXWA
1,2024-07-30 16:52:13,jeonggukkkie,evet kria dogru diosa mükemmel takim olur baya,u8Sp6_DeXWA
2,2024-07-30 16:52:14,MUSTAFA KARADAĞ,vural abii,u8Sp6_DeXWA


In [17]:
# De-duplication is left disabled; only the shape is reported.
# main_df.drop_duplicates(inplace=True)
main_df.shape

(2101365, 4)

In [18]:
# Count missing values per column of the combined frame.
main_df.isna().sum()

datetime    0
author      0
message     0
video_id    0
dtype: int64

## save the df

In [19]:
# Write and read back through one path, so the round-trip check always opens
# the file that was just written, whatever the working directory is.
output_path = 'turkey_livechat_data.parquet'
main_df.to_parquet(output_path, index=False)

# Reload the saved file and preview it. Note that `df` is rebound here and no
# longer refers to the single-video demo frame.
df = pd.read_parquet(output_path)
df.head(4)

Unnamed: 0,datetime,author,message,video_id
0,2024-07-30 16:52:11,MUSTAFA KARADAĞ,vural abii,u8Sp6_DeXWA
1,2024-07-30 16:52:13,jeonggukkkie,evet kria dogru diosa mükemmel takim olur baya,u8Sp6_DeXWA
2,2024-07-30 16:52:14,MUSTAFA KARADAĞ,vural abii,u8Sp6_DeXWA
3,2024-07-30 16:52:21,MUSTAFA KARADAĞ,vural abii,u8Sp6_DeXWA


In [20]:
# Shape of the frame read back from the parquet file, for comparison with
# `main_df.shape`.
df.shape

(2101365, 4)

### Check for `null values`

In [21]:
# Count missing values per column of the reloaded frame.
df.isna().sum()

datetime    0
author      0
message     0
video_id    0
dtype: int64

## OG code by oitik

In [22]:
# start_time = time.time()
# vid = "EoQjiNm7oCw"
# # vid = 'xUSlfWh7Pak'

# chat = pytchat.create(video_id = vid)

# while chat.is_alive():
#     for c in chat.get().items:
# #         print(f"{c.datetime} [{c.author.name}]- {c.message}")
#         configs.datetimes.append(c.datetime[:10])
#         configs.authors.append(c.author.name)
#         configs.messages.append(c.message)

# #     # Emergency Break
# #     if time.time() - start_time >= configs.patience:
# #         break 

In [23]:
# data = {
#     'datetime': configs.datetimes,
#     'author': configs.authors,
#     'message': configs.messages
# }

# df = pd.DataFrame(data)
# df.head()

In [24]:
# df.shape

In [25]:
# df.to_csv('esports_data.csv', index=False)

# df = pd.read_csv('/kaggle/working/esports_data.csv')
# df.head(4)

In [26]:
# df.dtypes

In [27]:
# df['length'] = [len(x.split()) for x in df['message']]

# df.head(2)

In [28]:
# sorted_df = df.sort_values(by='length', ascending=False).reset_index(drop=True)
# sorted_df.head(20)

In [29]:
# sorted_df['message'][50]

In [30]:
# import pytchat
# import time

# start_time = time.time()


# chat = pytchat.create(video_id='yuv1qvQywMU')

# while chat.is_alive():
# #     print(chat.get().json())
# #     time.sleep(5)
    
#     # Each chat item can also be output in JSON format.
#     for c in chat.get().items:
#         print(f"{c.datetime} [{c.author.name}]- {c.message}")
# #         print(c.json())

#     # Emergency Break
#     if time.time() - start_time >= 1:
#         break  

# `the end`