<a href="https://colab.research.google.com/github/petermesy/scrap/blob/main/Sentimnet_data_Scrap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install google-api-python-client



In [None]:
import html
import json
import os
import re

from googleapiclient.discovery import build
from transformers import pipeline


In [None]:
def clean_comment(text):

    text = re.sub(r"http\S+|www\S+", "", text)

    emoji_pattern = re.compile(
        "["
        "\U0001F600-\U0001F64F"  # Emoticons
        "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
        "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
        "\U0001F1E0-\U0001F1FF"  # Flags
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251"  # Enclosed Characters
        "\U0000200D"             # Zero Width Joiner
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002600-\U000026FF"  # Miscellaneous Symbols
        "\U00002300-\U000023FF"  # Miscellaneous Technical
        "]+",
        flags=re.UNICODE,
    )
    text = emoji_pattern.sub("", text)

    text = re.sub(r"[a-zA-Z]", "", text)

    text = re.sub(r"\s+", " ", text).strip()

    return text

In [None]:
def fetch_amharic_comments(video_id, api_key):
    """Collect the cleaned top-level comments of a video that contain Ethiopic script.

    Pages through ``commentThreads().list`` (100 threads per request) until the
    response carries no ``nextPageToken``. A comment is kept when its
    ``textDisplay`` has at least one character in U+1200-U+137F and
    ``clean_comment`` leaves a non-empty string.

    Args:
        video_id (str): ID of the video whose comment threads are listed.
        api_key (str): Developer key handed to ``build``.

    Returns:
        list: One ``{"comment_id": ..., "text": ...}`` dict per kept comment,
        in the order the API returned them.
    """
    client = build("youtube", "v3", developerKey=api_key)

    collected = []
    page_token = None
    more_pages = True

    while more_pages:
        page = client.commentThreads().list(
            part="snippet",
            videoId=video_id,
            maxResults=100,
            pageToken=page_token,
        ).execute()

        for thread in page.get("items", []):
            raw_text = thread["snippet"]["topLevelComment"]["snippet"]["textDisplay"]

            # Skip comments without a single Ethiopic character.
            if not re.search(r'[\u1200-\u137F]', raw_text):
                continue

            cleaned = clean_comment(raw_text)
            # Cleaning can leave nothing behind; such comments are dropped.
            if not cleaned:
                continue

            collected.append({
                "comment_id": thread["id"],
                "text": cleaned,
                # "author": comment.get("authorDisplayName", "Unknown"),
                # "published_at": comment["publishedAt"]
            })

        page_token = page.get("nextPageToken")
        more_pages = bool(page_token)

    return collected

In [None]:
def save_comments_to_jsonl(comments, folder, filename):
    """Write every comment as one JSON line to ``folder/filename``.

    The folder is created when missing and an existing file is overwritten.
    Non-ASCII text is written as-is (UTF-8) rather than as ``\\uXXXX`` escapes.

    Args:
        comments (iterable): JSON-serialisable records, one per output line.
        folder (str): Directory that receives the file.
        filename (str): Name of the JSONL file inside ``folder``.
    """
    os.makedirs(folder, exist_ok=True)
    filepath = os.path.join(folder, filename)

    with open(filepath, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in comments
        )

    print(f"Comments saved to {filepath}")

In [None]:
if __name__ == "__main__":
    # Prefer a key supplied through the environment so a real credential never
    # has to be typed into (and committed with) the notebook. The placeholder
    # is kept as the fallback value.
    api_key = os.environ.get("YOUTUBE_API_KEY", "YOUR_API_KEY")

    video_ids = [
        "peIQN5jueQ0",
        "46cVVbBGbjY",
        "jIqOI662_dA",
        "_YddTPxw_68",
        "VT82OK6n3jw",
        "FsEEU4p5BZk",
        "0aP3FCNYbVc",
        "Lu8Or-WFZsk",
        "2aAU4m_uoTw",
        "rY_yW8QSSuQ",
        "hGMBe6Agq9c",
        "dXdAxoC4Pcc",
        "gV0fb0MGuH0",
        "a93nBqQJFOU",
        "hGMBe6Agq9c",
        "jIqOI662_dA",
        "pdGZZ54rmBM",
        "IE2AeGsBszA",
        "JPJsvIdqSIc",
        "ppfP5ZAO-Hg",
        "09Vkusq6U_s",
        "rmuaitA0CC8",
        "NACvhw4O7FU",
        "_WUaVftDl0s",
        "fXCTS47cYWc",
        "PLoY2qPbCCU",
        "V52XF2fzu1g"
    ]

    # The list repeats some IDs. Fetching a video twice only spends API quota
    # and rewrites the same output file, so keep the first occurrence of each
    # ID (dict.fromkeys preserves insertion order).
    unique_video_ids = list(dict.fromkeys(video_ids))

    output_folder = "Amharic_comments"

    # One JSONL file per video: comments_<video_id>.jsonl
    for video_id in unique_video_ids:
        print(f"Fetching Amharic comments for video ID: {video_id}")
        amharic_comments = fetch_amharic_comments(video_id, api_key)

        filename = f"comments_{video_id}.jsonl"
        save_comments_to_jsonl(amharic_comments, output_folder, filename)

Fetching Amharic comments for video ID: peIQN5jueQ0
Comments saved to Amharic_comments/comments_peIQN5jueQ0.jsonl
Fetching Amharic comments for video ID: 46cVVbBGbjY
Comments saved to Amharic_comments/comments_46cVVbBGbjY.jsonl
Fetching Amharic comments for video ID: jIqOI662_dA
Comments saved to Amharic_comments/comments_jIqOI662_dA.jsonl
Fetching Amharic comments for video ID: _YddTPxw_68
Comments saved to Amharic_comments/comments__YddTPxw_68.jsonl
Fetching Amharic comments for video ID: VT82OK6n3jw
Comments saved to Amharic_comments/comments_VT82OK6n3jw.jsonl
Fetching Amharic comments for video ID: FsEEU4p5BZk
Comments saved to Amharic_comments/comments_FsEEU4p5BZk.jsonl
Fetching Amharic comments for video ID: 0aP3FCNYbVc
Comments saved to Amharic_comments/comments_0aP3FCNYbVc.jsonl
Fetching Amharic comments for video ID: Lu8Or-WFZsk
Comments saved to Amharic_comments/comments_Lu8Or-WFZsk.jsonl
Fetching Amharic comments for video ID: 2aAU4m_uoTw
Comments saved to Amharic_comments/c

In [None]:
def read_comments_from_folder(folder_path):
    """Read the ``"text"`` field of every record in a folder's JSONL files.

    Only files whose name ends with ``.jsonl`` are read. They are processed in
    sorted filename order because ``os.listdir`` returns entries in arbitrary
    order, which would make the order of the result differ between runs.
    Blank lines are skipped instead of being passed to the JSON parser.

    Args:
        folder_path (str): Directory containing the JSONL files.

    Returns:
        list: The ``"text"`` value of each record, file by file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no ``"text"`` key.
    """
    all_comments = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".jsonl"):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"Reading comments from: {file_path}")

        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                # A trailing or stray empty line is not a record.
                if not line:
                    continue
                all_comments.append(json.loads(line)["text"])

    return all_comments

In [None]:
def save_labeled_data_to_jsonl(labeled_data, output_file):
    """
    Write labeled records to a JSONL file, one JSON object per line.

    An existing file is overwritten; non-ASCII text is stored unescaped as UTF-8.

    Args:
        labeled_data (list): Dictionaries describing the labeled comments.
        output_file (str): Path of the JSONL file to write.
    """
    with open(output_file, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in labeled_data
        )

    print(f"Labeled data saved to {output_file}")

In [None]:

if __name__ == "__main__":
    # Folder containing the scraped JSONL files. It is the single source of the
    # path: previously this variable held an unused value while a different
    # literal was passed to the reader.
    folder_path = "/content/Amharic_comments"  # Replace with the actual folder path

    # Read comments from the folder
    comments = read_comments_from_folder(folder_path)

    # Load a pre-trained sentiment analysis pipeline
    # NOTE(review): every sample in the recorded output of this cell is labeled
    # "5 stars" - confirm that this checkpoint actually covers Amharic before
    # trusting the labels.
    sentiment_analyzer = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")

    # Perform sentiment analysis
    labeled_data = []
    for comment in comments:
        # truncation=True cuts an over-long comment down to the model's maximum
        # input length instead of letting the forward pass fail on it.
        sentiment = sentiment_analyzer(comment, truncation=True)[0]
        labeled_data.append({"comment": comment, "sentiment": sentiment["label"]})

    # Print the labeled data
    for data in labeled_data:
        print(data)

    # Save the labeled data to a JSONL file
    output_file = "labeled_comments.jsonl"  # Replace with your desired output file path
    save_labeled_data_to_jsonl(labeled_data, output_file)

Reading comments from: /content/Amharic_comments/comments_09Vkusq6U_s.jsonl
Reading comments from: /content/Amharic_comments/comments_fXCTS47cYWc.jsonl
Reading comments from: /content/Amharic_comments/comments_ppfP5ZAO-Hg.jsonl
Reading comments from: /content/Amharic_comments/comments_jIqOI662_dA.jsonl
Reading comments from: /content/Amharic_comments/comments_rmuaitA0CC8.jsonl
Reading comments from: /content/Amharic_comments/comments_rY_yW8QSSuQ.jsonl
Reading comments from: /content/Amharic_comments/comments_dXdAxoC4Pcc.jsonl
Reading comments from: /content/Amharic_comments/comments_NACvhw4O7FU.jsonl
Reading comments from: /content/Amharic_comments/comments__WUaVftDl0s.jsonl
Reading comments from: /content/Amharic_comments/comments_46cVVbBGbjY.jsonl
Reading comments from: /content/Amharic_comments/comments_peIQN5jueQ0.jsonl
Reading comments from: /content/Amharic_comments/comments_PLoY2qPbCCU.jsonl
Reading comments from: /content/Amharic_comments/comments__YddTPxw_68.jsonl
Reading comm

config.json:   0%|          | 0.00/953 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/669M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/872k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Device set to use cpu


{'comment': 'ፍቅር ሶሲጋ አረ ወይኔ ተቃጠልኩ', 'sentiment': '5 stars'}
{'comment': 'ቹቻዬ የኔ ጀግነ እሰይ ደግ አደረግሽዉ ፀገ ስየንስብሽ ነዉ', 'sentiment': '5 stars'}
{'comment': 'ወይኔ ሶሲ', 'sentiment': '5 stars'}
{'comment': 'እረ ፀጋን አባሩልን', 'sentiment': '5 stars'}
{'comment': 'ቹቻየ ይመችሽ የኔ ቆንጆ', 'sentiment': '5 stars'}
{'comment': 'እቺ የ ቡርኪናፋሶ ጠንቃይ የመሰለችውን ወይዘሪት ፀጋ አፍልታ ነበር ቹቻዬ እጆችሽ ይባረኩ', 'sentiment': '5 stars'}
{'comment': 'ፀጋ ስያንሰት ነው ሀሹ ገነ ምን አይተሽ ከአፍላ ፍቅር ትወጪአለሽ አንችን እኔ ብያገኝሽ እበረግዲሸለው።አንች መጥፎ &#39;ኩፉ &#39;አረማኔ &#39;ጨካኝ ሀሹ', 'sentiment': '5 stars'}
{'comment': 'ይመቻት ይች ካታሊሥት', 'sentiment': '5 stars'}
{'comment': 'ሄሉ ቦቅባቃ ቁጥር በትሠጥ', 'sentiment': '5 stars'}
{'comment': 'ፀጋ መቼናዉ ምት ሞቶዉ', 'sentiment': '5 stars'}
{'comment': 'የመጨረሻ ነኝ ወላ ጀዋሌ ተበላሽቶ', 'sentiment': '5 stars'}
{'comment': 'እሰይ ፀጋ ይገባሻል', 'sentiment': '5 stars'}
{'comment': 'ፀጋ የሴጠን ቁራጭ', 'sentiment': '5 stars'}
{'comment': 'በጠም ደስ ብሎኛል', 'sentiment': '5 stars'}
{'comment': 'አልሀም ዱሊላ', 'sentiment': '5 stars'}
{'comment': 'ጸጋ ብትወጣ', 'sentiment': '5 stars'}

In [None]:
if __name__ == "__main__":
    # NOTE: label_comment is defined in a cell further down the notebook; that
    # cell has to be executed before this one.

    # Location of the scraped JSONL files
    folder_path = "/content/Amharic_comments"  # Replace with the actual folder path

    # Load every stored comment text
    comments = read_comments_from_folder(folder_path)

    # Pair each comment with its keyword-based label
    labeled_data = [
        dict(comment=text, sentiment=label_comment(text))
        for text in comments
    ]

    # Show the result, one record per line
    for data in labeled_data:
        print(data)

Reading comments from: /content/Amharic_comments/comments_0aP3FCNYbVc.jsonl
Reading comments from: /content/Amharic_comments/comments_Lu8Or-WFZsk.jsonl
Reading comments from: /content/Amharic_comments/comments_rY_yW8QSSuQ.jsonl
Reading comments from: /content/Amharic_comments/comments_gV0fb0MGuH0.jsonl
Reading comments from: /content/Amharic_comments/comments_VT82OK6n3jw.jsonl
Reading comments from: /content/Amharic_comments/comments_dXdAxoC4Pcc.jsonl
Reading comments from: /content/Amharic_comments/comments_2aAU4m_uoTw.jsonl
Reading comments from: /content/Amharic_comments/comments__YddTPxw_68.jsonl
Reading comments from: /content/Amharic_comments/comments_peIQN5jueQ0.jsonl
Reading comments from: /content/Amharic_comments/comments_a93nBqQJFOU.jsonl
Reading comments from: /content/Amharic_comments/comments_FsEEU4p5BZk.jsonl
Reading comments from: /content/Amharic_comments/comments_46cVVbBGbjY.jsonl
Reading comments from: /content/Amharic_comments/comments_jIqOI662_dA.jsonl
Reading comm

In [None]:

# Keyword lists consulted by label_comment (plain substring matching).
# They must be defined here: label_comment reads them as module-level names,
# and leaving them commented out makes it fail with NameError on a fresh kernel.
positive_keywords = ["ጥሩ", "አሪፍ", "በጣም ጥሩ", "አመሰግናለሁ"]
negative_keywords = ["መጥፎ", "አልወደድኩም", "አስቸጋሪ", "አልቻልኩም"]


def label_comment(comment):
    """
    Labels a comment as positive, negative, or neutral based on keywords.

    Positive keywords are checked first, so a comment containing both a
    positive and a negative keyword is labeled "positive".

    Args:
        comment (str): The comment text to label.

    Returns:
        str: "positive", "negative" or "neutral".
    """
    # An empty keyword is a substring of every string and would label every
    # comment with its class, so empty entries are ignored.
    if any(word and word in comment for word in positive_keywords):
        return "positive"
    if any(word and word in comment for word in negative_keywords):
        return "negative"
    return "neutral"

def read_comments_from_folder(folder_path):
    """
    Reads comments from all JSONL files in a folder.

    Files are visited in sorted filename order (``os.listdir`` returns entries
    in arbitrary order, which would make the result order vary between runs)
    and blank lines are skipped instead of being passed to the JSON parser.

    Args:
        folder_path (str): The path to the folder containing JSONL files.

    Returns:
        list: The "text" value of every record, file by file.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record has no "text" key.
    """
    all_comments = []

    # Iterate through the JSONL files in a reproducible order
    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".jsonl"):
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"Reading comments from: {file_path}")

        # Read the JSONL file, one record per non-blank line
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                all_comments.append(json.loads(line)["text"])

    return all_comments

def save_labeled_data_to_jsonl(labeled_data, output_file):
    """
    Write labeled records to a JSONL file, one JSON object per line.

    An existing file is overwritten; non-ASCII text is stored unescaped as UTF-8.

    Args:
        labeled_data (list): Dictionaries describing the labeled comments.
        output_file (str): Path of the JSONL file to write.
    """
    with open(output_file, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in labeled_data
        )

    print(f"Labeled data saved to {output_file}")


if __name__ == "__main__":
    # Location of the scraped JSONL files
    folder_path = "/content/Amharic_comments"  # Replace with the actual folder path

    # Load every stored comment text
    comments = read_comments_from_folder(folder_path)

    # Pair each comment with its keyword-based label
    labeled_data = [
        dict(comment=text, sentiment=label_comment(text))
        for text in comments
    ]

    # Show the result, one record per line
    for data in labeled_data:
        print(data)

    # Persist the labeled records as JSONL
    output_file = "labeled_comments.jsonl"  # Replace with your desired output file path
    save_labeled_data_to_jsonl(labeled_data, output_file)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'comment': 'በስተመጨረሻም አንድ ብር ተዘረዘረ (ቤካ የህዝብ ልጅ ከታይላንድ)', 'sentiment': 'positive'}
{'comment': 'ዛሬ ለ 2ተኛ ጊዜ የወቸው ጉድን ሙሉ ጨርሼ አየሁ ከ ቀጥሎ', 'sentiment': 'positive'}
{'comment': 'ዮናስ አርፈህ ተቀመጥ', 'sentiment': 'positive'}
{'comment': 'ሴቷ እና ጥቁር ቲሸርት የለበስከው ልጅ አጠያየቃችሁ ደስ አይልም ለዛ የለውም!', 'sentiment': 'positive'}
{'comment': '1 ብር ያራዳ ልጅ ሰይፉ ፋንታሁን የተወለደው ሀረር ,ቦቴ ሰፈር ነው ከዛ ወደ አ.አ አድርጎ ነው', 'sentiment': 'positive'}
{'comment': 'አቦ ዛሬ ያበደ እንግዳ ነዉ ያመጣችሁት', 'sentiment': 'positive'}
{'comment': 'ቆሻሾች<>እና ቆሻሻ ስራችሁን ቆሻሻ መባል አለበት እበት በአፋችሁ እየተፀዳዳችሁ ጥሩ ነዉ እንድትባሉ ነዉ ምትፈልጉት', 'sentiment': 'positive'}
{'comment': 'ግን ሁሌ ወንድ በተለይ በመጣ ቁጥር ማሂን ጠርጥራቾት ትችሉታላችሁ አረ ፍቷት', 'sentiment': 'positive'}
{'comment': 'እረ ማሂን ተፋቷት ምንድነው ኮሜንቶች', 'sentiment': 'positive'}
{'comment': 'አቀራረባችሁን ግን አልወደውም ቧልት በዛበት', 'sentiment': 'positive'}
{'comment': 'በራሱ ቀልድ ፍርስ ብሎ ይስቃል', 'sentiment': 'positive'}
{'comment': 'ሀበሻ ሆናችሁ ለሀበሻ ያላችሁ ንቀት ይገርመኛል ምን አይነት ጀዝባ ፍጥረቶች እንደሆናችሁ'