<a href="https://colab.research.google.com/github/petermesy/scrap/blob/main/Seintment_scrap_Label.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install google-api-python-client



In [None]:
import html
import json
import os
import re

from googleapiclient.discovery import build
from transformers import pipeline


In [None]:
# Patterns used by clean_comment, compiled once instead of on every call.
_TAG_PATTERN = re.compile(r"<[^>]+>")
_URL_PATTERN = re.compile(r"http\S+|www\S+")
_LATIN_PATTERN = re.compile(r"[a-zA-Z]")
_WHITESPACE_PATTERN = re.compile(r"\s+")
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
    "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
    "\U0001F1E0-\U0001F1FF"  # Flags
    "\U00002702-\U000027B0"  # Dingbats
    # The original single range U+24C2..U+1F251 is split in three so that it
    # no longer swallows Ethiopic Extended (U+2D80..U+2DDF) and Ethiopic
    # Extended-A (U+AB00..U+AB2F); everything else in it is still removed.
    "\U000024C2-\U00002D7F"
    "\U00002DE0-\U0000AAFF"
    "\U0000AB30-\U0001F251"
    "\U0000200D"             # Zero Width Joiner
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002600-\U000026FF"  # Miscellaneous Symbols
    "\U00002300-\U000023FF"  # Miscellaneous Technical
    "]+",
    flags=re.UNICODE,
)


def clean_comment(text):
    """Strip markup, links, emoji and Latin letters from a comment.

    Steps, in order:
      1. replace HTML tags with a space (so ``<br>`` still separates words);
      2. decode HTML entities such as ``&#39;``;
      3. remove ``http...`` / ``www...`` links;
      4. remove the emoji / pictograph ranges in ``_EMOJI_PATTERN``;
      5. remove ASCII letters ``a-z`` / ``A-Z``;
      6. collapse whitespace runs to one space and trim both ends.

    Args:
        text (str): Raw comment text.

    Returns:
        str: The cleaned text; may be empty.
    """
    # Tags must go before Latin letters are removed: otherwise "<br>" turns
    # into "<>", which the tag pattern (one or more inner chars) cannot match.
    text = _TAG_PATTERN.sub(" ", text)
    # Entities must be decoded before Latin letters are removed: otherwise
    # "&quot;" degrades to "&;" and can no longer be decoded.
    text = html.unescape(text)
    text = _URL_PATTERN.sub("", text)
    text = _EMOJI_PATTERN.sub("", text)
    text = _LATIN_PATTERN.sub("", text)
    # Whitespace is normalised last so gaps left by earlier removals collapse.
    return _WHITESPACE_PATTERN.sub(" ", text).strip()

In [None]:
# Matches any code point in the basic Ethiopic block (U+1200..U+137F).
_ETHIOPIC_PATTERN = re.compile(r'[\u1200-\u137F]')


def fetch_amharic_comments(video_id, api_key):
    """Collect the top-level comments of a video that contain Ethiopic script.

    Pages through ``commentThreads.list`` (100 threads per request) until no
    ``nextPageToken`` is returned. A comment is kept when its text contains at
    least one character in U+1200..U+137F and is non-empty after
    ``clean_comment``.

    Args:
        video_id (str): YouTube video ID.
        api_key (str): Developer key passed to the API client.

    Returns:
        list[dict]: One ``{"comment_id": ..., "text": ...}`` dict per kept
        comment, where ``text`` is the cleaned text.
    """
    youtube = build("youtube", "v3", developerKey=api_key)

    amharic_comments = []
    next_page_token = None

    while True:
        request = youtube.commentThreads().list(
            part="snippet",
            videoId=video_id,
            pageToken=next_page_token,
            maxResults=100,
            # The API defaults to HTML; ask for plain text so markup and
            # entities (e.g. "<br>", "&#39;") do not leak into the corpus.
            textFormat="plainText",
        )
        response = request.execute()

        for item in response.get("items", []):
            comment = item["snippet"]["topLevelComment"]["snippet"]
            text = comment["textDisplay"]
            if not _ETHIOPIC_PATTERN.search(text):
                continue
            cleaned_text = clean_comment(text)
            if cleaned_text:  # Only keep comments that survive cleaning
                amharic_comments.append({
                    "comment_id": item["id"],
                    "text": cleaned_text,
                })

        next_page_token = response.get("nextPageToken")
        if not next_page_token:
            break

    return amharic_comments

In [None]:
def save_comments_to_jsonl(comments, folder, filename):
    """Write comments to ``folder/filename`` as JSON Lines.

    The folder is created if it does not exist, the file is overwritten, and
    non-ASCII characters are written as-is (UTF-8) rather than escaped.

    Args:
        comments (iterable): JSON-serialisable records, one per output line.
        folder (str): Destination directory.
        filename (str): Name of the file inside ``folder``.
    """
    os.makedirs(folder, exist_ok=True)
    filepath = os.path.join(folder, filename)

    with open(filepath, "w", encoding="utf-8") as sink:
        sink.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in comments
        )

    print(f"Comments saved to {filepath}")

In [None]:
if __name__ == "__main__":
    # Read the key from the environment when available so it does not have to
    # be pasted into the notebook; the placeholder is kept as the fallback.
    api_key = os.environ.get("YOUTUBE_API_KEY", "YOUR API KEY")

    # dict.fromkeys drops repeated IDs while keeping first-seen order, so a
    # video listed twice is fetched (and its file written) only once.
    video_ids = list(dict.fromkeys([
        "peIQN5jueQ0",
        "46cVVbBGbjY",
        "jIqOI662_dA",
        "_YddTPxw_68",
        "VT82OK6n3jw",
        "FsEEU4p5BZk",
        "0aP3FCNYbVc",
        "Lu8Or-WFZsk",
        "2aAU4m_uoTw",
        "rY_yW8QSSuQ",
        "hGMBe6Agq9c",
        "dXdAxoC4Pcc",
        "gV0fb0MGuH0",
        "a93nBqQJFOU",
        "hGMBe6Agq9c",
        "jIqOI662_dA",
        "hc_fCgoC4no",
        "_p2ihJ81Xno",
        "pdGZZ54rmBM",
        "IE2AeGsBszA",
        "JPJsvIdqSIc",
        "ppfP5ZAO-Hg",
    ]))

    output_folder = "Amharic_comments"

    # One JSONL file per video, named after the video ID.
    for video_id in video_ids:
        print(f"Fetching Amharic comments for video ID: {video_id}")
        amharic_comments = fetch_amharic_comments(video_id, api_key)

        filename = f"comments_{video_id}.jsonl"
        save_comments_to_jsonl(amharic_comments, output_folder, filename)

Fetching Amharic comments for video ID: peIQN5jueQ0
Comments saved to Amharic_comments/comments_peIQN5jueQ0.jsonl
Fetching Amharic comments for video ID: 46cVVbBGbjY
Comments saved to Amharic_comments/comments_46cVVbBGbjY.jsonl
Fetching Amharic comments for video ID: jIqOI662_dA
Comments saved to Amharic_comments/comments_jIqOI662_dA.jsonl
Fetching Amharic comments for video ID: _YddTPxw_68
Comments saved to Amharic_comments/comments__YddTPxw_68.jsonl
Fetching Amharic comments for video ID: VT82OK6n3jw
Comments saved to Amharic_comments/comments_VT82OK6n3jw.jsonl
Fetching Amharic comments for video ID: FsEEU4p5BZk
Comments saved to Amharic_comments/comments_FsEEU4p5BZk.jsonl
Fetching Amharic comments for video ID: 0aP3FCNYbVc
Comments saved to Amharic_comments/comments_0aP3FCNYbVc.jsonl
Fetching Amharic comments for video ID: Lu8Or-WFZsk
Comments saved to Amharic_comments/comments_Lu8Or-WFZsk.jsonl
Fetching Amharic comments for video ID: 2aAU4m_uoTw
Comments saved to Amharic_comments/c

In [None]:
def read_comments_from_folder(folder_path):
    """Load the ``"text"`` field of every record in a folder's JSONL files.

    Files are visited in sorted name order so the result is reproducible
    (``os.listdir`` makes no ordering guarantee). Only names ending in
    ``.jsonl`` are read, and blank lines are skipped instead of being passed
    to ``json.loads``.

    Args:
        folder_path (str): Directory containing the JSONL files.

    Returns:
        list: The ``"text"`` value of each record, in file then line order.
    """
    all_comments = []

    for filename in sorted(os.listdir(folder_path)):
        if not filename.endswith(".jsonl"):  # Process only JSONL files
            continue

        file_path = os.path.join(folder_path, filename)
        print(f"Reading comments from: {file_path}")

        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # e.g. a trailing empty line; not a JSON document
                    continue
                all_comments.append(json.loads(line)["text"])

    return all_comments

In [None]:
# Keywords/phrases that mark a comment as positive.
# dict.fromkeys removes the repeated entries while keeping first-seen order.
positive_keywords = list(dict.fromkeys([
    "ጥሩ", "አሪፍ", "በጣም ጥሩ", "አመሰግናለሁ","ሙዚቃው ያምራል","አስደሳች ታሪክ","ድንቅ ዳይሬክሽን","የማይረሳ","አስደናቂ ትወና",
    # A comma was missing after "መፈቀር", which fused it with the next literal.
    "አስቂኝ","አስደማሚ","ምርጥ","ድንቅ","አስደናቂ","አስደሳች","በርቱ","ፍቅር","በፍቅር","የፍቅር","ማፍቀር","መፈቀር",
    "ጥሩ", "ውብ", "ምርጥ", "ተሳክቷል", "ደስ የሚል", "አስደሳች", "እምነት የሚጣልበት", "ፈጣን", "ቀልጣፋ", "ተመችቷል",
    "ድጋፍ", "ያበረታታል", "ይደግፋል", "እውነት", "ፍትህ", "ሰላም", "ልማት", "ለውጥ", "ጀግና", "አርበኛ",
    "ዴሞክራሲ", "ይሻላል", "ተስፋ", "ድል", "ተስፋ ሰጪ", "አኩሪ", "የሚያነቃቃ", "አስተዋጽኦ", "እድገት", "ብልጽግና",
    "አንድነት", "መተባበር", "ሰላማዊ", "እርቅ", "መቻቻል", "እኩልነት", "ነጻነት", "ልማታዊ", "ተራማጅ", "ድምቀት",
    "ክብር", "አስደናቂ", "ድንቅ", "ጎበዝ", "አስተዋይ", "ትሁት", "ጨዋ", "አሳማኝ", "አነቃቂ", "አርአያ",
    "ተደማጭነት", "ቅንነት", "ትጋት", "አስደሳች", "አስደማሚ", "አስቂኝ", "አስደናጋጭ", "የማይረሳ", "ያምራል", "ይበልጣል",
    "ይፈውሳል", "ያድናል", "ያግዛል", "የተባረከ", "ያበራል", "ያበለጽጋል", "ያከብራል", "ያሞግሳል", "የተመሰገነ",
    "የተወደደ", "የተከበረ", "የተወደሰ", "የከበረ", "ድንቅ ስራ", "ፍጹም", "ድንቅ ችሎታ", "ብሩህ አእምሮ", "ጥሩ ምግባር",
    "መልካም ልብ", "ጽኑ እምነት", "አዎንታዊ", "ቆንጆ", "ሠናይ", "ያማረ", "የከበረ", "ትልቅ", "ታላቅ", "ድንቅ ሐሳብ",
    "ትክክል", "የተሳካ", "ጥሩ ውጤት", "በረከት", "ምስጋና", "ክብር ይሁን", "ይባረክ", "ያድግ", "ይለምልም", "ይበልጽግ",
    "ይዳብር", "ይስፋፋ", "ይስመር", "ይፈጸም", "ይከናወን", "ይፈጸምልን", "ይከናወንልን", "ይሳካልን", "ይስመርልን", "ይዳብርልን",
    "ይለምልምልን", "ይበልጽግልን", "ይስፋፋልን", "የተወደደ ይሁን", "የተከበረ ይሁን", "የተመሰገነ ይሁን", "ያንጸባርቃል", "ያበራልናል", "ያበለጽገናል",
    "ያከብረናል", "ያሞግሰናል", "ያግዘናል", "ያድነናል", "ይፈውሰናል", "ያድጋልናል", "የተባረከ ይሁንልን", "ያማረ ይሁንልን", "ሠናይ ይሁንልን",
    "ቆንጆ ይሁንልን", "አዎንታዊ ይሁንልን", "ብሩህ ይሁንልን", "ፍጹም ይሁንልን", "መልካም ይሁንልን", "ትክክል ይሁንልን", "የተሳካ ይሁንልን", "ታላቅ ይሁንልን",
    "ትልቅ ይሁንልን", "የከበረ ይሁንልን", "ያማረ ሐሳብ", "ድንቅ ችሎታ", "ጥሩ ምግባር", "መልካም ልብ", "ጽኑ እምነት", "ያድናልናል", "ያግዘናልናል",
    "ይፈውሰናልናል", "ያበራልናልናል", "ያበለጽገናልናል", "ያከብረናልናል", "ያሞግሰናልናል", "ይበልጣልናልናል", "ይሻላልናልናል", "ያበረታታልናልናል", "ያነቃቃልናልናል",
    "ያድጋልናልናል", "ይለምልማልናልናል", "ይበልጽጋልናልናል", "ይዳብራልናልናል", "ይስፋፋልናልናል", "ይስመራልናልናል", "ይፈጸማልናልናል", "ይከናወናልናልናል", "ይሳካልናልናል",
    "ይስመራልናልናል", "ይዳብራልናልናል", "ይለምልማልናልናል", "ይበልጽጋልናልናል", "ይስፋፋልናልናል", "ይፈጸማልናልናል", "ይከናወናልናልናል", "ይሳካልናልናል",
    "ይስመራልናልናል", "ይዳብራልናልናል", "ይለምልማልናልናል", "ይበልጽጋልናልናል", "ይስፋፋልናልናል", "ይፈጸማልናልናል", "ይከናወናልናልናል", "ይሳካልናልናል",
    "ይስመራልናልናል", "ይዳብራልናልናል", "ይለምልማልናልናል", "ይበልጽጋልናልናል", "ይስፋፋልናልናል", "ይፈጸማልናልናል", "ይከናወናልናልናል", "ይሳካልናልናል",
    "ይስመራልናልናል", "ይዳብራልናልናል", "ይለምልማልናልናል", "ይበልጽጋልናልናል", "ይስፋፋልናልናል", "ይፈጸማልናልናል", "ይከናወናልናልናል", "ይሳካልናልናል",
    # The trailing empty-string entry was dropped: it is not a keyword.
    "ይስመራልናልናል", "ይዳብራልናልናል", "ይለምልማልናል",
]))
# Keywords/phrases that mark a comment as negative.
# dict.fromkeys removes the repeated entries while keeping first-seen order.
# NOTE(review): "አስቂኝ" is listed here and in positive_keywords; label_comment
# checks the negative list first, so the negative label wins — confirm intent.
negative_keywords = list(dict.fromkeys([
    # "ደካማ" "ታሪክ" had no comma and were silently concatenated into one
    # spaceless token; written as a two-word phrase like the neighbouring
    # entries. NOTE(review): confirm a phrase (not two keywords) was intended.
    "መጥፎ","እየዘረፈ","ዘረፈ","ገደለ","እየገደለ", "አልወደድኩም", "አስቸጋሪ", "አልቻልኩም","አሰልቺ","የሚያበሳጭ","ደካማ ታሪክ",
    "አስመሳይ ትወና","አስከፊ ዳይሬክሽን","የማይረባ","አሳዛኝ","ጊዜ ማባከን","ድክመት","አስፈሪ","ይረሸኑ","መስረቅ","ሰረቀ","በሰበሰ","መበሰበሰ","አበሰበሰ",
    "መጥፎ", "አስከፊ", "ዘገየ", "ችግር", "ተበላሽቷል", "አልተሳካም", "አሳዛኝ", "ያሳዝናል", "አይደለም", "አልወደድኩትም",
    "ተቃውሞ", "ይቃወማል", "ውሸት", "ግፍ", "ጦርነት", "ድህነት", "ሙስና", "አምባገነን", "ጨቋኝ", "ዘረኝነት",
    "ትርምስ", "ይከፋል", "ተስፋ መቁረጥ", "ውድቀት", "አደጋ", "ግጭት", "ሽብር", "አመጽ", "ዝርፊያ", "ግድያ",
    "እልቂት", "ዘር ማጥፋት", "ወንጀል", "ብጥብጥ", "አለመረጋጋት", "አፈና", "አድልዎ", "አድማ", "ትችት", "ውግዘት",
    "ውርደት", "ኋላቀርነት", "ትዕቢተኛ", "እብሪተኛ", "አላዋቂ", "ደካማ", "አሰልቺ", "አሳፋሪ", "ውሸታም", "አጭበርባሪ",
    "ትችት የሚበዛበት", "ጉድለት", "ቸልተኛ", "የሚያበሳጭ", "አስመሳይ", "አስፈሪ", "የማይረባ", "ጊዜ ማባከን", "ድክመት", "ይጎዳል",
    "ያጠፋል", "ይገድላል", "ይጎዳልናል", "ያጠፋልናል", "ይገድላልናል", "ይጎዳልናልናል", "ያጠፋልናልናል", "ይገድላልናልናል", "ጭካኔ", "ክፋት",
    "መጥፎ ዕድል", "መከራ", "ሐዘን", "ቁጣ", "ጥላቻ", "ንቀት", "ንቀት የተሞላበት", "ተንኮል", "ተንኮለኛ", "ሐሰት",
    "ውሸት የተሞላበት", "ማታለል", "ማጭበርበር", "አሳሳች", "አታላይ", "አታላይነት", "አሳሳችነት", "አታላይ ተግባር", "አሳሳች ተግባር",
    "አሳፋሪ ተግባር", "አሳዛኝ ሁኔታ", "አሳዛኝ ዜና", "አስጨናቂ", "አስጨናቂ ሁኔታ", "አስጨናቂ ዜና", "ያሳዝናልናል",
    "ተወው","ተወኝ","መተው","አስጸያፊ","አስከፊ","አስከፊነት","አስቸጋሪ","አስቂኝ","አስቂኝነት","ጥፋት","አስጸያፊ","ክስ መነሳት",
    "ክስ","እርግማን","ህመም","ህመም ያለው","ጉስቁልኝ","ጉስቁልነት","ጠላት","ጠላኝ","ችግር","መከራ","መከራና ችግር","መከራ አስከባሪ",
    "አስቸጋሪ","አስቸጋሪነት","አስቸጋሪ አመለካከት","አስቸጋሪነት","አስቸጋሪ ሁኔታ","ማስፈራራት","ማስቀመጥ","ቁጣ","ቁጣማ","ማስቆጣት",
    "አስቆጣሪነት","አስቆጣሪ","ጠላኝነት","ጠላትነት","ጠላኝነት ያለው","ጭንቀት","ጭንቀት ያለው","አስፈሪ","አስቀራርቅ","አስቀራርቅ ያለው",
    "ኩራት","ኩራትነት","አሳፋሪ","አሳፋሪነት","አስፈሪ","አስፈሪነት","እምቢነት", "እምነት የሌለው", "ውሸታም", "ውሸት", "ስህተት",
    "ራብ", "ድካም", "ፍርሃት", "ፍርሃታማ", "ደካማ", "ጠብ", "ቆሻሻ", "ጉድለት ያለው", "ሞኝ", "ሞኝነት", "መከልከል", "የተከለከለ",
    "ማስታወስ የማይችል", "ተወላለፈ", "ስብራት", "ቀጭን", "ማጭበርበር", "ፍርሃት", "ማስፈራራት", "የተፈራ", "ሚያስፈራ", "አስቀያሚ", "ማስቸገር",
]))

In [None]:
# Ethiopic punctuation (U+1360..U+1368, e.g. "።", "፣", "፡") plus the ASCII
# punctuation ranges; these are treated as word separators when matching.
_PUNCTUATION_PATTERN = re.compile(r"[\u1360-\u1368!-/:-@\[-`{-~]")


def label_comment(comment, verbose=True):
    """Label a comment as positive, negative, or neutral based on keywords.

    Punctuation in the comment is replaced by spaces and whitespace runs are
    collapsed, then each keyword is looked up as a whole space-delimited
    token or phrase. ``negative_keywords`` is checked before
    ``positive_keywords``, so a comment containing both is labeled negative.
    Keywords are matched as written, so they are expected to contain no
    punctuation themselves.

    Args:
        comment (str): Comment text.
        verbose (bool): When true (the default), print which keyword matched,
            or that none did.

    Returns:
        str: ``"negative"``, ``"positive"`` or ``"neutral"``.
    """
    # Without this normalisation a keyword directly followed by punctuation
    # (e.g. "ጥሩ።") is never found by the space-delimited lookup below.
    normalized = " ".join(_PUNCTUATION_PATTERN.sub(" ", comment).split())
    padded = f" {normalized} "

    # Check for negative keywords first
    for word in negative_keywords:
        # Skip empty keywords: they would match any empty comment.
        if word and f" {word} " in padded:
            if verbose:
                print(f"Matched negative keyword: {word} in comment: {comment}")
            return "negative"

    # Check for positive keywords
    for word in positive_keywords:
        if word and f" {word} " in padded:
            if verbose:
                print(f"Matched positive keyword: {word} in comment: {comment}")
            return "positive"

    # If no keywords match, label as neutral
    if verbose:
        print(f"No keywords matched for comment: {comment}")
    return "neutral"

In [None]:
def save_labeled_data_to_jsonl(labeled_data, output_file):
    """
    Write labeled records to a JSON Lines file, one JSON object per line.

    The file is overwritten and written as UTF-8 with non-ASCII characters
    left unescaped.

    Args:
        labeled_data (list): A list of dictionaries containing labeled comments.
        output_file (str): The path to the output JSONL file.
    """
    with open(output_file, "w", encoding="utf-8") as sink:
        sink.writelines(
            f"{json.dumps(entry, ensure_ascii=False)}\n" for entry in labeled_data
        )
    print(f"Labeled data saved to {output_file}")

In [None]:
if __name__ == "__main__":
    # Folder containing the JSONL files. Relative, like the folder name used
    # by the scraping cell, so the notebook does not depend on being run from
    # Colab's /content directory.
    folder_path = "Amharic_comments"

    # Read comments from the folder
    comments = read_comments_from_folder(folder_path)

    # Label the comments
    labeled_data = [{"comment": comment, "sentiment": label_comment(comment)} for comment in comments]

    # Show a short preview only: printing every record produces more output
    # than the notebook front end keeps.
    preview_count = 10
    for data in labeled_data[:preview_count]:
        print(data)
    print(f"Showing {min(preview_count, len(labeled_data))} of {len(labeled_data)} labeled comments")

    # Save the labeled data to a JSONL file
    output_file = "labeled_comments.jsonl"  # Replace with your desired output file path
    save_labeled_data_to_jsonl(labeled_data, output_file)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
{'comment': 'ደስ ብሎኛል', 'sentiment': 'neutral'}
{'comment': 'ወቸው ጉድ እዱሉን ብትሰጡኝ ደስ ይለኛል በጣም ፍላጎት አለኝ ለመማር', 'sentiment': 'neutral'}
{'comment': 'በጣም መማር አፈልጋለሁ ፡፡ለተሻለ የስራ ጥራት ለመስራት ስለፈለግኩ ነው፡፡አክባሪያችሁም ነኝ፡፡', 'sentiment': 'neutral'}
{'comment': '@80,000 የቲክቶክ አለኝ በዚህ ላይ ስልጠና ብጨምርበት የተሻለ ስራ እሰራለሁ። የወቸው ነኝ', 'sentiment': 'neutral'}
{'comment': 'ስሜታዊ አትሁን አቶ እሸቱ', 'sentiment': 'neutral'}
{'comment': '&#39; እና ሮቤል ይልቃል በማንኛውም አጋጣሚ ክፍተት ቢፈጥር', 'sentiment': 'neutral'}
{'comment': 'እናቶችን ልጆቻቸውን በቀላሉ በቤት ውስጥ የሚያስተምሩበት ለመስራት ለመጀመር ዝግጅት ላይ ነኝ እና እድሉ ለኔ ቢሰጠኝ ይጠቅመኛል አመሰግናለሁ', 'sentiment': 'positive'}
{'comment': 'ወቸው', 'sentiment': 'neutral'}
{'comment': 'ስራ ያስፈልገኛል ለስራ ደሞ እወቀት ያስፈልጋል ለዕዉቀት ደሞ መማር ከዛ እራስን ችሎ መኖር እየሰሩ የደከመዉ ሰዉ ኮመንት', 'sentiment': 'neutral'}
{'comment': 'እሸ የእዉነት ሰዉ ሰላም ይብዛ ብሮ', 'sentiment': 'positive'}
{'comment': 'እሼ በጌታ እኔን አግዙኝ በጣም ያስፈልገኛል የ ተማሪ ነኝ በአሁኑ ሰአት የግድ ያስፈልገኛል ገንዘብ መስራት ይኖርብኛል እሼ በጌታ ይሆንን እድል ለኔ ስጡኝ እየተማርኩም በ

## Labeling the dataset with a pre-trained sentiment model

In [None]:


# Function to save labeled data to a JSONL file
def save_labeled_data_to_jsonl(labeled_data, output_file):
    """
    Dump labeled records to ``output_file`` in JSON Lines format.

    Re-declares the function of the same name defined earlier in the
    notebook, with the same behavior: the file is overwritten, encoded as
    UTF-8, and each record becomes one line of unescaped JSON.

    Args:
        labeled_data (list): A list of dictionaries containing labeled comments.
        output_file (str): The path to the output JSONL file.
    """
    with open(output_file, "w", encoding="utf-8") as handle:
        for record in labeled_data:
            # print() appends the same "\n" terminator that write() was given.
            print(json.dumps(record, ensure_ascii=False), file=handle)
    print(f"Labeled data saved to {output_file}")

# Main pipeline
if __name__ == "__main__":
    # Path to the folder containing JSONL files
    folder_path = "Amharic_comments"  # Replace with the actual folder path

    # Read comments from the folder
    comments = read_comments_from_folder(folder_path)

    # Load a pre-trained sentiment analysis pipeline.
    # NOTE(review): confirm this checkpoint's label set and language coverage
    # are suitable for Amharic text before relying on the labels.
    sentiment_analyzer = pipeline("sentiment-analysis", model="nlptown/bert-base-multilingual-uncased-sentiment")

    # Perform sentiment analysis
    labeled_data = []
    for comment in comments:
        cleaned_comment = clean_comment(comment)
        # truncation=True is forwarded to the tokenizer so that a very long
        # comment is cut to the model's maximum input length instead of
        # failing inside the model.
        sentiment = sentiment_analyzer(cleaned_comment, truncation=True)[0]
        labeled_data.append({"comment": comment, "sentiment": sentiment["label"]})

    # Show a short preview only: printing every record floods the cell output.
    preview_count = 10
    for data in labeled_data[:preview_count]:
        print(data)
    print(f"Showing {min(preview_count, len(labeled_data))} of {len(labeled_data)} labeled comments")

    # Save the labeled data to a JSONL file.
    # NOTE(review): this is the same path the keyword-labeling cell writes to,
    # so running both cells overwrites the earlier result — confirm intent.
    output_file = "labeled_comments.jsonl"  # Replace with your desired output file path
    save_labeled_data_to_jsonl(labeled_data, output_file)