In [12]:
import os
import time

import pandas as pd
from googleapiclient.discovery import build

# Initialize the YouTube Data API v3 client.
# The API key is read from the YOUTUBE_API_KEY environment variable instead of
# being hard-coded, so it is not stored in (or shared with) the notebook.
# A missing variable raises KeyError here, before any request is made.
youtube = build('youtube', 'v3', developerKey=os.environ['YOUTUBE_API_KEY'])

def split_list(lst, n):
    """Yield consecutive slices of ``lst`` holding at most ``n`` items each.

    The last slice is shorter when ``len(lst)`` is not a multiple of ``n``;
    an empty ``lst`` yields nothing.
    """
    offsets = range(0, len(lst), n)
    yield from (lst[offset:offset + n] for offset in offsets)

def fetch_replies(comment_id, video_id):
    """Collect the replies posted under one top-level comment.

    Pages through ``youtube.comments().list`` with ``parentId=comment_id``
    (50 results per page), sleeping one second after each page. If anything
    raises, the error is printed and the replies gathered so far are returned.

    Args:
        comment_id: ID of the parent comment; stored in each row as
            ``comment_id``.
        video_id: ID of the video; stored in each row as ``video_id``.

    Returns:
        A list of dicts with the keys ``video_id``, ``comment_id``,
        ``reply_id``, ``text``, ``author``, ``published_at`` and
        ``like_count``.
    """
    # (output column, key in the API snippet), in output order.
    snippet_fields = (
        ("text", "textDisplay"),
        ("author", "authorDisplayName"),
        ("published_at", "publishedAt"),
        ("like_count", "likeCount"),
    )
    collected = []
    try:
        page_request = youtube.comments().list(part="snippet", parentId=comment_id, maxResults=50)
        while page_request:
            page = page_request.execute()
            for entry in page.get("items", []):
                snippet = entry["snippet"]
                row = {"video_id": video_id, "comment_id": comment_id, "reply_id": entry["id"]}
                for column, api_key in snippet_fields:
                    row[column] = snippet[api_key]
                collected.append(row)
            # list_next returns None once there are no further pages.
            page_request = youtube.comments().list_next(page_request, page)
            time.sleep(1)
    except Exception as e:
        print(f"Error fetching replies for comment {comment_id}: {e}")
    return collected

def fetch_comments(video_id):
    """Collect the top-level comments of a video together with their replies.

    Pages through ``youtube.commentThreads().list`` (100 threads per page),
    sleeping one second after each page. For every thread whose
    ``totalReplyCount`` is positive, the rows returned by ``fetch_replies``
    are appended directly after the thread's own row. If anything raises
    (for example when the API rejects the request), the error is printed and
    the rows gathered so far are returned.

    Args:
        video_id: ID of the video whose comments are fetched.

    Returns:
        A list of dicts. Top-level rows carry ``video_id``, ``comment_id``,
        ``text``, ``author``, ``published_at``, ``like_count`` and
        ``reply_count``; reply rows have the shape produced by
        ``fetch_replies``.
    """
    # (output column, key in the top-level comment snippet), in output order.
    snippet_fields = (
        ("text", "textDisplay"),
        ("author", "authorDisplayName"),
        ("published_at", "publishedAt"),
        ("like_count", "likeCount"),
    )
    rows = []
    try:
        page_request = youtube.commentThreads().list(part="snippet", videoId=video_id, maxResults=100)
        while page_request:
            page = page_request.execute()
            for thread in page.get("items", []):
                top_level = thread["snippet"]["topLevelComment"]["snippet"]
                row = {"video_id": video_id, "comment_id": thread["id"]}
                for column, api_key in snippet_fields:
                    row[column] = top_level[api_key]
                reply_total = thread["snippet"]["totalReplyCount"]
                row["reply_count"] = reply_total
                rows.append(row)
                if reply_total > 0:
                    rows.extend(fetch_replies(thread["id"], video_id))
            # list_next returns None once there are no further pages.
            page_request = youtube.commentThreads().list_next(page_request, page)
            time.sleep(1)
    except Exception as e:
        print(f"Error fetching comments for video {video_id}: {e}")
    return rows

def fetch_video_details(video_ids):
    """Fetch the ``snippet`` resource for each of the given video IDs.

    The IDs are sent in batches of 50, comma-joined into a single ``id``
    parameter per ``youtube.videos().list`` call.

    Args:
        video_ids: list of video ID strings.

    Returns:
        A list with the ``items`` of every response, in request order.
    """
    details = []
    for batch in split_list(video_ids, 50):
        listing = youtube.videos().list(part="snippet", id=','.join(batch)).execute()
        details += listing.get('items', [])
    return details

def search_videos(query="تأثير الذكاء الاصطناعي على فرص العمل", max_results=100):
    """Search YouTube for videos matching ``query`` and return them as a frame.

    Pages through ``youtube.search().list`` (at most 50 results per page)
    until ``max_results`` video IDs are collected or the pages run out, then
    looks the IDs up with ``fetch_video_details``. Every video that the lookup
    returns gets its ``category_name`` set to the fixed label "اجتماعي";
    the others keep the placeholder "Fetching...".

    Args:
        query: search string passed to the API as ``q``.
        max_results: upper bound on the number of videos to collect.

    Returns:
        A DataFrame with the columns ``id``, ``title`` and ``category_name``.
        If any exception is raised, the error is printed and an empty
        DataFrame (with no columns) is returned instead.
    """
    try:
        videos = []
        video_ids = []
        request = youtube.search().list(part="id,snippet", q=query, maxResults=min(max_results, 50), type="video")

        while request and len(video_ids) < max_results:
            response = request.execute()
            for item in response.get('items', []):
                video_id = item['id']['videoId']
                video_title = item['snippet']['title']
                video_ids.append(video_id)
                videos.append({"id": video_id, "title": video_title, "category_name": "Fetching..."})
                if len(video_ids) >= max_results:
                    break
            request = youtube.search().list_next(request, response)

        print(f"Found {len(video_ids)} videos.")

        if video_ids:
            # Index the returned IDs once: membership in a set replaces a
            # linear scan of the details list for every video.
            found_ids = {detail['id'] for detail in fetch_video_details(video_ids)}
            for video in videos:
                if video['id'] in found_ids:
                    video['category_name'] = "اجتماعي"

        return pd.DataFrame(videos)

    except Exception as e:
        print(f"Error searching videos: {e}")
        return pd.DataFrame()

def fetch_all_comments(video_ids, target_count=6000):
    """Gather de-duplicated comments across videos until a target is reached.

    Videos are processed in order. A comment is kept only the first time its
    ``(text, author, published_at)`` triple is seen. Progress is printed
    before and after each video, and the loop stops after the first video
    that brings the total to at least ``target_count``, so the result may
    exceed the target.

    Args:
        video_ids: video IDs to fetch comments for.
        target_count: number of unique comments at which to stop.

    Returns:
        The list of unique comment dicts in the order they were collected.
    """
    unique_rows = []
    fingerprints = set()

    for position, video_id in enumerate(video_ids, start=1):
        print(f"Fetching comments for video {position}/{len(video_ids)}: {video_id}")

        for row in fetch_comments(video_id):
            fingerprint = (row["text"], row["author"], row["published_at"])
            if fingerprint in fingerprints:
                continue
            fingerprints.add(fingerprint)
            unique_rows.append(row)

        print(f"Collected {len(unique_rows)} unique comments so far.")
        if len(unique_rows) >= target_count:
            break

    return unique_rows

# Run the full process: search for videos, collect their comments, export.
video_ids_df = search_videos(max_results=100)

# search_videos() returns an empty, column-less DataFrame when it fails or
# finds nothing, so test for that *before* indexing 'id'; indexing first would
# raise KeyError and the intended ValueError would never be reached.
if video_ids_df.empty:
    raise ValueError("Failed to retrieve video IDs.")

video_ids = video_ids_df['id'].tolist()

comments_data = fetch_all_comments(video_ids, target_count=6000)

# An empty list would give a frame without the columns that drop_duplicates
# needs; fail with a clear message instead.
if not comments_data:
    raise ValueError("No comments were collected.")

# De-duplicate on the same key used while collecting, then cap at 6000 rows.
df_comments = (
    pd.DataFrame(comments_data)
    .drop_duplicates(subset=["text", "author", "published_at"])
    .head(6000)
)
df_comments.to_excel("dataset.xlsx", index=False)
print("✅ Finished. Dataset saved as dataset.xlsx with", len(df_comments), "unique comments.")


Found 100 videos.
Fetching comments for video 1/100: rOcT2cySbdk
Collected 0 unique comments so far.
Fetching comments for video 2/100: i42gPtjXnKU
Collected 59 unique comments so far.
Fetching comments for video 3/100: c6hQ9DcNPyA
Collected 59 unique comments so far.
Fetching comments for video 4/100: GWnV2zUtIrQ
Collected 60 unique comments so far.
Fetching comments for video 5/100: TJCnKwSc34M
Collected 68 unique comments so far.
Fetching comments for video 6/100: 3Py4uJ6BTP8
Collected 71 unique comments so far.
Fetching comments for video 7/100: F7kvnJwe_5Y
Collected 71 unique comments so far.
Fetching comments for video 8/100: EYDTuI1grp8
Collected 81 unique comments so far.
Fetching comments for video 9/100: YTJ8HQJLf7Q
Collected 81 unique comments so far.
Fetching comments for video 10/100: ff0x8u9s3No
Collected 81 unique comments so far.
Fetching comments for video 11/100: M6Wk3Yj6MU8
Collected 85 unique comments so far.
Fetching comments for video 12/100: CyqFqEqXkXk
Collected



Collected 517 unique comments so far.
Fetching comments for video 36/100: AeQZuT8XqN8
Error fetching comments for video AeQZuT8XqN8: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId=AeQZuT8XqN8&maxResults=100&key=***************************************&alt=json returned "The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.". Details: "[{'message': 'The video identified by the <code><a href="/youtube/v3/docs/commentThreads/list#videoId">videoId</a></code> parameter has disabled comments.', 'domain': 'youtube.commentThread', 'reason': 'commentsDisabled', 'location': 'videoId', 'locationType': 'parameter'}]">
Collected 517 unique comments so far.
Fetching comments for video 37/100: 3ycGxnt1ofM
Collected 528 unique comments so far.
Fetching comments for video 38/100: KXUaqM6enhI
Collected 528 unique comments so far.
Fetching comments for video 

In [13]:
# Work on an independent copy: a plain `df = df_comments` would only alias the
# frame, so in-place edits made through `df` would also alter df_comments.
df = df_comments.copy()
df.head()

Unnamed: 0,video_id,comment_id,text,author,published_at,like_count,reply_count,reply_id
0,i42gPtjXnKU,Ugx_Q8PlXv97CVWcMIx4AaABAg,الموسيقى أعلى من صوت المذيعة,@ahmedmohsen6737,2025-05-15T10:29:22Z,0,0.0,
1,i42gPtjXnKU,UgyGhgu6v4J4CRpWAH14AaABAg,علشان الذكاء الإصطناعي يحل محل الإنسان لازم يب...,@LionheartedProGamer,2025-04-18T18:22:41Z,0,0.0,
2,i42gPtjXnKU,Ugx03OLTGdO__aX7QrJ4AaABAg,هل الذكاء الإصطناعي يمكنه التغلب على سوق الدعا...,@Qusaivonscham,2025-04-12T03:55:07Z,0,0.0,
3,i42gPtjXnKU,UgwFLW4Mkaw3CLt7hRV4AaABAg,كم بعد ابتعدت عن الله بسبب عرض النساء المتبرجا...,@عليمهديحسين-ت5ح,2025-03-20T19:46:41Z,0,0.0,
4,i42gPtjXnKU,UgyH3VDFYSrXp6xp0pZ4AaABAg,هذا ما يسمى صناعة المستبل وتوجيهه حسب رغبات ال...,@Azawadtube,2025-03-09T10:58:44Z,0,0.0,


In [14]:
# Keep only the comment text. Selecting the column (instead of dropping the
# others in place) makes this cell safe to re-run and does not fail when a
# metadata column such as 'reply_id' is absent because no replies were fetched.
df = df[['text']].copy()

In [15]:
# Preview the first rows of df again.
df.head()

Unnamed: 0,text
0,الموسيقى أعلى من صوت المذيعة
1,علشان الذكاء الإصطناعي يحل محل الإنسان لازم يب...
2,هل الذكاء الإصطناعي يمكنه التغلب على سوق الدعا...
3,كم بعد ابتعدت عن الله بسبب عرض النساء المتبرجا...
4,هذا ما يسمى صناعة المستبل وتوجيهه حسب رغبات ال...


In [16]:
# Preview the last rows of df.
df.tail()

Unnamed: 0,text
5995,انت كده عسل وحلو يعني
5996,انا شفت نفس المقطع فين قبل كده🤔<br>كل صناع الم...
5997,@naiseok2875 <br>سواء هي ولا هو مش هتفرق لان ا...
5998,لسه كنت هقول كده
5999,كلامك صح جدا


# **Extracts Arabic words from each text in the DataFrame by keeping only Arabic characters.**

In [17]:
import re
def extract_arabic(text):
    """Return the Arabic-block runs of ``text`` joined by single spaces.

    Every maximal run of characters in the range U+0600–U+06FF is kept;
    whatever lies between the runs (Latin letters, ASCII digits, emoji,
    markup such as ``<br>``) is dropped. Non-string input gives ``""``.
    """
    if isinstance(text, str):
        return ' '.join(re.findall(r'[\u0600-\u06FF]+', text))
    return ""
# Replace each comment with its Arabic-only version (element-wise).
df['text'] = df['text'].map(extract_arabic)


In [18]:
# Summarise the index, columns, non-null counts and dtypes of df.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    6000 non-null   object
dtypes: object(1)
memory usage: 47.0+ KB


# **Removes rows whose Arabic text contains only one word, to avoid keeping very short or possibly offensive comments (this is our choice).**

In [19]:
# Keep only rows whose text has more than one whitespace-separated word.
# (The comparison threshold was missing, which made this line a SyntaxError.)
df = df[df['text'].str.split().str.len() > 1]


In [20]:
# Check how many rows remain after the word-count filter.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5262 entries, 0 to 5999
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5262 non-null   object
dtypes: object(1)
memory usage: 82.2+ KB


# **EXPORTING**

In [23]:
# Write the cleaned text to an Excel file; index=False leaves the row index
# out of the sheet.
df.to_excel('cleaned_dataset.xlsx', index=False)

# **MACHINE LEARNING !!!**

**Part I - Data Loading and Preprocessing✅**

**Part II - Train-Test Split & TF-IDF Vectorization**

In [24]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# 📥 Load Dataset

In [28]:
# NOTE(review): this reads "cleaned_dataset (1).xlsx", not the
# "cleaned_dataset.xlsx" written by the export cell, and the loaded frame has
# a `label` column that no earlier cell creates. Presumably the export was
# labelled by hand outside the notebook; confirm and document the provenance.
df = pd.read_excel("cleaned_dataset (1).xlsx")


In [29]:
# Preview the first rows of the loaded dataset.
df.head()

Unnamed: 0,text,label
0,الموسيقى أعلى من صوت المذيعة,neutral
1,علشان الذكاء الإصطناعي يحل محل الإنسان لازم يب...,neutral
2,هل الذكاء الإصطناعي يمكنه التغلب على سوق الدعا...,neutral
3,كم بعد ابتعدت عن الله بسبب عرض النساء المتبرجا...,negative
4,هذا ما يسمى صناعة المستبل وتوجيهه حسب رغبات ال...,negative


# 🏷️ Encode labels

In [30]:
# Learn the label classes first, then store each row's integer code in a new
# column; label_encoder.classes_ keeps the code -> label mapping for reports.
label_encoder = LabelEncoder().fit(df['label'])
df['label_encoded'] = label_encoder.transform(df['label'])

# 📊 Train-Test Split

In [31]:
# Hold out 20% of the rows for testing; random_state=42 fixes the shuffle.
# NOTE(review): no `stratify=` argument is passed, so the class proportions of
# the train and test parts are not forced to match; consider stratifying on
# the labels if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label_encoded'], test_size=0.2, random_state=42
)

# ⚠️ Vectorization

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF over unigrams and bigrams, capped at 7000 features. The vectorizer is
# fitted on the training texts only and then applied unchanged to the test
# texts.
# NOTE(review): X_train / X_test are overwritten with the TF-IDF matrices, so
# re-running this cell without re-running the split cell would feed those
# matrices (not raw text) back into the vectorizer.
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=7000)
X_train = tfidf.fit_transform(X_train)
X_test = tfidf.transform(X_test)

# **Part III - Model Training & Evaluation**

In [33]:
# Candidate classifiers to compare.
nb_model = GaussianNB()
nn_model = MLPClassifier(hidden_layer_sizes=(100,), activation='logistic', solver='adam', max_iter=500)
linear_svm_model = SVC(kernel='linear')
rbf_svm_model = SVC(kernel='rbf')
poly_svm_model = SVC(kernel='poly', degree=2)
sgd_svm_model = SVC(kernel='sigmoid')  # despite the name, an SVC with a sigmoid kernel

# Naive Bayes needs dense input; the other models are fed the TF-IDF matrices
# as they are.
X_train_dense = X_train.toarray()
X_test_dense = X_test.toarray()

# Training: Naive Bayes first, then the remaining models in declaration order.
nb_model.fit(X_train_dense, y_train)
for sparse_model in (nn_model, linear_svm_model, rbf_svm_model, poly_svm_model, sgd_svm_model):
    sparse_model.fit(X_train, y_train)

# Prediction on the held-out set.
nb_pred = nb_model.predict(X_test_dense)
nn_pred, linear_svm_pred, rbf_svm_pred, poly_svm_pred, sgd_svm_pred = (
    fitted.predict(X_test)
    for fitted in (nn_model, linear_svm_model, rbf_svm_model, poly_svm_model, sgd_svm_model)
)

# Evaluation: one classification report per model, labelled with the original
# class names.
for heading, predictions in (
    ("Naive Bayes:\n", nb_pred),
    ("Neural Net:\n", nn_pred),
    ("Linear SVM:\n", linear_svm_pred),
    ("RBF SVM:\n", rbf_svm_pred),
    ("Poly SVM:\n", poly_svm_pred),
    ("Sigmoid SVM:\n", sgd_svm_pred),
):
    print(heading, classification_report(y_test, predictions, target_names=label_encoder.classes_))


Naive Bayes:
               precision    recall  f1-score   support

    negative       0.66      0.55      0.60       477
     neutral       0.28      0.45      0.34       177
    positive       0.78      0.73      0.75       470

    accuracy                           0.61      1124
   macro avg       0.57      0.57      0.56      1124
weighted avg       0.65      0.61      0.62      1124

Neural Net:
               precision    recall  f1-score   support

    negative       0.70      0.73      0.72       477
     neutral       0.39      0.42      0.40       177
    positive       0.82      0.76      0.79       470

    accuracy                           0.69      1124
   macro avg       0.64      0.64      0.64      1124
weighted avg       0.70      0.69      0.70      1124

Linear SVM:
               precision    recall  f1-score   support

    negative       0.66      0.89      0.76       477
     neutral       0.54      0.28      0.37       177
    positive       0.89      0.73  

# **Saving the Linear SVM model (it has the highest score) ✅**

In [35]:
# Persist the selected classifier.
with open("bestModel.pickle", "wb") as f:
    pickle.dump(linear_svm_model, f)

# The classifier was trained on TF-IDF features and integer-encoded labels, so
# on its own it cannot score raw text or return label names. Persist the
# fitted vectorizer and label encoder next to it so the model can be reused.
with open("bestModel_preprocessing.pickle", "wb") as f:
    pickle.dump({"tfidf": tfidf, "label_encoder": label_encoder}, f)

print("Best model (Linear SVM) pipeline saved as bestModel.pickle")
print("Fitted TF-IDF vectorizer and label encoder saved as bestModel_preprocessing.pickle")

Best model (Linear SVM) pipeline saved as bestModel.pickle
