In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Arabic tweet classification (normal / abusive / hate) with TF-IDF and an SVM

In [2]:
# Download the NLTK resources used below: the Punkt tokenizer data needed by
# word_tokenize and the stop-word lists.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from the 'punkt_tab' package instead
# of the pickled 'punkt' models, so fetch it as well; without it word_tokenize
# raises LookupError on a fresh environment. Older releases just leave it unused.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# Read the train and test CSV files into their own DataFrames.
dfTrain, dfTest = (
    pd.read_csv(csv_path)
    for csv_path in ("/content/train.csv", "/content/test.csv")
)

In [None]:
# Preview the training frame.
# NOTE: the binary "Word Class" encoding below (1 = normal, 0 = anything else)
# is disabled; the later cells use the original string labels in "Class".
#dfTrain["Word Class"] = (dfTrain["Class"] == "normal").astype(int)
#dfTest["Word Class"] = (dfTest["Class"] == "normal").astype(int)
dfTrain

Unnamed: 0,Tweet,Class
0,وزير الخارجية اللبناني جبران باسيل قال في سلسل...,normal
1,سورية بلد الحضارات تربطها بعلية او بحيوان,normal
2,اخي الحاج اذا شعرت انك محرجا من الانتقادات لتص...,normal
3,ما فيك تعيش بلا ما تكب فتن ليل نهار وبكرة قلهم...,normal
4,هذا البطل الذي قاتل وجاذف بحياته لتحيا انت يا ...,abusive
...,...,...
4671,كول هوا مرة تانيي وحلوا عن طيزو وطيزنا ومقلعين...,abusive
4672,رئيس روحي؟ تروح روحك انت وكل مين شدّ عمشدّك مش...,abusive
4673,إذا أرادت إسرائيل أن تضمن أمنها وهو حق عليها ا...,normal
4674,خليك بحالك يا نعيمي على أساس أنت مش مرتزق و طب...,abusive


In [None]:
# Arabic stop-word list from NLTK, held in a set for fast membership tests.
stop_words = set(stopwords.words('arabic'))


def drop_stop_words(text):
    """Tokenize ``text`` and return the tokens that are not stop words.

    Tokens are lower-cased only for the membership test against
    ``stop_words``; the returned tokens keep their original form.
    """
    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() not in stop_words:
            kept_tokens.append(token)
    return kept_tokens


# Replace every tweet with its list of remaining tokens (train and test alike).
dfTrain['Tweet'] = dfTrain['Tweet'].apply(drop_stop_words)
dfTest['Tweet'] = dfTest['Tweet'].apply(drop_stop_words)

In [None]:
# Collapse each token list back into a single space-separated string,
# which is the input format the TF-IDF vectorizer expects.
dfTrain['Tweet'] = dfTrain['Tweet'].apply(' '.join)
dfTest['Tweet'] = dfTest['Tweet'].apply(' '.join)

In [None]:
# Features are the cleaned tweet strings; targets are the labels in "Class".
x_train, y_train = dfTrain['Tweet'], dfTrain['Class']
x_test, y_test = dfTest['Tweet'], dfTest['Class']

In [None]:
# Create a TF-IDF vectorizer over unigrams and bigrams (ngram_range=(1, 2)).
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
# Fit the vocabulary and IDF weights on the training tweets only, then reuse
# the fitted vectorizer on the test tweets so both share one feature space.
x_train_vectorized = vectorizer.fit_transform(x_train)
x_test_vectorized = vectorizer.transform(x_test)

In [None]:
# Candidate SVC hyperparameters for the grid search: regularization strength,
# kernel coefficient, and kernel type.
param_grid = dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf', 'poly', 'sigmoid'],
)

In [None]:
# The grid search is kept for reference but left disabled so the notebook does
# not repeat it on every run:
#   grid_search = GridSearchCV(SVC(), param_grid, cv=5)
#   grid_search.fit(x_train_vectorized, y_train)
# It was run once; the hyperparameters hard-coded below are the ones it selected.
best_model = SVC(kernel='sigmoid', gamma=1, C=10).fit(x_train_vectorized, y_train)

# Show the complete parameter set of the fitted estimator (defaults included).
print("Best parameters: ", best_model.get_params())

Best parameters:  {'C': 10, 'break_ties': False, 'cache_size': 200, 'class_weight': None, 'coef0': 0.0, 'decision_function_shape': 'ovr', 'degree': 3, 'gamma': 1, 'kernel': 'sigmoid', 'max_iter': -1, 'probability': False, 'random_state': None, 'shrinking': True, 'tol': 0.001, 'verbose': False}


In [None]:
# Label the test tweets with the fitted SVC.
y_pred = best_model.predict(x_test_vectorized)

# Per-class precision / recall / F1 of the predictions against the true labels.
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

     abusive       0.69      0.64      0.66       346
        hate       0.65      0.32      0.43        94
      normal       0.83      0.91      0.86       730

    accuracy                           0.78      1170
   macro avg       0.72      0.62      0.65      1170
weighted avg       0.77      0.78      0.77      1170



In [None]:
# Text to classify; put the tweet between the quotes.
new_text = ""

# Apply the same cleaning as the training data: tokenize, drop stop words,
# and rebuild one space-separated string.
new_text = ' '.join(
    word for word in word_tokenize(new_text) if word.lower() not in stop_words
)

# Project onto the already-fitted TF-IDF vocabulary (transform only, no refit).
new_text_vectorized = vectorizer.transform([new_text])

# predict() returns an array holding one label per input row.
prediction = best_model.predict(new_text_vectorized)

# The model was trained on the string labels, so the prediction already is the
# class name; take the single element as a plain string.
class_label = str(prediction[0])

print("Predicted class label:", class_label)

Predicted class label: normal


In [None]:
# Load the tweets to label; the file must provide a 'text' column.
tweet_df = pd.read_csv('tweets.csv')


def _clean_tweet(text):
    """Tokenize ``text`` and re-join the tokens that are not stop words."""
    return ' '.join(
        token for token in word_tokenize(text) if token.lower() not in stop_words
    )


# Clean every tweet the same way the training data was cleaned. Missing values
# are treated as empty text so word_tokenize never receives a NaN float.
cleaned_text = tweet_df['text'].fillna('').apply(_clean_tweet)

# Vectorize and classify the whole batch in one call instead of once per row.
# Each TF-IDF row and each SVC prediction depends only on its own sample, so
# the labels match the row-by-row loop while avoiding per-row call overhead.
if cleaned_text.empty:
    # predict() rejects a zero-row matrix; an empty file simply yields no labels.
    predicted_labels = []
else:
    predicted_labels = best_model.predict(vectorizer.transform(cleaned_text)).tolist()

# Create a new DataFrame pairing the original text with its predicted label.
predicted_df = pd.DataFrame({'text': tweet_df['text'], 'predicted_label': predicted_labels})

predicted_df.head()

Unnamed: 0,text,predicted_label
0,الاسبوع الثاني من #معسكر_رؤية_الرقمي 🌟\nاخذنا ...,normal
1,اليوم اعطيت المتدربين تاسك Sentiment Analysis ...,normal
2,اليوم الاول 💙💙 #معسكر_رؤية_الرقمي,normal
3,#معسكر_رؤية_الرقمي\nاليوم خرجنا عن الجانب التق...,normal
4,مقتطفات من رحلة روادنا خلال الأسبوع الأول في #...,normal
