## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

import nltk
import warnings
warnings.filterwarnings('ignore')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Load Dataset

In [2]:
col_names = ['target', 'id', 'date', 'flag', 'user', 'text']
df = pd.read_csv('training.1600000.processed.noemoticon.csv',
                 names=col_names,
                 encoding='ISO-8859-1')
df = df.reset_index(drop=True)
df.replace({'target': {4: 1}}, inplace=True)  # تحويل 4 إلى 1
df.head()


Unnamed: 0,target,id,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


## Check for Nulls

In [3]:
df.isnull().sum()

target    0
id        0
date      0
flag      0
user      0
text      0
dtype: int64

## Define Stemming Function

In [4]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+', '', text)  # إزالة الروابط
    text = re.sub(r'@\w+', '', text)     # إزالة mentions
    text = re.sub(r'#\w+', '', text)     # إزالة hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # إزالة الأرقام وعلامات الترقيم
    words = text.split()
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

tqdm.pandas(desc="Processing tweets")
df['clean_content'] = df['text'].progress_apply(clean_text)


Processing tweets: 100%|██████████| 1600000/1600000 [00:43<00:00, 36797.30it/s]


## Split Data

In [5]:
x = df['clean_content'].values
y = df['target'].values

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, stratify=y, random_state=2
)

print("Train size:", len(x_train), "Test size:", len(x_test))


Train size: 1280000 Test size: 320000


## Create TF-IDF Vectorizer

In [6]:
vectorizer = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1,2),  # unigrams + bigrams
    stop_words='english'
)

x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)


## Train LinearSVC

In [7]:
model = LinearSVC(C=1)
model.fit(x_train_tfidf, y_train)


In [8]:
y_train_pred = model.predict(x_train_tfidf)
y_test_pred = model.predict(x_test_tfidf)

train_acc = accuracy_score(y_train, y_train_pred)
test_acc = accuracy_score(y_test, y_test_pred)

print("Train Accuracy:", train_acc)
print("Test Accuracy:", test_acc)
print("\nClassification Report:\n", classification_report(y_test, y_test_pred))


Train Accuracy: 0.7985453125
Test Accuracy: 0.778840625

Classification Report:
               precision    recall  f1-score   support

           0       0.79      0.75      0.77    160000
           1       0.77      0.80      0.78    160000

    accuracy                           0.78    320000
   macro avg       0.78      0.78      0.78    320000
weighted avg       0.78      0.78      0.78    320000



## Test on a Single Example

In [12]:
new_tweet = "I am so happy with this product! #loveit"
new_tweet_clean = clean_text(new_tweet)
new_vector = vectorizer.transform([new_tweet_clean])
prediction = model.predict(new_vector)

print("Prediction:", prediction[0])

Prediction: 1
