# Tweets Classification

## 1. Data Preprocessing

### 1.1 Overview

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns

In [2]:
# News data
df = pd.read_csv('/content/newsDataset.csv')


In [3]:
df.head()

Unnamed: 0,text,label
0,Introducing Silvergate Capital. A California b...,Business
1,"RT @shaandelhite: In the Cambridge lecture, a ...",Business
2,RT @GcbGulf: Financial freedom is now a realit...,Business
3,"RT @GueshMela: "".....But, we are just survivin...",Business
4,RT @GcbGulf: Financial freedom is now a realit...,Business


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8286 entries, 0 to 8285
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    8286 non-null   object
 1   label   8286 non-null   object
dtypes: object(2)
memory usage: 129.6+ KB


In [5]:
df.shape


(8286, 2)

In [6]:
df.isnull().sum()

text     0
label    0
dtype: int64

In [7]:
# Label Encoding
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

labels = df['label']

# fit the encoder to the labels and transform the labels to numerical values
encoded_labels = encoder.fit_transform(labels)
df['label'] = encoded_labels

In [8]:
df['label'].unique()

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

### 2.2 Reformat Tweets 

In [9]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# download required NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('omw-1.4')

# define a function to filter stop words and perform lemmatization
def preprocess_text(text):
    # convert text to lowercase
    text = text.lower()
    
    # tokenize the text into individual words
    words = nltk.word_tokenize(text)
    
    # filter out stop words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    
    # perform lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # join the words back into a single string
    preprocessed_text = ' '.join(words)
    
    return preprocessed_text


import re

def format_tweets(tweet):
    # mask web urls
    # format twitter account
    tweet = re.sub(r"(\s*)(@[\S]+)\b", r'', tweet)
    tweet = re.sub(r"(\s*)(#[\S]+)\b", r'', tweet)
    tweet = re.sub(r"(\s*)(http:[\S]+)\b", r'', tweet)
    tweet = re.sub(r"(\s*)(https:[\S]+)\b", r'', tweet)
    tweet = re.sub("[^A-Za-z -]", " ", tweet)
    tweet = re.sub("&(amp;)+", " ", tweet)
    return tweet.lower()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


In [10]:
df['format_text'] = df['text'].apply(preprocess_text) # filter stop words and lemmetization
df['format_text'] = df['format_text'].apply(format_tweets) # format tweets

df.head()

Unnamed: 0,text,label,format_text
0,Introducing Silvergate Capital. A California b...,0,introducing silvergate capital california ba...
1,"RT @shaandelhite: In the Cambridge lecture, a ...",0,rt shaandelhite cambridge lecture questi...
2,RT @GcbGulf: Financial freedom is now a realit...,0,rt gcbgulf financial freedom reality gulf ...
3,"RT @GueshMela: "".....But, we are just survivin...",0,rt gueshmela surviving honest ...
4,RT @GcbGulf: Financial freedom is now a realit...,0,rt gcbgulf financial freedom reality gulf ...


## 3. ML models


### 3.1 Multinomial Naive Bayes
---



In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report, matthews_corrcoef


# Naive Bayes

X_train, X_test, y_train, y_test = train_test_split(df['format_text'], df['label'], test_size=0.2)

# create a bag-of-words representation of the text data
vectorizer = CountVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

# train a Naive Bayes classifier on the training data
classifier = MultinomialNB()
classifier.fit(X_train_vectors, y_train)
# Predict labels for test set
y_pred = classifier.predict(X_test_vectors)

# evaluate the classifier on the testing data
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.98      0.97       225
           1       0.81      0.82      0.82       200
           2       0.88      0.89      0.88       188
           3       0.91      0.92      0.92       201
           4       0.90      0.94      0.92       189
           5       0.94      0.91      0.93       190
           6       0.82      0.81      0.82       208
           7       0.91      0.95      0.93       200
           8       1.00      0.78      0.88        27
           9       1.00      0.43      0.60        30

    accuracy                           0.89      1658
   macro avg       0.91      0.84      0.87      1658
weighted avg       0.89      0.89      0.89      1658



In [19]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import precision_recall_fscore_support as score

# Accuracy score
print('accuracy is',accuracy_score(y_test,y_pred))

# Calculate the MCC for the predictions
mcc = matthews_corrcoef(y_test, y_pred)
print("MCC:", mcc)

# Macro-Averaging
macro_accuracy = accuracy_score(y_test, y_pred)
macro_precision, macro_recall, macro_f1, _ = score(y_test, y_pred, average='macro')

print("Macro-Averaging Metrics:")
print(f"Accuracy: {macro_accuracy:.2f}")
print(f"Precision: {macro_precision:.2f}")
print(f"Recall: {macro_recall:.2f}")
print(f"F1 Score: {macro_f1:.2f}")

# Micro-Averaging
micro_accuracy = accuracy_score(y_test, y_pred)
micro_precision, micro_recall, micro_f1, _ = score(y_test, y_pred, average='micro')

print("\nMicro-Averaging Metrics:")
print(f"Accuracy: {micro_accuracy:.2f}")
print(f"Precision: {micro_precision:.2f}")
print(f"Recall: {micro_recall:.2f}")
print(f"F1 Score: {micro_f1:.2f}")


accuracy is 0.8932448733413751
MCC: 0.8789406935525124
Macro-Averaging Metrics:
Accuracy: 0.89
Precision: 0.91
Recall: 0.84
F1 Score: 0.87

Micro-Averaging Metrics:
Accuracy: 0.89
Precision: 0.89
Recall: 0.89
F1 Score: 0.89



### 3.2 SVM
---



In [22]:
# SVM
from sklearn.svm import SVC
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(df['format_text'], df['label'], test_size=0.2)

# Create TF-IDF vectors from text data
vectorizer = TfidfVectorizer(max_features=1000)
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

# Train SVM classifier on train set
classifier = SVC(kernel='linear')
classifier.fit(X_train_vectors, y_train)

# Predict labels for test set
y_pred = classifier.predict(X_test_vectors)


print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97       213
           1       0.83      0.88      0.85       212
           2       0.82      0.93      0.87       207
           3       0.95      0.88      0.91       201
           4       0.95      0.89      0.92       195
           5       0.96      0.94      0.95       205
           6       0.78      0.85      0.81       182
           7       0.98      0.88      0.93       185
           8       0.87      0.87      0.87        23
           9       1.00      0.91      0.96        35

    accuracy                           0.90      1658
   macro avg       0.91      0.90      0.90      1658
weighted avg       0.91      0.90      0.90      1658



In [23]:
# Accuracy score
print('accuracy is',accuracy_score(y_test,y_pred))

# Calculate the MCC for the predictions
mcc = matthews_corrcoef(y_test, y_pred)
print("MCC:", mcc)

# Macro-Averaging
macro_accuracy = accuracy_score(y_test, y_pred)
macro_precision, macro_recall, macro_f1, _ = score(y_test, y_pred, average='macro')

print("Macro-Averaging Metrics:")
print(f"Accuracy: {macro_accuracy:.2f}")
print(f"Precision: {macro_precision:.2f}")
print(f"Recall: {macro_recall:.2f}")
print(f"F1 Score: {macro_f1:.2f}")

# Micro-Averaging
micro_accuracy = accuracy_score(y_test, y_pred)
micro_precision, micro_recall, micro_f1, _ = score(y_test, y_pred, average='micro')

print("\nMicro-Averaging Metrics:")
print(f"Accuracy: {micro_accuracy:.2f}")
print(f"Precision: {micro_precision:.2f}")
print(f"Recall: {micro_recall:.2f}")
print(f"F1 Score: {micro_f1:.2f}")


accuracy is 0.9028950542822678
MCC: 0.8902982649042569
Macro-Averaging Metrics:
Accuracy: 0.90
Precision: 0.91
Recall: 0.90
F1 Score: 0.90

Micro-Averaging Metrics:
Accuracy: 0.90
Precision: 0.90
Recall: 0.90
F1 Score: 0.90


### 3.2 CNN

---



In [24]:
# CNN

from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['format_text'], df['label'], test_size=0.2, random_state=42)

# Tokenize text data and pad sequences
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(X_train)
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=100, padding='post', truncating='post')
X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=100, padding='post', truncating='post')

# Define CNN model
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=100, input_length=100))
model.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train CNN model
model.fit(X_train_padded, y_train, epochs=10, batch_size=32)

# Evaluate CNN model on test set
y_pred = model.predict(X_test_padded)
y_pred = np.round(y_pred).astype(int).flatten()
print(classification_report(y_test, y_pred))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       202
           1       0.13      1.00      0.23       213
           2       0.00      0.00      0.00       200
           3       0.00      0.00      0.00       190
           4       0.00      0.00      0.00       216
           5       0.00      0.00      0.00       200
           6       0.00      0.00      0.00       182
           7       0.00      0.00      0.00       182
           8       0.00      0.00      0.00        33
           9       0.00      0.00      0.00        40

    accuracy                           0.13      1658
   macro avg       0.01      0.10      0.02      1658
weighted avg       0.02      0.13      0.03      1658



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
# Accuracy score
print('accuracy is',accuracy_score(y_test,y_pred))

# Calculate the MCC for the predictions
mcc = matthews_corrcoef(y_test, y_pred)
print("MCC:", mcc)

# Macro-Averaging
macro_accuracy = accuracy_score(y_test, y_pred)
macro_precision, macro_recall, macro_f1, _ = score(y_test, y_pred, average='macro')

print("Macro-Averaging Metrics:")
print(f"Accuracy: {macro_accuracy:.2f}")
print(f"Precision: {macro_precision:.2f}")
print(f"Recall: {macro_recall:.2f}")
print(f"F1 Score: {macro_f1:.2f}")

# Micro-Averaging
micro_accuracy = accuracy_score(y_test, y_pred)
micro_precision, micro_recall, micro_f1, _ = score(y_test, y_pred, average='micro')

print("\nMicro-Averaging Metrics:")
print(f"Accuracy: {micro_accuracy:.2f}")
print(f"Precision: {micro_precision:.2f}")
print(f"Recall: {micro_recall:.2f}")
print(f"F1 Score: {micro_f1:.2f}")


accuracy is 0.1284680337756333
MCC: 0.0
Macro-Averaging Metrics:
Accuracy: 0.13
Precision: 0.01
Recall: 0.10
F1 Score: 0.02

Micro-Averaging Metrics:
Accuracy: 0.13
Precision: 0.13
Recall: 0.13
F1 Score: 0.13


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# # NN
# import keras
# import numpy
# from tensorflow.keras.optimizers import SGD


# # fix random seed for reproducibility
# seed = 7
# numpy.random.seed(seed)

# # Initialising the ANN
# classifier = Sequential()

# # Adding the input layer and the first hidden
# classifier.add(Dense(10, activation = 'tanh', input_dim = 11))

# # Adding the output layer
# classifier.add(Dense(1, activation = 'sigmoid'))

# # Compiling the ANN
# optimizer = SGD(learning_rate = 0.01, momentum = 0.9)
# classifier.compile(optimizer = optimizer, loss = 'binary_crossentropy', metrics = ['accuracy'])