<a href="https://colab.research.google.com/github/mrsbelema/belema/blob/main/7120_belema_kio.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install necessary packages

In [1]:
!pip install nltk contractions emoji textblob
!pip install scikit-learn
!pip install nltk

Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Collecting emoji
  Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.9/289.9 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (110 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.7/110.7 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages:

Importing required libraries

In [2]:
import string
import nltk
import re
import emoji
import contractions
from textblob import TextBlob
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import csv

Download necessary NLTK resources

In [3]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

Load the data from file

In [4]:
file_path = '/content/drive/MyDrive/Colab Notebooks/4A-English/SemEval2017-task4-dev.subtask-A.english.INPUT.txt'
data = pd.read_csv(file_path, sep='\t')

Extract texts and labels

In [5]:
texts = data[data.columns[2]].to_numpy()
labels = data[data.columns[1]].to_numpy()

Define function for text cleaning

In [6]:
# Step 1: Text Cleaning
def clean_text(text):
    # Remove Twitter handles (usernames)
    text = re.sub(r'@[A-Za-z0-9]+', '', text)

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove special characters and numbers
    text = re.sub(r'[^A-Za-z\s]', '', text)

    # Convert to lowercase
    text = text.lower()

    return text

# Clean and preprocess texts
cleaned_texts = [clean_text(text) for text in texts]

Tokenization, stopword removal, lemmatization

In [7]:
# Step 2: Tokenization (splitting into words)
tokenized_texts = [nltk.word_tokenize(text) for text in cleaned_texts]

# Step 3: Stopword Removal
stop_words = set(stopwords.words('english'))  # Set of English stopwords
filtered_texts = [[word for word in tokens if word not in stop_words] for tokens in tokenized_texts]

# Step 4: Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_texts = [[lemmatizer.lemmatize(word) for word in tokens] for tokens in filtered_texts]

# Step 5: Joining tokens back into sentences
preprocessed_texts = [' '.join(tokens) for tokens in lemmatized_texts]

Feature Extraction (TF-IDF)

In [8]:
# Define TF-IDF vectorizer with parameters
tfidf_vectorizer = TfidfVectorizer(max_features=10000, stop_words='english', ngram_range=(1, 3))

# Fit TF-IDF vectorizer on the texts and transform them into TF-IDF vectors
tfidf_vector = tfidf_vectorizer.fit_transform(preprocessed_texts)

Scale the data

In [9]:
scaler = StandardScaler(with_mean=False)
tfidf_vector_scaled = scaler.fit_transform(tfidf_vector)

Data splitting

In [10]:
# Split the data into train and test sets
x_train, x_test, labels_train, labels_test = train_test_split(tfidf_vector.toarray(), labels, test_size=0.2, random_state=42)

Initialize models

In [11]:
models = {
    'Logistic Regression': LogisticRegression(random_state=0, max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(random_state=0),
    'Decision Tree': DecisionTreeClassifier(random_state=0),
    'KNN': KNeighborsClassifier()
}

trained_models = {}

# Train each model
for model_name, model in models.items():
    model.fit(x_train, labels_train)
    trained_models[model_name] = model

Function to train and evaluate a model

In [12]:
# Function to evaluate a model
def evaluate_model(model, x_test, labels_test):
    y_predicted = model.predict(x_test)

    accuracy = accuracy_score(labels_test, y_predicted)
    precision = precision_score(labels_test, y_predicted, average='weighted')
    recall = recall_score(labels_test, y_predicted, average='weighted')
    f1 = f1_score(labels_test, y_predicted, average='weighted')

    classes = ['negative', 'neutral', 'positive']
    class_metrics = {
        'class': classes,
        'precision': precision_score(labels_test, y_predicted, average=None, labels=classes),
        'recall': recall_score(labels_test, y_predicted, average=None, labels=classes),
        'f1': f1_score(labels_test, y_predicted, average=None, labels=classes)
    }

    print(f'\nModel: {type(model).__name__}')
    print(f'Accuracy: {accuracy:.4f}')
    for cls, pre, rec, f1 in zip(class_metrics['class'], class_metrics['precision'], class_metrics['recall'], class_metrics['f1']):
        print(f'{cls.capitalize()} class - Precision: {pre:.4f}, Recall: {rec:.4f}, F1-Score: {f1:.4f}')
    print(f'Macro Average Precision: {precision:.4f}')
    print(f'Macro Average Recall: {recall:.4f}')
    print(f'Macro Average F1-Score: {f1:.4f}')

    return {
        'model': type(model).__name__,
        'accuracy': accuracy,
        'class_metrics': class_metrics
    }

Initialize models

Train and evaluate each model

In [13]:
results = []
for model_name, model in trained_models.items():
    result = evaluate_model(model, x_test, labels_test)
    results.append(result)


Model: LogisticRegression
Accuracy: 0.6620
Negative class - Precision: 0.6644, Recall: 0.3133, F1-Score: 0.4258
Neutral class - Precision: 0.6447, Recall: 0.8159, F1-Score: 0.7203
Positive class - Precision: 0.7004, Recall: 0.5887, F1-Score: 0.6397
Macro Average Precision: 0.6666
Macro Average Recall: 0.6620
Macro Average F1-Score: 0.6397

Model: MultinomialNB
Accuracy: 0.6249
Negative class - Precision: 0.6975, Recall: 0.1788, F1-Score: 0.2846
Neutral class - Precision: 0.6067, Recall: 0.8312, F1-Score: 0.7014
Positive class - Precision: 0.6621, Recall: 0.5172, F1-Score: 0.5807
Macro Average Precision: 0.6394
Macro Average Recall: 0.6249
Macro Average F1-Score: 0.5807

Model: RandomForestClassifier
Accuracy: 0.6363
Negative class - Precision: 0.6376, Recall: 0.2199, F1-Score: 0.3271
Neutral class - Precision: 0.6195, Recall: 0.8379, F1-Score: 0.7123
Positive class - Precision: 0.6803, Recall: 0.5222, F1-Score: 0.5909
Macro Average Precision: 0.6429
Macro Average Recall: 0.6363
Macro 

Save Results to CSV

In [None]:
with open('table_metrics.csv', 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Model', 'Class', 'Precision', 'Recall', 'F1-Score'])
    for result in results:
        for cls, pre, rec, f1 in zip(result['class_metrics']['class'], result['class_metrics']['precision'], result['class_metrics']['recall'], result['class_metrics']['f1']):
            writer.writerow([result['model'], cls, pre, rec, f1])
        writer.writerow([result['model'], 'Macro Average', result['accuracy'], '', ''])