Imports

In [None]:
import os
import numpy as np
import pandas as pd
import requests
from zipfile import ZipFile
from io import BytesIO
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

Auxiliary Functions

In [None]:
def download_glove_embeddings(url, output_path):
    """Download a zip archive from ``url`` and extract it into ``output_path``.

    Args:
        url: Address of the zip archive to fetch.
        output_path: Directory the archive members are extracted into.

    Raises:
        requests.HTTPError: If the server responds with an error status.
        requests.Timeout: If connecting or reading stalls past the timeout.
        zipfile.BadZipFile: If the downloaded payload is not a valid zip.
    """
    # (connect, read) timeouts: without them an unreachable host blocks the
    # kernel indefinitely. The read timeout bounds the gap between received
    # chunks, not the total download time.
    response = requests.get(url, timeout=(10, 300))
    # Surface HTTP errors directly instead of letting an error page reach
    # ZipFile and fail later with a misleading BadZipFile.
    response.raise_for_status()
    with ZipFile(BytesIO(response.content)) as zip_file:
        zip_file.extractall(output_path)

def load_glove_embeddings(file):
    """Read a whitespace-separated embeddings text file into a dict.

    Each non-blank line is expected to hold a token followed by its vector
    components.

    Args:
        file: Path of the UTF-8 text file to read.

    Returns:
        dict mapping each token (str) to a float32 ``numpy`` array built from
        the remaining fields of its line.
    """
    embeddings = {}
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank (or whitespace-only) line has no token; indexing
            # values[0] on it would raise IndexError, so skip it.
            if not values:
                continue
            word = values[0]
            embeddings[word] = np.array(values[1:], dtype='float32')
    return embeddings

def text_to_embeddings(text, embeddings, embedding_dim):
    """Average the embedding vectors of the words in ``text``.

    Args:
        text: String that is split on whitespace into words.
        embeddings: Mapping from word to its vector; words missing from it
            are ignored.
        embedding_dim: Length of the returned vector.

    Returns:
        The mean of the vectors of all words found in ``embeddings``, or an
        all-zero vector of length ``embedding_dim`` when none is found.
    """
    known_vectors = [embeddings[token] for token in text.split() if token in embeddings]

    # Accumulate into a zero-initialised buffer, one vector at a time.
    total = np.zeros(embedding_dim)
    for vec in known_vectors:
        total += vec

    if not known_vectors:
        return total
    return total / len(known_vectors)

Download GloVe

In [None]:
glove_url = 'http://nlp.stanford.edu/data/glove.6B.zip'
output_path = 'glove'
# Key the download on the extracted vectors file rather than on the directory:
# a failed or interrupted download leaves the directory behind, and a
# directory-only check would then skip the download on every re-run.
glove_vectors_path = os.path.join(output_path, 'glove.6B.50d.txt')
if not os.path.exists(glove_vectors_path):
    os.makedirs(output_path, exist_ok=True)
    download_glove_embeddings(glove_url, output_path)

Load and Preprocess Data

In [None]:
data = pd.read_csv('Tweets.csv')
# Lowercase the tweets, then drop every character that is neither a word
# character nor whitespace. regex=True must be explicit: since pandas 2.0 the
# default is regex=False, which treats the pattern as a literal string and
# leaves the punctuation in place. The raw string avoids invalid-escape
# warnings for \w and \s.
data['text'] = data['text'].str.lower().str.replace(r'[^\w\s]', '', regex=True)

  data['text'] = data['text'].str.lower().str.replace('[^\w\s]', '')


Load GloVe Embeddings

In [None]:
embedding_dim = 50
# Build the file name from embedding_dim so the vectors that get loaded and
# the dimension used downstream cannot drift apart.
glove_file = os.path.join(output_path, 'glove.6B.{}d.txt'.format(embedding_dim))
glove_embeddings = load_glove_embeddings(glove_file)

Convert Data to GloVe Embeddings

In [None]:
def _embed_text(text):
    """Return the averaged GloVe vector for a single piece of text."""
    return text_to_embeddings(text, glove_embeddings, embedding_dim)


# One averaged vector per row, stacked into a 2-D feature matrix.
data['embeddings'] = data['text'].apply(_embed_text)
X = np.stack(data['embeddings'].values)
# Target labels come straight from the sentiment column.
y = data['airline_sentiment']


Split Data into Train and Test Sets

In [None]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

Train - GloVe

In [None]:
# The solver stopped at its iteration limit with the default max_iter
# (100 in scikit-learn), so the fitted model had not converged. Raise the
# budget to let the optimisation finish.
clf = LogisticRegression(random_state=42, max_iter=1000)
clf.fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Test

In [None]:
# Predict a sentiment label for every held-out sample.
y_pred = clf.predict(X_test)


Show Results

In [None]:
# Compute every metric first, then report them together.
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print("Confusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)
print("Accuracy Score:", accuracy)

Confusion Matrix:
 [[2643  125   46]
 [ 545  276   63]
 [ 248   92  354]]

Classification Report:
               precision    recall  f1-score   support

    negative       0.77      0.94      0.85      2814
     neutral       0.56      0.31      0.40       884
    positive       0.76      0.51      0.61       694

    accuracy                           0.75      4392
   macro avg       0.70      0.59      0.62      4392
weighted avg       0.73      0.75      0.72      4392

Accuracy Score: 0.7452185792349727
