In [4]:
import pandas as pd
import numpy as np
import re
import string
import nltk
import gensim.downloader as api
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from contractions import fix

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

df = pd.read_csv("Tweets.csv")
df = df[['airline_sentiment', 'text']]
df = df.dropna()

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    text = text.lower()
    text = re.sub(r"http\S+|www.\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#", "", text)
    text = re.sub(r"[^\w\s]", "", text)
    text = fix(text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalpha() and word not in stop_words]
    return tokens

print("Loading Word2Vec model (Google News 300D)...")
w2v_model = api.load("word2vec-google-news-300")

def vectorize_tweet(tweet, model):
    words = preprocess(tweet)
    vectors = [model[word] for word in words if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

X = np.array([vectorize_tweet(tweet, w2v_model) for tweet in df['text']])
y = df['airline_sentiment'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

clf = LogisticRegression(max_iter=1000, multi_class='multinomial', solver='lbfgs')
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

def predict_tweet_sentiment(model, w2v_model, tweet):
    vec = vectorize_tweet(tweet, w2v_model).reshape(1, -1)
    pred = model.predict(vec)[0]
    return pred

another_tweet = "I'm so frustrated with United Airlines. Worst customer service ever."
print("Predicted Sentiment:", predict_tweet_sentiment(clf, w2v_model, another_tweet))


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Loading Word2Vec model (Google News 300D)...




Test Accuracy: 0.7687841530054644
Predicted Sentiment: negative
