In [1]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import gensim.downloader as api

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\meetm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\meetm\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\meetm\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
import re
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

contractions = {
    "don't": "do not", "can't": "cannot", "i'm": "i am",
    "it's": "it is", "you're": "you are", "they're": "they are"
}

def preprocess(text):
    text = text.lower()
    text = re.sub(r"http\S+|@\w+|#\w+|[^\w\s]", " ", text)  
    text = " ".join([contractions.get(word, word) for word in text.split()])
    tokens = re.findall(r'\b[a-z]+\b', text) 
    return [lemmatizer.lemmatize(w) for w in tokens]

In [4]:
df = pd.read_csv('Tweets.csv', encoding='ISO-8859-1') 
df.columns = df.columns.str.strip()

df = df[['text', 'airline_sentiment']].dropna()

df['tokens'] = df['text'].apply(preprocess)


In [6]:
w2v = api.load("word2vec-google-news-300")

In [7]:
def tweet_to_vec(tokens, w2v_model):
    vecs = [w2v_model[word] for word in tokens if word in w2v_model]
    return np.mean(vecs, axis=0) if vecs else np.zeros(300)

df['vector'] = df['tokens'].apply(lambda x: tweet_to_vec(x, w2v))
X = np.vstack(df['vector'].values)
y = df['airline_sentiment']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
print("Accuracy:", accuracy_score(y_test, clf.predict(X_test)))

Accuracy: 0.7855191256830601


In [9]:
def predict_tweet_sentiment(model, w2v_model, tweet):
    tokens = preprocess(tweet)
    vec = tweet_to_vec(tokens, w2v_model)
    return model.predict([vec])[0]

In [12]:
test_tweets = [
    "@VirginAmerica I can't believe it's already 2025! ‚úàÔ∏è #travel http://virgin.com",
    "I'm LOVING the new in-flight WiFi! :) #win",
    "Ugh... delays again. This is why I don't fly with them anymore.",
    "Just boarded @Delta ‚Äî hope it's better than last time. ü§û",
    "Great service, comfy seats, and friendly staff! Thanks :)",
]

for tweet in test_tweets:
    print("Original:", tweet)
    sentiment=predict_tweet_sentiment(clf, w2v,tweet)
    print(sentiment)
    print("--------")


Original: @VirginAmerica I can't believe it's already 2025! ‚úàÔ∏è #travel http://virgin.com
negative
--------
Original: I'm LOVING the new in-flight WiFi! :) #win
positive
--------
Original: Ugh... delays again. This is why I don't fly with them anymore.
negative
--------
Original: Just boarded @Delta ‚Äî hope it's better than last time. ü§û
negative
--------
Original: Great service, comfy seats, and friendly staff! Thanks :)
positive
--------
