<a href="https://colab.research.google.com/github/nitinrajg/ML-Projects/blob/main/Sentiment_Analysis_with_Kaggle_Twitter_Airline_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Install Libraries and Setup Kaggle API**

In [7]:
# Install Kaggle library
!uv pip install kaggle

# Upload kaggle.json (run this, then click 'Choose Files' to upload from your computer)
from google.colab import files
files.upload()  # Upload kaggle.json here

# Setup Kaggle API (move token to correct directory and set permissions)
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

[2mUsing Python 3.11.13 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 156ms[0m[0m


Saving kaggle.json to kaggle.json


# **Download Dataset from Kaggle**

In [8]:
# Download the Twitter US Airline Sentiment dataset
!kaggle datasets download -d crowdflower/twitter-airline-sentiment

# Unzip the downloaded file
!unzip twitter-airline-sentiment.zip

Dataset URL: https://www.kaggle.com/datasets/crowdflower/twitter-airline-sentiment
License(s): CC-BY-NC-SA-4.0
Downloading twitter-airline-sentiment.zip to /content
  0% 0.00/2.55M [00:00<?, ?B/s]
100% 2.55M/2.55M [00:00<00:00, 679MB/s]
Archive:  twitter-airline-sentiment.zip
  inflating: Tweets.csv              
  inflating: database.sqlite         


## **Install required libraries**

In [4]:
!uv pip install emoji

import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[2mUsing Python 3.11.13 environment at: /usr[0m
[2mAudited [1m1 package[0m [2min 108ms[0m[0m


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# **Import necessary libraries**

In [15]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix # Import classification_report and confusion_matrix
import emoji  # For emoji detection

# **Data Preprocessing and Cleaning Function**

In [13]:
# Data Preprocessing and Cleaning Function
def preprocess_text(text):
    """
    Cleans and preprocesses text: removes noise, stopwords, lemmatizes.
    """
    if pd.isna(text):
        return ""
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    text = re.sub(r'@\w+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    text = text.lower()
    tokens = text.split()
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Improved Emotion Boost Feature Extraction
def extract_emotion_boost(text):
    """
    Detects emojis/emoticons and returns a boost score.
    +0.2 for positive, -0.2 for negative, 0 otherwise.
    """
    positive_emojis = [':)', ':-)', ':D', ';)', ':-D', '😊', '😃', '😁', emoji.emojize(':smile:')]
    negative_emojis = [':(', ':-(', ':/', 'D:', '>:(', '😞', '😡', '😠', emoji.emojize(':frown:')]
    boost = 0.0
    for emo in positive_emojis:
        if emo in text:
            boost += 0.2
    for emo in negative_emojis:
        if emo in text:
            boost -= 0.2
    return boost

# **Load Data, Preprocess, and Train Model**

In [16]:
# Load the downloaded Kaggle Dataset
df = pd.read_csv('Tweets.csv')

# Data Cleaning: Drop irrelevant columns, handle missing labels
df = df[['text', 'airline_sentiment']]
df = df.dropna(subset=['airline_sentiment'])
df['airline_sentiment'] = df['airline_sentiment'].map({'positive': 1, 'negative': 0, 'neutral': 2})

# Preprocess text
df['cleaned_text'] = df['text'].apply(preprocess_text)

# Custom Feature: Emotion Boost
df['emotion_boost'] = df['text'].apply(extract_emotion_boost)

# Feature Engineering: TF-IDF on cleaned text
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform(df['cleaned_text'])

# Combine with custom feature
X = np.hstack((X_tfidf.toarray(), df['emotion_boost'].values.reshape(-1, 1)))
y = df['airline_sentiment']

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Custom Model: Logistic Regression with hybrid features
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predict and Evaluate
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Additional Evaluation
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive', 'Neutral']))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Model Accuracy: 80.46%

Classification Report:
              precision    recall  f1-score   support

    Negative       0.82      0.94      0.88      1889
    Positive       0.83      0.62      0.71       459
     Neutral       0.69      0.50      0.58       580

    accuracy                           0.80      2928
   macro avg       0.78      0.69      0.72      2928
weighted avg       0.80      0.80      0.79      2928

Confusion Matrix:
[[1782   28   79]
 [ 120  286   53]
 [ 260   32  288]]


## **Example Prediction**

In [21]:
# List of 10 sample texts (inspired by real tweets/reviews, including emojis)
sample_texts = [
    "Absolutely love this service! Fast and reliable. :-)",
    "Worst experience ever, delayed flight and rude staff. :(",
    "The product is okay, nothing special about it.",
    "Highly recommend! Great quality and affordable. :D",
    "Disappointed with the delivery time. Not worth it. /:",
    "Neutral opinion: It works as expected, no complaints or praises.",
    "Amazing support team! They fixed my issue quickly. ;)",
    "Terrible quality, broke after one use. >:(",
    "It's fine for the price, but could be better.",
    "Best purchase this year! Exceeded expectations. :-D"
]

# Process each sample
results = []
for text in sample_texts:
    cleaned = preprocess_text(text)
    tfidf_vec = tfidf.transform([cleaned])
    boost = extract_emotion_boost(text)
    sample_X = np.hstack((tfidf_vec.toarray(), np.array([[boost]])))
    pred = model.predict(sample_X)[0]
    sentiment = {1: 'Positive', 0: 'Negative', 2: 'Neutral'}[pred]
    results.append({
        'Original Text': text,
        'Cleaned Text': cleaned,
        'Emotion Boost Score': boost,
        'Predicted Sentiment': sentiment
    })

# Display results in a DataFrame
results_df = pd.DataFrame(results)
print("Results:")
from IPython.display import display
display(results_df)

Results:


Unnamed: 0,Original Text,Cleaned Text,Emotion Boost Score,Predicted Sentiment
0,Absolutely love this service! Fast and reliabl...,absolutely love service fast reliable,0.2,Positive
1,"Worst experience ever, delayed flight and rude...",worst experience ever delayed flight rude staff,-0.2,Negative
2,"The product is okay, nothing special about it.",product okay nothing special,0.0,Negative
3,Highly recommend! Great quality and affordable...,highly recommend great quality affordable,0.2,Positive
4,Disappointed with the delivery time. Not worth...,disappointed delivery time worth,0.0,Negative
5,"Neutral opinion: It works as expected, no comp...",neutral opinion work expected complaint praise,0.0,Negative
6,Amazing support team! They fixed my issue quic...,amazing support team fixed issue quickly,0.2,Positive
7,"Terrible quality, broke after one use. >:(",terrible quality broke one use,-0.4,Negative
8,"It's fine for the price, but could be better.",fine price could better,0.0,Negative
9,Best purchase this year! Exceeded expectations...,best purchase year exceeded expectation,0.2,Positive


In [19]:
from google.colab import auth
auth.authenticate_user()

In [20]:
from google.colab import sheets
sheet = sheets.InteractiveSheet(df=results_df)

https://docs.google.com/spreadsheets/d/1xUHxuqcelASZNgeJDvzM7cSf6_xB76LS3dYWr_5aQhg/edit#gid=0
