In [None]:
# Install required libraries
!pip install -q openai xgboost wordcloud seaborn


In [None]:
!pip install -q sentence-transformers


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from wordcloud import WordCloud

import google.generativeai as genai



In [None]:
import os

# Ask for an upload only when the dataset is not already in the working directory.
if not os.path.exists('Tweets.csv'):
    # google.colab exists only inside Colab. Importing it lazily lets this cell
    # run in any Jupyter environment as long as Tweets.csv is already present.
    from google.colab import files
    files.upload()

# Load the tweets and preview the first rows.
df = pd.read_csv("Tweets.csv")
df.head()

In [None]:
!kaggle datasets download -d yasserh/twitter-tweets-sentiment-dataset
!unzip twitter-tweets-sentiment-dataset.zip



In [None]:
# Read Tweets.csv into `df` and preview the first rows.
df = pd.read_csv("Tweets.csv")
df.head()

In [None]:
# Bar chart of how many tweets fall into each sentiment class.
fig, ax = plt.subplots()
sns.countplot(data=df, x='sentiment', ax=ax)
ax.set_title("Sentiment Distribution")
plt.show()


In [None]:
# Character count of every tweet. Missing tweets are treated as empty text, so
# they count as length 0 instead of 3 (the length of the string "nan").
df['text_length'] = df['text'].fillna('').astype(str).map(len)

# Histogram of the tweet lengths.
fig, ax = plt.subplots()
sns.histplot(df['text_length'], bins=50, ax=ax)
ax.set_title("Tweet Length Distribution")
plt.show()


In [None]:
# Build one corpus string from all tweets. Rows without text are skipped so the
# placeholder "nan" does not show up as a word in the cloud.
text = " ".join(df['text'].dropna().astype(str))

# A fixed random_state makes the generated cloud reproducible across runs.
wordcloud = WordCloud(width=800, height=400, random_state=42).generate(text)

fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud)
ax.axis("off")
plt.show()


In [None]:
def clean_text(text):
    """Normalise a raw tweet before it is embedded.

    The text is lower-cased, then URLs (``http...``), ``@mentions``, ``#`` signs
    and all remaining punctuation are removed. Whitespace is left untouched.

    Args:
        text: The raw tweet. Non-string values are converted with ``str``.

    Returns:
        The cleaned text. Missing values (``None`` or a float NaN) give ``""``
        instead of the literal word ``"nan"`` / ``"none"``.
    """
    # NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)   # URLs
    text = re.sub(r"@\w+", "", text)      # user mentions
    text = re.sub(r"#", "", text)         # keep the hashtag word, drop the sign
    text = re.sub(r"[^\w\s]", "", text)   # remaining punctuation
    return text

# Store a cleaned copy of every tweet next to the raw text.
df['clean_text'] = df['text'].map(clean_text)



In [None]:
import os
from getpass import getpass

# API keys must never be committed to a notebook. Read the key from the
# GOOGLE_API_KEY environment variable and fall back to an interactive prompt,
# so the secret is neither stored in the file nor echoed in the output.
# NOTE(review): the key that used to be hard-coded here has been exposed and
# should be revoked.
genai.configure(
    api_key=os.environ.get("GOOGLE_API_KEY") or getpass("Google API key: ")
)


In [None]:
# Gemini embedding model ID, aligned with the ID that get_embeddings_batch sends
# to the API (the previous value, "models/embedding-001", did not match it).
# NOTE(review): no visible cell reads this variable, and `model` is rebound to a
# SentenceTransformer further down; consider renaming or removing it.
model = "models/gemini-embedding-001"

In [None]:
def get_embeddings_batch(text_list, model="models/gemini-embedding-001",
                         task_type="classification"):
    """Embed a batch of texts with one ``genai.embed_content`` request.

    Args:
        text_list: The texts to embed in a single request.
        model: Embedding model ID passed to ``genai.embed_content``.
        task_type: Task hint passed to ``genai.embed_content``.

    Returns:
        The ``"embedding"`` entry of the API response. An empty list or tuple
        returns ``[]`` without sending a request.
    """
    # Nothing to embed: skip the network call.
    if isinstance(text_list, (list, tuple)) and len(text_list) == 0:
        return []

    response = genai.embed_content(
        model=model,
        content=text_list,
        task_type=task_type,
    )
    return response["embedding"]

In [None]:
# Embed every cleaned tweet with the Gemini API, one request per batch.
# get_embeddings_batch is already defined in the cell above, so the identical
# second definition that used to live in this cell has been dropped.
batch_size = 100
embeddings = []

texts = df['clean_text'].tolist()

print(f'Starting embedding generation for {len(texts)} rows...')

for i in range(0, len(texts), batch_size):
    batch = texts[i:i+batch_size]
    try:
        batch_embeddings = get_embeddings_batch(batch)
    except Exception as e:
        # Best effort: report the failure and keep the batches embedded so far.
        print(f'Error at batch {i}: {e}')
        break
    embeddings.extend(batch_embeddings)
    print(f'Processed batch starting at index {i}')

X = np.array(embeddings)

# Fit the encoder on every label, but only encode the rows that actually have
# an embedding, so X and y stay aligned row-for-row even after an early stop.
le = LabelEncoder()
le.fit(df['sentiment'])
y = le.transform(df['sentiment'].iloc[:len(embeddings)])

if len(embeddings) < len(texts):
    print(f'WARNING: only {len(embeddings)} of {len(texts)} rows were embedded; '
          'X and y cover just those rows.')

print("X shape:", X.shape)
print("y shape:", y.shape)


In [None]:
# Integer-encode the sentiment labels; `le` is kept to map predictions back.
le = LabelEncoder().fit(df['sentiment'])
y = le.transform(df['sentiment'])


In [None]:
# Print every model from genai.list_models() that lists 'embedContent' among
# its supported generation methods.
print("Listing models that support 'embedContent':")
for m in genai.list_models():
    methods = m.supported_generation_methods
    if 'embedContent' not in methods:
        continue
    print(
        f"Model Name: {m.name}",
        f"Display Name: {m.display_name}",
        f"Supported methods: {methods}\n",
        sep="\n",
    )

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence encoder used for the features (and again for the custom tweets).
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all cleaned tweets in batches of 64, with a progress bar.
clean_tweets = df['clean_text'].tolist()
X = model.encode(clean_tweets, batch_size=64, show_progress_bar=True)

# Integer-encode the sentiment labels; `le` is kept to map predictions back.
le = LabelEncoder().fit(df['sentiment'])
y = le.transform(df['sentiment'])

for caption, array in (("X shape:", X), ("y shape:", y)):
    print(caption, array.shape)



In [None]:
# Hold out 20% of the rows for evaluation. Stratifying on y keeps the class
# proportions the same in the training and test sets, so the per-class metrics
# are not skewed by an unlucky random split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print("X shape:", X.shape)
print("y shape:", y.shape)
print("X_train shape:", X_train.shape)

In [None]:
# Fit a logistic-regression classifier on the training split, allowing up to
# 2000 solver iterations, then display the fitted estimator.
clf = LogisticRegression(max_iter=2000).fit(X_train, y_train)
clf


In [None]:
y_pred = clf.predict(X_test)

# Show per-class metrics under the original sentiment names instead of the
# encoded integers, matching the tick labels of the confusion matrix.
# `labels` pins the row order to le.classes_ (LabelEncoder codes are 0..n-1).
print(classification_report(
    y_test,
    y_pred,
    labels=np.arange(len(le.classes_)),
    target_names=[str(name) for name in le.classes_],
))


In [None]:
# Confusion matrix of the test-set predictions, drawn as an annotated heatmap
# with the original sentiment names on both axes.
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
    ax=ax,
)
ax.set_xlabel("Predicted")
ax.set_ylabel("Actual")
ax.set_title("Confusion Matrix")
plt.show()


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between the embeddings of the first two rows of X.
# One-row slices keep both inputs two-dimensional, as cosine_similarity expects.
similarity = cosine_similarity(X[0:1], X[1:2])
print("Cosine Similarity:", similarity)


In [None]:
custom_tweets = [
    "I absolutely love this phone!",
    "This is the worst service ever.",
    "It's fine, nothing special.",
    "Amazing performance!",
    "Totally disappointed."
]

# Clean the tweets with clean_text, then embed them with `model`
# (the SentenceTransformer used for the training features).
cleaned = list(map(clean_text, custom_tweets))
custom_embeddings = model.encode(cleaned)

# Predict the encoded classes and map them back to sentiment names in one call.
predictions = clf.predict(custom_embeddings)
predicted_labels = le.inverse_transform(predictions)

for tweet, label in zip(custom_tweets, predicted_labels):
    print(tweet, "->", label)

Logistic Regression performed well with embedding features, achieving strong F1-scores across all classes. The neutral class showed slightly lower recall due to semantic overlap with positive tweets. Embedding-based representation improved generalization compared to traditional TF-IDF methods.