In [9]:
# === Step 1: Upload Files ===
from google.colab import files
import zipfile
import pandas as pd
import re
import joblib
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Upload both files: twitter_training.csv.zip and twitter_validation.csv
print("👉 Please select both files: twitter_training.csv.zip and twitter_validation.csv")
uploaded = files.upload()
# === Step 2: Unzip training dataset ===
with zipfile.ZipFile('twitter_training.csv.zip', 'r') as zip_ref:
    zip_ref.extractall('./')

# === Step 3: Load Datasets ===
train_df = pd.read_csv('twitter_training.csv', header=None)
val_df = pd.read_csv('twitter_validation.csv', header=None)

# Assign column names
train_df.columns = ['id', 'entity', 'sentiment', 'text']
val_df.columns = ['id', 'entity', 'sentiment', 'text']

# === Step 4: Preprocess the Text ===
def clean_text(text):
    text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)
    text = re.sub(r'\@w+|\#', '', text)
    text = re.sub(r'[^A-Za-z\s]', '', text)
    text = text.lower()
    return text

train_df['clean_text'] = train_df['text'].astype(str).apply(clean_text)
val_df['clean_text'] = val_df['text'].astype(str).apply(clean_text)

# === Step 5: Encode Labels ===
le = LabelEncoder()
train_df['label'] = le.fit_transform(train_df['sentiment'])
val_df['label'] = le.transform(val_df['sentiment'])

# === Step 6: TF-IDF + Logistic Regression ===
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['clean_text'])
X_val = vectorizer.transform(val_df['clean_text'])

y_train = train_df['label']
y_val = val_df['label']

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# === Step 7: Evaluation ===
y_pred = model.predict(X_val)
print("Classification Report:\n")
print(classification_report(y_val, y_pred, target_names=le.classes_))

# === Step 8: Save Model and Vectorizer ===
joblib.dump(model, 'twitter_sentiment_model.pkl')
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

print("\n✅ Model and vectorizer saved as .pkl files.")



👉 Please select both files: twitter_training.csv.zip and twitter_validation.csv


Saving twitter_validation.csv to twitter_validation.csv
Saving twitter_training.csv.zip to twitter_training.csv (3).zip
Classification Report:

              precision    recall  f1-score   support

  Irrelevant       0.80      0.73      0.76       172
    Negative       0.77      0.89      0.83       266
     Neutral       0.84      0.72      0.78       285
    Positive       0.80      0.85      0.83       277

    accuracy                           0.80      1000
   macro avg       0.80      0.80      0.80      1000
weighted avg       0.81      0.80      0.80      1000


✅ Model and vectorizer saved as .pkl files.
