<a href="https://colab.research.google.com/github/mrunalkavade/Sentimental-Analysis-/blob/main/Sentimental_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --------------------------------------------
# Step 0: Setup Kaggle API (ONLY for Colab or API use)
# --------------------------------------------
!pip install -q kaggle

# Move your kaggle.json file to the correct location
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset from Kaggle
!kaggle datasets download -d kazanova/sentiment140

# --------------------------------------------
# Step 1: Unzip the dataset
# --------------------------------------------
from zipfile import ZipFile

dataset = '/content/sentiment140.zip'
with ZipFile(dataset, 'r') as zip_ref:
    zip_ref.extractall()
    print("✅ Dataset extracted")

# --------------------------------------------
# Step 2: Load the dataset
# --------------------------------------------
import pandas as pd

df = pd.read_csv("training.1600000.processed.noemoticon.csv", encoding='latin-1', header=None)
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']
twitter_dataset = df[['text', 'target']]
twitter_dataset.replace({'target': {4: 1}}, inplace=True)  # Convert 4 to 1

# --------------------------------------------
# Step 3: Preprocess the tweets
# --------------------------------------------
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = re.sub(r'http\S+|@\S+|#\S+|[^a-zA-Z]', ' ', text)
    text = text.lower()
    words = text.split()
    words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(words)

twitter_dataset['stemmed_content'] = twitter_dataset['text'].apply(clean_text)

# --------------------------------------------
# Step 4: Prepare Features and Labels
# --------------------------------------------
X = twitter_dataset['stemmed_content'].values
Y = twitter_dataset['target'].values

# --------------------------------------------
# Step 5: Train/Test Split
# --------------------------------------------
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# --------------------------------------------
# Step 6: Text Vectorization
# --------------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# --------------------------------------------
# Step 7: Train the model
# --------------------------------------------
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_vec, y_train)

# --------------------------------------------
# Step 8: Evaluate model
# --------------------------------------------
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test_vec)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# --------------------------------------------
# Step 9: Predict new text
# --------------------------------------------
def predict_sentiment(tweet):
    cleaned = clean_text(tweet)
    vec = vectorizer.transform([cleaned])
    pred = model.predict(vec)[0]
    return "Positive 😊" if pred == 1 else "Negative 😠"

print(predict_sentiment("I absolutely love this!"))
print(predict_sentiment("This is terrible service."))

# --------------------------------------------
# Step 10: Save the model (Optional)
# --------------------------------------------
import joblib

joblib.dump(model, "sentiment_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

# --------------------------------------------
# Step 11: Take user input for prediction
# --------------------------------------------
while True:
    user_input = input("Enter a tweet (or type 'exit' to quit): ")
    if user_input.lower() == 'exit':
        print("Goodbye! 👋")
        break
    result = predict_sentiment(user_input)
    print("Sentiment:", result)



Dataset URL: https://www.kaggle.com/datasets/kazanova/sentiment140
License(s): other
Downloading sentiment140.zip to /content
  0% 0.00/80.9M [00:00<?, ?B/s]
100% 80.9M/80.9M [00:00<00:00, 1.15GB/s]
✅ Dataset extracted


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitter_dataset.replace({'target': {4: 1}}, inplace=True)  # Convert 4 to 1
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  twitter_dataset['stemmed_content'] = twitter_dataset['text'].apply(clean_text)


✅ Accuracy: 0.76791875
              precision    recall  f1-score   support

           0       0.78      0.74      0.76    159494
           1       0.76      0.79      0.77    160506

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000

Positive 😊
Negative 😠
Enter a tweet (or type 'exit' to quit): amazing
Sentiment: Positive 😊
Enter a tweet (or type 'exit' to quit): shut up
Sentiment: Negative 😠
Enter a tweet (or type 'exit' to quit): useless
Sentiment: Negative 😠


KeyboardInterrupt: Interrupted by user