<a href="https://colab.research.google.com/github/padmavatisumare/Sentiment-Analysis-Project/blob/main/SentimentAnalysisProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
# 1. Load Data
import pandas as pd

file_path = r"/content/tweets.csv.csv"
df = pd.read_csv(file_path, encoding='latin-1', header=None)

# Add column names
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']

print("Data loaded successfully ✅")
print(df.head())


Data loaded successfully ✅
   target          id                          date      flag  \
0       0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1       0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2       0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3       0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4       0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   

              user                                               text  
0  _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww, t...  
1    scotthamilton  is upset that he can't update his Facebook by ...  
2         mattycus  @Kenichan I dived many times for the ball. Man...  
3          ElleCTF    my whole body feels itchy and like its on fire   
4           Karoli  @nationwideclass no, it's not behaving at all....  


In [23]:
# 2. Preprocess Data
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Keep only positive (4) and negative (0) tweets
df = df[df['target'].isin([0, 4])]
df['target'] = df['target'].replace({0: 0, 4: 1})  # 0: Negative, 1: Positive

# Clean the text
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r"http\S+|www\S+|@\w+|#\w+", '', text)  # Remove links, mentions, hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove punctuations/numbers
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]  # Remove stopwords
    return ' '.join(words)

df['clean_text'] = df['text'].apply(clean_text)

print("Data preprocessing done ✅")
print(df[['text', 'clean_text']].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data preprocessing done ✅
                                                text  \
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...   
1  is upset that he can't update his Facebook by ...   
2  @Kenichan I dived many times for the ball. Man...   
3    my whole body feels itchy and like its on fire    
4  @nationwideclass no, it's not behaving at all....   

                                          clean_text  
0      thats bummer shoulda got david carr third day  
1  upset cant update facebook texting might cry r...  
2  dived many times ball managed save rest go bounds  
3                   whole body feels itchy like fire  
4                           behaving im mad cant see  


In [24]:
# 3. Vectorize Text
from sklearn.feature_extraction.text import TfidfVectorizer

X = df['clean_text']
y = df['target']

vectorizer = TfidfVectorizer(max_features=5000)
X_vec = vectorizer.fit_transform(X)

print("Text vectorization completed ✅")


Text vectorization completed ✅


In [25]:
# 4. Train Model
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

# Train Logistic Regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

print("Model training done ✅")


Model training done ✅


In [26]:
# 5. Evaluate Model
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

print("Model Evaluation Results ✅")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Model Evaluation Results ✅
Accuracy: 0.77343125

Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.75      0.77    159494
           1       0.76      0.80      0.78    160506

    accuracy                           0.77    320000
   macro avg       0.77      0.77      0.77    320000
weighted avg       0.77      0.77      0.77    320000



In [27]:
# 6. Predict on New Data
def predict_sentiment(tweet):
    tweet = clean_text(tweet)
    tweet_vec = vectorizer.transform([tweet])
    prediction = model.predict(tweet_vec)
    return "Positive 😀" if prediction[0] == 1 else "Negative 😞"

# Example predictions
print(predict_sentiment("I love this product! It's amazing."))
print(predict_sentiment("This is the worst thing ever."))


Positive 😀
Negative 😞
