<a href="https://colab.research.google.com/github/nishita-502/Twitter-Analysis/blob/main/twitter_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Importing Libraries**

In [None]:
# Core data / plotting stack used by the notebook.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this installs a blanket "ignore" filter for every warning
# category, so deprecation and pandas copy warnings raised by later cells
# will not be shown. Consider narrowing it to specific categories.
warnings.filterwarnings('ignore')

**Loading Dataset**

In [None]:
# Load the raw tweet CSV from the Colab working directory.
# NOTE(review): latin-1 never fails to decode, but if the file is really
# UTF-8 the non-ASCII characters will come out garbled — confirm the encoding.
df = pd.read_csv(
    "/content/twitter_training.csv",
    encoding="latin-1",
)
df.head()

Unnamed: 0,Twitter ID,Entity,Label,Text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [None]:
# Keep only the target column and the raw tweet text.
new_df = df.loc[:, ["Label", "Text"]]
new_df.head()

Unnamed: 0,Label,Text
0,Positive,im getting on borderlands and i will murder yo...
1,Positive,I am coming to the borders and I will kill you...
2,Positive,im getting on borderlands and i will kill you ...
3,Positive,im coming on borderlands and i will murder you...
4,Positive,im getting on borderlands 2 and i will murder ...


In [None]:
# Discard every row that has a missing Label or Text value.
new_df = new_df.dropna(axis=0, how="any")

In [None]:
# (rows, columns) of the frame after rows with missing values were dropped.
new_df.shape

(73996, 2)

**Importing more libraries**

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string

**Downloading required NLTK data**

In [None]:
# Download required NLTK data
# - 'punkt' / 'punkt_tab': tokenizer resources (unzipped under tokenizers/),
#   needed by word_tokenize.
# - 'stopwords': the corpus read through stopwords.words('english').
# - 'wordnet' / 'omw-1.4': presumably the lexical data WordNetLemmatizer
#   relies on — verify against the installed NLTK version.
# The final call is the cell's last expression, so its return value is
# displayed as the cell output.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
class SentimentAnalyzer:
    """TF-IDF + logistic-regression classifier for labelled tweets.

    Workflow: ``prepare_data`` (clean text, encode labels) -> ``train`` ->
    ``evaluate`` / ``predict``.
    """

    # Single source of truth for the label encoding. prepare_data, evaluate
    # and predict all derive from it so the three cannot drift apart.
    LABEL_TO_ID = {'Positive': 1, 'Negative': 0, 'Neutral': 2, 'Irrelevant': 3}
    ID_TO_LABEL = {code: name for name, code in LABEL_TO_ID.items()}

    # Built once instead of on every preprocess_text call.
    _URL_RE = re.compile(r'https?://\S+|www\.\S+')
    _MENTION_HASHTAG_RE = re.compile(r'@\w+|#\w+')
    _PUNCT_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self):
        """Create the vectorizer, classifier, lemmatizer and stop-word set."""
        self.vectorizer = TfidfVectorizer(max_features=5000)
        self.classifier = LogisticRegression(max_iter=1000)
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def preprocess_text(self, text):
        """Clean and preprocess the text data.

        Lower-cases the input, strips URLs, @mentions, #hashtags and ASCII
        punctuation, tokenizes, drops stop words and non-alphanumeric tokens,
        then lemmatizes what is left.

        Args:
            text: A single raw text value; missing values (per ``pd.isna``)
                are accepted.

        Returns:
            The cleaned tokens joined by single spaces, or ``""`` when the
            input is missing or nothing survives cleaning.
        """
        if pd.isna(text):
            return ""
        text = str(text).lower()
        text = self._URL_RE.sub('', text)
        # Mentions and hashtags are removed together with the word after the
        # marker, not just the '@' / '#' character.
        text = self._MENTION_HASHTAG_RE.sub('', text)
        text = text.translate(self._PUNCT_TABLE)
        tokens = word_tokenize(text)
        tokens = [self.lemmatizer.lemmatize(token) for token in tokens
                  if token not in self.stop_words and token.isalnum()]
        return ' '.join(tokens)

    def prepare_data(self, df):
        """Prepare the dataset for training.

        Args:
            df: DataFrame with a ``Label`` column (label names from
                ``LABEL_TO_ID``, or already-encoded integers) and a ``Text``
                column. The input frame is not modified.

        Returns:
            A new DataFrame with ``Label`` encoded as int, an added
            ``processed_text`` column, and rows whose processed text is
            empty removed.

        Raises:
            ValueError: If a remaining ``Label`` value cannot be converted
                to an integer (e.g. an unknown label name).
        """
        prepared_df = df.copy()
        # Explicit lookup instead of Series.replace: avoids the deprecated
        # implicit downcast of an object column and leaves values that are
        # already encoded untouched.
        prepared_df['Label'] = prepared_df['Label'].map(
            lambda value: self.LABEL_TO_ID.get(value, value)
        )
        print("Preprocessing texts...")
        prepared_df['processed_text'] = prepared_df['Text'].apply(self.preprocess_text)
        has_text = prepared_df['processed_text'] != ""
        # .copy() so the assignment below writes to an independent frame
        # rather than to a slice of the unfiltered one.
        prepared_df = prepared_df.loc[has_text].copy()
        prepared_df['Label'] = prepared_df['Label'].astype(int)
        return prepared_df

    def train(self, X_train, y_train):
        """Train the sentiment analysis model.

        Fits the TF-IDF vectorizer on ``X_train`` and then the classifier on
        the resulting matrix.

        Args:
            X_train: Iterable of preprocessed text documents.
            y_train: Integer-encoded labels aligned with ``X_train``.
        """
        print("Vectorizing text...")
        X_train_vectorized = self.vectorizer.fit_transform(X_train)
        print("Training classifier...")
        self.classifier.fit(X_train_vectorized, y_train)

    def evaluate(self, X_test, y_test):
        """Evaluate the model performance.

        Prints a classification report and a confusion matrix.

        Args:
            X_test: Iterable of preprocessed text documents.
            y_test: Integer-encoded true labels aligned with ``X_test``.

        Returns:
            The predictions returned by the classifier for ``X_test``.
        """
        X_test_vectorized = self.vectorizer.transform(X_test)
        predictions = self.classifier.predict(X_test_vectorized)
        # Pass the label ids explicitly so the names stay aligned with the
        # codes and the report still works if a class is absent from the
        # evaluation split.
        label_ids = sorted(self.ID_TO_LABEL)
        label_names = [self.ID_TO_LABEL[code] for code in label_ids]
        print("\nClassification Report:")
        print(classification_report(y_test, predictions,
                                    labels=label_ids, target_names=label_names))
        print("\nConfusion Matrix:")
        print(confusion_matrix(y_test, predictions, labels=label_ids))
        return predictions

    def predict(self, texts):
        """Predict sentiment for new texts.

        Args:
            texts: Iterable of raw text strings. A single string is treated
                as one document rather than iterated character by character.

        Returns:
            A list of label names, one per input text, in input order.
            An empty input yields an empty list.
        """
        if isinstance(texts, str):
            texts = [texts]
        processed_texts = [self.preprocess_text(text) for text in texts]
        if not processed_texts:
            return []
        vectorized_texts = self.vectorizer.transform(processed_texts)
        predictions = self.classifier.predict(vectorized_texts)
        # ID_TO_LABEL covers every trained class, including 'Irrelevant' (3).
        return [self.ID_TO_LABEL[int(pred)] for pred in predictions]


In [None]:
def main():
    """Run the end-to-end demo: prepare, split, train, evaluate, predict.

    Reads the notebook-level ``new_df`` frame, prints the label distribution,
    trains on a stratified 80/20 split, prints the evaluation output and
    finally prints predictions for a few hand-written sentences.
    """
    model = SentimentAnalyzer()
    dataset = model.prepare_data(new_df)

    print("\nLabel distribution after preprocessing:")
    print(dataset['Label'].value_counts())

    documents = dataset['processed_text']
    targets = dataset['Label']
    # Stratify on the label so both splits keep the class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        documents, targets, test_size=0.2, random_state=42, stratify=targets
    )

    model.train(X_train, y_train)
    # evaluate() prints its report; the returned predictions are not used here.
    model.evaluate(X_test, y_test)

    new_texts = [
        "This is absolutely amazing!",
        "I'm not sure how I feel about this",
        "This is the worst experience ever",
    ]
    print("\nPredictions for new texts:")
    sample_labels = model.predict(new_texts)
    for text, pred in zip(new_texts, sample_labels):
        print(f"Text: {text}")
        print(f"Prediction: {pred}\n")


if __name__ == "__main__":
    main()

Preprocessing texts...

Label distribution after preprocessing:
Label
0    21901
1    20157
2    17640
3    12616
Name: count, dtype: int64
Vectorizing text...
Training classifier...

Classification Report:
              precision    recall  f1-score   support

    Negative       0.70      0.79      0.74      4380
    Positive       0.68      0.72      0.70      4032
     Neutral       0.67      0.62      0.64      3528
  Irrelevant       0.66      0.52      0.58      2523

    accuracy                           0.68     14463
   macro avg       0.68      0.66      0.67     14463
weighted avg       0.68      0.68      0.68     14463


Confusion Matrix:
[[3439  363  363  215]
 [ 473 2922  417  220]
 [ 574  531 2183  240]
 [ 457  461  292 1313]]

Predictions for new texts:
Text: This is absolutely amazing!
Prediction: Positive

Text: I'm not sure how I feel about this
Prediction: Neutral

Text: This is the worst experience ever
Prediction: Negative

