<a href="https://colab.research.google.com/github/manjushree7/AI-ML/blob/main/2331414_ManjushreeTamang_Workshop8_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive; the dataset
# is later read from a path underneath this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Text Classification using Machine Learning Models


### 📝 Instructions: Trump Tweet Sentiment Classification

1. **Load the Dataset**  
   Load the dataset named `"trump_tweet_sentiment_analysis.csv"` using `pandas`. Ensure the dataset contains at least two columns: `"text"` and a label column (named `"Sentiment"` in this dataset).

2. **Text Cleaning and Tokenization**  
   Apply a text preprocessing pipeline to the `"text"` column. This should include:
   - Lowercasing the text  
   - Removing URLs, mentions, punctuation, and special characters  
   - Removing stopwords  
   - Tokenization (optional: stemming or lemmatization)
   - Complete the `text_cleaning_pipeline` function below so that it performs these steps.

3. **Train-Test Split**  
   Split the cleaned and tokenized dataset into **training** and **testing** sets using `train_test_split` from `sklearn.model_selection`.

4. **TF-IDF Vectorization**  
   Import and use the `TfidfVectorizer` from `sklearn.feature_extraction.text` to transform the training and testing texts into numerical feature vectors.

5. **Model Training and Evaluation**  
   Import **Logistic Regression** (or any machine learning model of your choice) from `sklearn.linear_model`. Train it on the TF-IDF-embedded training data, then evaluate it using the test set.  
   - Print the **classification report** using `classification_report` from `sklearn.metrics`.


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the tweet-sentiment CSV on the mounted Google Drive.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/AI and ML/Week8/trum_tweet_sentiment_analysis.csv'

# Read the whole CSV into a DataFrame.
df = pd.read_csv(DATA_PATH)

In [None]:
# List the column names to confirm which column holds the tweet text and
# which one holds the label.
print(df.columns)

Index(['text', 'Sentiment'], dtype='object')


In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

# Fetch the NLTK resources this notebook asks for (a no-op when they are
# already present locally).
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

# NLTK resources are built once on first use and then shared by every call.
# The cleaning function is applied row by row, so rebuilding the stopword set
# and the lemmatizer/stemmer per call would repeat the same work for each row.
_CLEANING_RESOURCES = {}


def _get_cleaning_resource(name, factory):
    """Return the cached resource `name`, creating it with `factory()` on first use."""
    if name not in _CLEANING_RESOURCES:
        _CLEANING_RESOURCES[name] = factory()
    return _CLEANING_RESOURCES[name]


def text_cleaning_pipeline(dataset, rule="lemmatize"):
    """
    Clean a single piece of text and return it as one space-joined string.

    Steps, in order:
    - Lowercasing
    - Removing URLs and @mentions
    - Removing non-ASCII characters (emojis etc.), punctuation and digits
    - Tokenizing on whitespace
    - Removing English stopwords
    - Lemmatizing or stemming each remaining token

    Parameters
    ----------
    dataset : str
        The raw text to clean (one document, despite the parameter name).
        Non-string values such as None or NaN yield an empty string.
    rule : str, default "lemmatize"
        "lemmatize" to use WordNetLemmatizer, "stem" to use PorterStemmer.
        Any other value prints a hint and leaves the tokens unnormalized.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" when nothing is left).
    """
    # Missing values (None / float NaN) have no .lower(); treat them as empty text.
    if not isinstance(dataset, str):
        return ""

    # Convert to lowercase
    data = dataset.lower()

    # Remove URLs ("https..." is already covered by the "http\S+" alternative)
    data = re.sub(r'http\S+|www\S+', '', data)

    # Remove @mentions. This must happen before punctuation is stripped,
    # otherwise "@user" would survive as the token "user". The lookbehind
    # skips an "@" inside a word (e.g. an e-mail address).
    data = re.sub(r'(?<!\w)@\w+', '', data)

    # Remove emojis (basic pattern: drop every non-ASCII character)
    data = re.sub(r'[^\x00-\x7F]+', '', data)

    # Remove all other unwanted characters (punctuation, numbers, etc.)
    data = re.sub(r'[^a-z\s]', '', data)

    # Create tokens
    tokens = data.split()

    # Remove stopwords
    stop_words = _get_cleaning_resource(
        "stop_words", lambda: set(stopwords.words('english'))
    )
    tokens = [word for word in tokens if word not in stop_words]

    if rule == "lemmatize":
        lemmatizer = _get_cleaning_resource("lemmatizer", WordNetLemmatizer)
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    elif rule == "stem":
        stemmer = _get_cleaning_resource("stemmer", PorterStemmer)
        tokens = [stemmer.stem(word) for word in tokens]
    else:
        print("Pick between lemmatize or stem")

    return " ".join(tokens)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Count missing values in the raw "text" column before the cleaning function
# is applied to it.
print(df["text"].isnull().sum())

0


In [None]:
# Clean every tweet in 'text' with lemmatization and store the result in a new
# 'cleaned_text' column (extra keyword arguments to apply() are forwarded to
# the function).
df["cleaned_text"] = df["text"].apply(text_cleaning_pipeline, rule="lemmatize")

In [None]:
from sklearn.model_selection import train_test_split

# Features are the cleaned tweets; the target is the "Sentiment" column.
X, y = df["cleaned_text"], df["Sentiment"]

# Hold out 20% of the rows for testing. The seed is fixed so the split is
# repeatable, and stratify=y asks for the label proportions to be kept in
# both parts.
split_options = {"test_size": 0.2, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Initialize the TfidfVectorizer with its default settings
vectorizer = TfidfVectorizer()

# Fit the vectorizer on the training texts only and transform them in one
# step, so nothing from the test set influences the fitted vectorizer
X_train_tfidf = vectorizer.fit_transform(X_train)

# Transform the testing data (X_test) using the same, already-fitted vectorizer
X_test_tfidf = vectorizer.transform(X_test)

# Check the shape of the transformed data; both matrices must have the same
# number of columns because they share one vectorizer
print(f"Shape of training data: {X_train_tfidf.shape}")
print(f"Shape of testing data: {X_test_tfidf.shape}")

Shape of training data: (1480098, 264895)
Shape of testing data: (370025, 264895)


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Build the Logistic Regression classifier and train it on the TF-IDF
# training matrix in one expression (fit() returns the fitted estimator).
# max_iter is raised to 1000; adjust it if convergence is not reached.
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Label the held-out tweets
y_pred = model.predict(X_test_tfidf)

# Report per-class precision / recall / F1 on the test set
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.97      0.96    248842
           1       0.94      0.91      0.92    121183

    accuracy                           0.95    370025
   macro avg       0.95      0.94      0.94    370025
weighted avg       0.95      0.95      0.95    370025

