<a href="https://colab.research.google.com/github/pankajmahato907/AI_Workshops/blob/main/Workshop8_Text_Pre_Processing_Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [3]:
# Download required NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
# Set up stopwords
stop_words = set(stopwords.words('english'))
custom_stopwords = ['@', 'RT','rt']
stop_words.update(custom_stopwords)

In [None]:
datapath = "/content/drive/MyDrive/Ai/trum_tweet_sentiment_analysis.csv"
df = pd.read_csv(datapath)


In [None]:
example = df['text'][52]
print(example)

RT @cnnbrk: A federal appeals court denied the US government's initial request to resume President Donald Trump's travel ban. https://t.co/gB70irt1AX


In [None]:
import re
def remove_urls(text):
  """
  This function will try to remove URL present in out dataset and replace it with space using regex library.
  Input Args:
  text: strings of text that may contain URLs.
  Output Args:
  text: URLs replaces with text
  """
  url_pattern = re.compile(r'https?://\S+|www\.\S+')
  return url_pattern.sub(r'', text)


In [None]:
def remove_emoji(string):
  """
  This function will replace the emoji in string with whitespace
  """
  emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
  return emoji_pattern.sub(r' ', string)

In [None]:
def removeunwanted_characters(document):
  """
  This function will remove all the unwanted characters from the input dataset.
  Input Args:
  documet: A text data to be cleaned.
  Return:
  A cleaned document.
  """
  # remove user mentions
  document = re.sub("@[A-Za-z0-9_]+"," ", document)
  # remove hashtags
  document = re.sub("#[A-Za-z0-9_]+","", document)
  # remove punctuation
  document = re.sub("[^0-9A-Za-z ]", "" , document)
  # remove double spaces
  document = document.replace('  ',"")
  return document.strip()

In [None]:

def remove_stopwords(text_tokens):
  """
  This function removes all the stopwords present in out text tokens.
  Input Args:
  text_tokens: tokenize input of our datasets.
  Returns:
  result_tokens: list of token without stopword.
  """

  result_tokens = []
  for token in text_tokens:
    if token not in stop_words:
       result_tokens.append(token)
  return result_tokens

In [None]:
def lemmatization(token_text):
  """
  This function performs the lemmatization operations as explained above.
  Input Args:
  token_text: list of tokens.
  Returns:
  lemmatized_tokens: list of lemmatized tokens.
  """
  wordnet = WordNetLemmatizer()
  lemmatized_tokens = [wordnet.lemmatize(token, pos = 'v') for token in token_text]

  return lemmatized_tokens




In [None]:
def stemming(text):
  """
  This function performs stemming operations.
  Input Args:
  token_text: list of tokenize text.
  Returns:
  stemm_tokes: list of stemmed tokens.
  """
  porter = PorterStemmer()
  stemm_tokens = []
  for word in text:
    stemm_tokens.append(porter.stem(word))
  return stemm_tokens

In [None]:
def text_cleaning_pipeline(text, rule = "lemmatize"):
  """
  This function applies a complete text cleaning pipeline including:
  - Converting to lowercase
  - Removing URLs
  - Removing emojis
  - Removing unwanted characters
  - Tokenization
  - Removing stopwords
  - Lemmatization or stemming (based on rule parameter)

  Input Args:
  text: Input text string to be cleaned
  rule: "lemmatize" or "stem" to specify which text normalization to use

  Returns:
  Cleaned and normalized text as a string
  """
  # Convert the input to small/lower order.
  data = text.lower()
  # Remove URLs
  data = remove_urls(data)
  # Remove emojis
  data = remove_emoji(data)
  # Remove all other unwanted characters.
  data = removeunwanted_characters(data)
  # Create tokens.
  tokens = data.split()
  # Remove stopwords:
  tokens = remove_stopwords(tokens)

  if rule == "lemmatize":
    tokens = lemmatization(tokens)
  elif rule == "stem":
    tokens = stemming(tokens)
  else:
    print("Pick between lemmatize or stem")

  return " ".join(tokens)

In [None]:
# Apply the text cleaning pipeline to the dataset
print("Step 2: Applying text cleaning pipeline...")
df['cleaned_text'] = df['text'].apply(lambda x: text_cleaning_pipeline(x, rule="lemmatize"))
print("Text cleaning completed.")

Step 2: Applying text cleaning pipeline...
Text cleaning completed.


In [None]:
#display a sample of cleaned and original text
df[['text', 'cleaned_text']].sample(10)

Unnamed: 0,text,cleaned_text
399376,"RT @michikokakutani: NYT Editorial: ""Rather th...",rt nyt editorial rather endorse american excep...
1351498,"RT @greenhousenyt: From 1975 to 2015, terroris...",rt 1975 2015 terrorists trump 7 muslim nations...
1808532,RT @TheMarkRomano: Obama kept a lot of info ab...,rt obama keep lot info awful iran deal hide pu...
101147,RT @5osdd: Trump is shutting Iran down\r\r\n#T...,rt trump shut iran
1391768,FT: The top Democrat in the House of Represent...,ft top democrat house representatives call don...
1176854,RT @mikandynothem: It is breathtaking how Libe...,rt breathtaking liberal america forget danger ...
157974,"RT @Alexey__Kovalev: Solovyov: ""There will be ...",rt solovyov thorough investigation ukraines me...
158573,DON'T WE ALL JUST LOVE TRUMP AND PENCE? THEY'...,dont love trump pencetheyre like new bffs
1311751,RT @dawnbazely: Donald Trump Picked a Twitter ...,rt donald trump pick twitter fight mark cuban ...
632401,Two administrations basically calling intellig...,two administrations basically call intelligenc...


In [None]:
#splitting the data into train and test set
x = df['cleaned_text']
y = df['Sentiment']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [None]:
#tf-idf vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
X_test_tfidf = tfidf_vectorizer.transform(x_test)

In [None]:
#Model training
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_tfidf, y_train)
print("Model training completed.")

Model training completed.


In [None]:
# Make predictions
print("\nEvaluating model...")
y_pred = lr_model.predict(X_test_tfidf)


Evaluating model...


In [None]:
# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.96      0.94    248563
           1       0.90      0.86      0.88    121462

    accuracy                           0.92    370025
   macro avg       0.92      0.91      0.91    370025
weighted avg       0.92      0.92      0.92    370025



In [None]:
#sample prediction
sample_index = min(1129038	, len(df)-1)
print(sample_index)
print(df['Sentiment'][sample_index])
sample_text = df['text'][sample_index]
sample_cleaned = text_cleaning_pipeline(sample_text)
sample_tfidf = tfidf_vectorizer.transform([sample_cleaned])
sample_pred = lr_model.predict(sample_tfidf)[0]

1129038
1


In [None]:
#print the text where sentiment is 1
df[df['Sentiment'] == 1].sample(10)

Unnamed: 0,text,Sentiment,cleaned_text
1129038,RT AP_Politics: After day of bonding on golf c...,1,appolitics day bond golf course trump japanese...
1462861,"This is so much more believable than any 23,00...",1,much believable 23000word medium blog trump 14...
443120,RT @CBSNews: Documents show Pres. Trump is sti...,1,document show pres trump still position financ...
1162464,"RT @Prime_Politics_: As Trump Tweets, Legal Co...",1,trump tweet legal community turn eye john robe...
843925,RT @nytimes: Trump's wall would take 3 and a h...,1,trump wall would take 3 half years build would...
1061208,RT @davebernstein: This is a MAJOR story... \r...,1,major story confirm many us know long timemich...
1248634,RT @hrkbenowen: RETWEET if you agree with Trum...,1,retweet agree trump aide stephen miller us abs...
1842886,"RT MiddleEastEye ""Climate change: How Trump co...",1,middleeasteye climate change trump could becom...
549871,"RT @RosaCTV: Worth noting, as others have, Don...",1,worth note others donald trump tweet nordstrom...
139736,Excellant way to explain why trump isnt hilter...,1,excellant way explain trump isnt hilter


In [None]:
print(f"Original Text: {sample_text}")
print(f"Cleaned Text: {sample_cleaned}")
print(f"Predicted Sentiment: {sample_pred}")

Original Text: RT AP_Politics: After day of bonding on golf course, Trump and Japanese leader have that alliance tested by launch https://t.co/3iqPOS7zh9
Cleaned Text: appolitics day bond golf course trump japanese leader alliance test launch
Predicted Sentiment: 1
