<a href="https://colab.research.google.com/github/manishsahu001/AI-ML-DL/blob/main/NLP_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# NLP For Machine Learning

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset from train.txt. The file uses ';' as the separator (sep) and has no header row, so we name the columns 'text' and 'emotions'.
df = pd.read_csv("train.txt", sep=';', header=None, names=['text', 'emotions'])

In [None]:
df.head()

In [None]:
# Checking null values
df.isnull().sum()

In [None]:
# Checking unique emotions
df['emotions'].unique()

# Text Cleaning

In [None]:
# Encode each emotion label as an integer id.
# Ids are assigned in the order the labels first appear in the column,
# which is the order Series.unique() returns them in.
unique_emotions = df['emotions'].unique()
emotion_numbers = {emo: idx for idx, emo in enumerate(unique_emotions)}

# Replace the string labels with their integer ids
df['emotions'] = df['emotions'].map(emotion_numbers)

In [None]:
df.head()

In [None]:
# Convert the text to lowercase.
# The vectorised .str accessor avoids a Python-level lambda per row and
# passes missing values through instead of raising on them.
df['text'] = df['text'].str.lower()

In [None]:
# Remove punctuation
import string

# Deletion table for str.translate, built once rather than on every call
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def remove_punc(txt):
  """Return ``txt`` with every character in ``string.punctuation`` deleted.

  Characters are removed, not replaced by a space, so the text on either
  side of a punctuation mark is joined together (``"can't"`` -> ``"cant"``).
  """
  return txt.translate(_PUNCTUATION_TABLE)

In [None]:
# applying punctuation function in the dataset text
df['text'] = df['text'].apply(remove_punc)

In [None]:
df.head()

In [None]:
# Remove numbers
def remove_numbers(txt):
  """Return ``txt`` with every digit character removed.

  A character is dropped when ``str.isdigit()`` is true for it; all other
  characters, including whitespace, are kept in their original order.
  """
  # join() assembles the result in one pass instead of growing a string
  # one character at a time
  return "".join(ch for ch in txt if not ch.isdigit())

In [None]:
df['text'] = df['text'].apply(remove_numbers)

In [None]:
df.head()

In [None]:
# TODO: remove URLs from the text
# TODO: remove HTML tags from the text

In [None]:
# Remove emojis and other non-ASCII characters from text
def remove_emojis(txt):
  """Return ``txt`` keeping only its ASCII characters.

  Every character for which ``str.isascii()`` is false is dropped. That
  covers emojis, but also any other non-ASCII character such as accented
  letters (``"café"`` -> ``"caf"``).
  """
  # join() assembles the result in one pass instead of growing a string
  # one character at a time
  return "".join(ch for ch in txt if ch.isascii())

In [None]:
df['text'] = df['text'].apply(remove_emojis)

In [None]:
# Remove stop words such as "is", "was", "are", etc. using the NLTK library (Natural Language Toolkit)
import nltk

In [None]:
# import the stopwords corpus from nltk
from nltk.corpus import stopwords
# import the word tokenizer
from nltk.tokenize import word_tokenize

In [None]:
# Download the NLTK resources this notebook relies on:
#   punkt      - tokenizer data (presumably what word_tokenize loads - confirm)
#   stopwords  - the corpus read through nltk.corpus.stopwords
#   punkt_tab  - NOTE(review): looks like the tokenizer data variant that newer
#                NLTK releases ask for - confirm against the installed version
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

In [None]:
# Once the downloads are done, load the English stop words into a set
stop_words = set(stopwords.words('english'))

In [None]:
# Look at one sample text before the stop words are removed.
# A single .loc[row, column] lookup avoids chained indexing.
df.loc[1, 'text']

In [None]:
# Drop stop words from a piece of text
def remove_stop_word(txt):
  """Tokenize ``txt`` and rejoin the tokens that are not stop words.

  Tokens are produced by ``word_tokenize``; any token found in the
  notebook-level ``stop_words`` collection is discarded and the remaining
  tokens are joined with single spaces.
  """
  kept_tokens = [token for token in word_tokenize(txt) if token not in stop_words]
  return ' '.join(kept_tokens)

In [None]:
# apply the remove stop word function
df['text'] = df['text'].apply(remove_stop_word)

In [None]:
# Same sample text after stop-word removal (single .loc lookup, no chained indexing)
df.loc[1, 'text']

In [None]:
df.head()

In [None]:
# Split the data for training and evaluation: 20% of the rows are held out
# as the test set, and the fixed random_state keeps the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
  df['text'],
  df['emotions'],
  test_size=0.20,
  random_state=42,
)

In [None]:
# Before training the model we need to vectorize the data: the emotions are already numeric, so nothing more is needed there, but the text must be converted into vectors
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

In [None]:
# Creating the vectorizer using CountVectorizer (Bag of Words)
bow_vectorizer = CountVectorizer()

In [None]:
# Converting the text to vectors
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

In [None]:
# Now we will train both Naive Bayes and Logistic Regression and compare them to find the best model. For Naive Bayes we cannot use GaussianNB; we need to use MultinomialNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [None]:
nb_model = MultinomialNB()

In [None]:
nb_model.fit(X_train_bow, y_train)

In [None]:
y_pred_nb = nb_model.predict(X_test_bow)

In [None]:
# checking accuracy score
accuracy_score(y_test, y_pred_nb)

In [None]:
# With the TF-IDF model
tfidf_vectorizer = TfidfVectorizer()

In [None]:
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# naive bayes model
nb2_model = MultinomialNB()

In [None]:
nb2_model.fit(X_train_tfidf, y_train)

In [None]:
# prediction with the model
y_pred_tfidf_nb = nb2_model.predict(X_test_tfidf)

In [None]:
# Checking accuracy score
accuracy_score(y_test, y_pred_tfidf_nb)

In [None]:
# Logistic Regression on the CountVectorizer (bag-of-words) features
from sklearn.linear_model import LogisticRegression

log_model = LogisticRegression(max_iter=1000)

# fit() hands back the fitted estimator, so training and prediction are chained
y_pred_bow_log = log_model.fit(X_train_bow, y_train).predict(X_test_bow)

# Accuracy on the held-out split; shown as the cell output
accuracy_score(y_test, y_pred_bow_log)

In [None]:
# Logistic Regression on the TF-IDF features
log2_model = LogisticRegression(max_iter=1000)

# fit() hands back the fitted estimator, so training and prediction are chained
y_pred_tfidf_log = log2_model.fit(X_train_tfidf, y_train).predict(X_test_tfidf)

# Accuracy on the held-out split; shown as the cell output
accuracy_score(y_test, y_pred_tfidf_log)

In [157]:
# Using SVM
from sklearn.svm import SVC
svm_model = SVC()


In [158]:
svm_model.fit(X_train_bow, y_train)

In [159]:

y_pred_svm_bow = svm_model.predict(X_test_bow)


In [161]:
accuracy_score(y_test, y_pred_svm_bow)

0.8225