<a href="https://colab.research.google.com/github/nagarjuna741621/ML/blob/main/Naive_Bayes_AP23110011370.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TASK #1: UNDERSTAND THE PROBLEM AND BUSINESS CASE

In [None]:
# Mount Google Drive only when running on Colab. Outside Colab the
# google.colab package does not exist, and an unguarded import would stop
# "Restart & Run All" at the very first cell.
try:
    from google.colab import drive
except ImportError:
    print('google.colab not available - skipping Google Drive mount')
else:
    drive.mount('/content/drive')

![image.png](attachment:image.png)

In [None]:
# Data Source: https://www.kaggle.com/samdeeplearning/deepnlp

![image.png](attachment:image.png)

# TASK #2: IMPORT LIBRARIES AND DATASETS

In [None]:
# install nltk
!pip install nltk

In [None]:
# install gensim
!pip install gensim

In [None]:
!pip install jupyterthemes

In [None]:
!pip install wordcloud

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS
import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Set the plotting style to the monokai theme.
# This matters for readability: without it the xlabel and ylabel of any plot
# are drawn black on black and are hard to see.
from jupyterthemes import jtplot
theme_options = dict(theme='monokai', context='notebook', ticks=True, grid=False)
jtplot.style(**theme_options)


In [None]:
# Load the resume dataset, decoding the file as latin-1
csv_path = 'resume.csv'
resume_df = pd.read_csv(csv_path, encoding='latin-1')
resume_df

In [None]:
# Keep only the resume text and its label.
# .copy() makes this an independent frame, so the column assignments made in
# later cells modify it directly instead of raising SettingWithCopyWarning.
resume_df = resume_df[['resume_text','class']].copy()
resume_df

MINI CHALLENGE #1:
- Print the first and last elements in the dataframe.

In [None]:
# Show the first ten and the last ten rows, in that order
for preview in (resume_df.head(10), resume_df.tail(10)):
    print(preview)

# TASK #3: PERFORM EXPLORATORY DATA ANALYSIS

In [None]:
# Summarize the frame: column names, non-null counts and dtypes
resume_df.info()

In [None]:
# Count the missing values in each column
resume_df.isnull().sum()

In [None]:
# Number of rows per distinct value of the 'class' column
resume_df['class'].value_counts()

In [None]:
# Encode the label as an integer: 'not_flagged' -> 0, anything else -> 1.
# The dtype guard makes this cell safe to re-run: without it a second run
# would compare the already-encoded 0/1 values against 'not_flagged' and turn
# every label into 1.
labels = resume_df['class']
if not pd.api.types.is_integer_dtype(labels):
    labels = labels.ne('not_flagged').astype(int)
# assign() builds a new frame rather than writing into what may be a slice of
# the loaded frame, which avoids SettingWithCopyWarning.
resume_df = resume_df.assign(**{'class': labels})
resume_df['class'].value_counts()

MINI CHALLENGE #2:
- Divide the DataFrame into two, one that belongs to class 0 and 1. Do we have a balanced dataset?

In [None]:
# Rows whose label is 0
is_class_0 = resume_df['class'].eq(0)
class_0_df = resume_df.loc[is_class_0]
class_0_df

In [None]:
# Rows whose label is 1
is_class_1 = resume_df['class'].eq(1)
class_1_df = resume_df.loc[is_class_1]
class_1_df

# TASK #4: PERFORM DATA CLEANING

In [None]:
# Remove carriage-return characters from the resume text.
# The vectorised .str accessor replaces the per-row lambda and leaves missing
# values as NaN instead of raising AttributeError on them; regex=False makes
# the pattern a literal string.
resume_df['resume_text'] = resume_df['resume_text'].str.replace('\r', '', regex=False)
resume_df

In [None]:
# Download the nltk 'punkt' resource
nltk.download('punkt')

In [None]:
# Download the nltk 'stopwords' corpus (read below via stopwords.words('english'))
nltk.download("stopwords")

In [None]:
# Start from nltk's English stop words and append a few extra terms
from nltk.corpus import stopwords
extra_stop_words = ['from','subject','re','use','email','com']
stop_words = stopwords.words('english') + extra_stop_words

In [None]:
# Remove stop words and remove words with 2 or less characters
def preprocess(text):
    """Tokenize ``text`` and return the kept tokens joined by single spaces.

    Tokens are produced by ``gensim.utils.simple_preprocess``. A token is kept
    only if it is longer than 2 characters and appears in neither gensim's
    ``STOPWORDS`` nor the notebook-level ``stop_words`` list.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Space-separated kept tokens; an empty string if none survive.
    """
    # Merge both stop-word sources into one set so each token costs a single
    # hash lookup instead of a linear scan through the stop_words list.
    blocked = gensim.parsing.preprocessing.STOPWORDS.union(stop_words)
    kept = [
        token
        for token in gensim.utils.simple_preprocess(text)
        if len(token) > 2 and token not in blocked
    ]
    return ' '.join(kept)

In [None]:
# Run every resume through preprocess() and store the result as 'cleaned'
raw_text = resume_df['resume_text']
resume_df['cleaned'] = raw_text.map(preprocess)

In [None]:
# Display the frame, which now carries the 'cleaned' column
resume_df

In [None]:
# Cleaned text of the row labelled 0
print(resume_df.loc[0, 'cleaned'])

In [None]:
# Original text of the row labelled 0, for comparison with its cleaned form
print(resume_df.loc[0, 'resume_text'])

# TASK #5: VISUALIZE CLEANED DATASET

In [None]:
# Bar chart of how many resumes fall in each class (flagged vs not flagged)
ax = sns.countplot(x=resume_df['class'], label = 'Count Plot')
ax.set_title('Distribution of Resume Classes')
ax.set_xlabel('Class')
ax.set_ylabel('Count')
plt.show()

In [None]:
# plot the word cloud for text that is flagged
# Join the full cleaned text of every flagged resume. str(Series) would give
# only the truncated console repr (index labels, '...' elisions and the
# 'Name: cleaned, dtype: object' footer), so the cloud would be built from
# clipped snippets plus that boilerplate.
flagged_text = ' '.join(resume_df.loc[resume_df['class'] == 1, 'cleaned'])
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800 , stopwords = set(stop_words)).generate(flagged_text)
plt.imshow(wc , interpolation = 'bilinear')
plt.axis('off')  # pixel coordinates carry no meaning for a word cloud
plt.show()

MINI CHALLENGE #3:
- Plot the wordcloud for class #0 (not flagged)

In [None]:
# plot the word cloud for text that is not flagged
# Join the full cleaned text of every class-0 resume. str(Series) would give
# only the truncated console repr (index labels, '...' elisions and the
# 'Name: cleaned, dtype: object' footer), so the cloud would be built from
# clipped snippets plus that boilerplate.
not_flagged_text = ' '.join(resume_df.loc[resume_df['class'] == 0, 'cleaned'])
plt.figure(figsize = (20,20))
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800 , stopwords = set(stop_words)).generate(not_flagged_text)
plt.imshow(wc , interpolation = 'bilinear')
plt.axis('off')  # pixel coordinates carry no meaning for a word cloud
plt.show()

# TASK #6: PREPARE THE DATA BY APPLYING COUNT VECTORIZER

![image.png](attachment:image.png)

In [None]:
# Toy demonstration of CountVectorizer on three short strings
from sklearn.feature_extraction.text import CountVectorizer

sample_data = [
    'Hello World',
    'Hello Hello Hello World world',
    'Hello Hello World world world World',
]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sample_data)

# Learned vocabulary first, then the per-document count matrix
for shown in (vectorizer.get_feature_names_out(), X.toarray()):
    print(shown)

In [None]:
# Learn the vocabulary from the cleaned resumes and build the count matrix X
# (this replaces the vectorizer and X from the toy example).
# NOTE(review): the vocabulary is fitted on every resume, including rows that
# are later placed in the test split.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(resume_df['cleaned'])

In [None]:
# Vocabulary learned by CountVectorizer from the cleaned text
print(vectorizer.get_feature_names_out())

In [None]:
# Print the count matrix as a dense array (toarray() expands the sparse result)
print(X.toarray())

# TASK #7: UNDERSTAND THE THEORY AND INTUITION BEHIND NAIVE BAYES CLASSIFIERS - PART #1

# TASK #8: UNDERSTAND THE THEORY AND INTUITION BEHIND NAIVE BAYES CLASSIFIERS - PART #2

![image.png](attachment:image.png)

![image.png](attachment:image.png)

![image.png](attachment:image.png)

MINI CHALLENGE #4:
- Calculate the probability of the red class (non-retiring).

# TASK #9: TRAIN NAIVE BAYES CLASSIFIER MODEL

In [None]:
# Dimensions of the count matrix
X.shape

In [None]:
# Target vector: the 'class' column (encoded as 0/1 in an earlier cell)
y = resume_df['class']

In [None]:
# Length of the target vector; should match the row count of X
y.shape

In [None]:
from sklearn.model_selection import train_test_split
# 80% train / 20% test.
# random_state fixes the shuffle so every run produces the same split;
# stratify=y keeps the class 0 / class 1 ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42, stratify = y)

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes fitted on the training word counts
NB_classifier = MultinomialNB()
NB_classifier.fit(X = X_train, y = y_train)

MINI CHALLENGE #5:
- Split the data into 25% testing and 75% training and perform a sanity check



In [None]:
# 75% train / 25% test, with a fixed seed so the split is reproducible and
# stratified so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 42, stratify = y)

# Refit on the new training rows. The classifier fitted earlier saw a
# different split, so some of these test rows would have been part of its
# training data and the evaluation that follows would be optimistic.
NB_classifier.fit(X_train, y_train)

# Sanity check: report the shapes and verify that the parts add up
print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of y_test:", y_test.shape)
assert X_train.shape[0] + X_test.shape[0] == X.shape[0]
assert X_train.shape[0] == y_train.shape[0]
assert X_test.shape[0] == y_test.shape[0]

# TASK #10: ASSESS TRAINED MODEL PERFORMANCE


![image.png](attachment:image.png)

In [None]:
# Predicting the performance on train data
y_predict_train = NB_classifier.predict(X_train)
cm = confusion_matrix(y_train, y_predict_train)
# fmt='d' prints the counts as plain integers; the default format switches to
# scientific notation (e.g. 1e+02) once a count reaches three digits.
ax = sns.heatmap(cm, annot = True, fmt = 'd')
# confusion_matrix puts the true labels on rows and predictions on columns
ax.set(title = 'Confusion matrix (train)', xlabel = 'Predicted class', ylabel = 'True class')
plt.show()

In [None]:
# Predicting the Test set results
y_predict_test = NB_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_predict_test)
# fmt='d' prints the counts as plain integers; the default format switches to
# scientific notation (e.g. 1e+02) once a count reaches three digits.
ax = sns.heatmap(cm, annot = True, fmt = 'd')
# confusion_matrix puts the true labels on rows and predictions on columns
ax.set(title = 'Confusion matrix (test)', xlabel = 'Predicted class', ylabel = 'True class')
plt.show()

In [None]:
# Per-class precision, recall and F1 on the test split
test_report = classification_report(y_test, y_predict_test)
print(test_report)

MINI CHALLENGE #6:
- Retrain the model after splitting the data into 30% testing and 70% training and assess model performance


In [None]:
from sklearn.model_selection import train_test_split
# 70% train / 30% test. random_state makes the split (and therefore the
# reported metrics) reproducible; stratify=y keeps the class ratio the same
# in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42, stratify = y)

# Fit a fresh classifier on the new training rows only
NB_classifier = MultinomialNB()
NB_classifier.fit(X_train, y_train)

# Predicting the Test set results
y_predict_test = NB_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_predict_test)
# fmt='d' prints the counts as plain integers instead of scientific notation
ax = sns.heatmap(cm, annot = True, fmt = 'd')
# confusion_matrix puts the true labels on rows and predictions on columns
ax.set(title = 'Confusion matrix (test, 70/30 split)', xlabel = 'Predicted class', ylabel = 'True class')
plt.show()

# classification report
print(classification_report(y_test, y_predict_test))

# GREAT JOB!

# MINI CHALLENGES SOLUTIONS

MINI CHALLENGE #1 SOLUTION:
- Print the first and last elements in the dataframe.

In [None]:
# First five rows of the frame
resume_df.head()

In [None]:
# Last five rows of the frame
resume_df.tail()

MINI CHALLENGE #2 SOLUTION:
- Divide the DataFrame into two, one that belongs to class 0 and 1. Do we have a balanced dataset?

In [None]:
# Rows whose label is 0
is_class_0 = resume_df['class'].eq(0)
class_0_df = resume_df.loc[is_class_0]
class_0_df

In [None]:
# Rows whose label is 1
is_class_1 = resume_df['class'].eq(1)
class_1_df = resume_df.loc[is_class_1]
class_1_df

MINI CHALLENGE #3 SOLUTION:
- Plot the wordcloud for class #0 (not flagged)

In [None]:
# plot the word cloud for text that is not flagged
# Join the full cleaned text of every class-0 resume. str(Series) would give
# only the truncated console repr (index labels, '...' elisions and the
# 'Name: cleaned, dtype: object' footer), so the cloud would be built from
# clipped snippets plus that boilerplate.
not_flagged_text = ' '.join(resume_df.loc[resume_df['class'] == 0, 'cleaned'])
plt.figure(figsize = (20, 20))
wc = WordCloud(max_words = 2000 , width = 1600 , height = 800 , stopwords = set(stop_words)).generate(not_flagged_text)
plt.imshow(wc , interpolation = 'bilinear')
plt.axis('off')  # pixel coordinates carry no meaning for a word cloud
plt.show()

MINI CHALLENGE #4 SOLUTION:
- Calculate the probability of the red class (non-retiring).

![image.png](attachment:image.png)

MINI CHALLENGE #5 SOLUTION:
- Split the data into 25% testing and 75% training and perform a sanity check


In [None]:
# 75% train / 25% test. random_state fixes the shuffle so the split is
# reproducible; stratify=y keeps the class ratio the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, stratify=y)

MINI CHALLENGE #6 SOLUTION:
- Retrain the model after splitting the data into 30% testing and 70% training and assess model performance


In [None]:
from sklearn.model_selection import train_test_split
# 70% train / 30% test. random_state makes the split (and therefore the
# reported metrics) reproducible; stratify=y keeps the class ratio the same
# in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42, stratify = y)

# Fit a fresh classifier on the new training rows only
NB_classifier = MultinomialNB()
NB_classifier.fit(X_train, y_train)

# Predicting the Test set results
y_predict_test = NB_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_predict_test)
# fmt='d' prints the counts as plain integers instead of scientific notation
ax = sns.heatmap(cm, annot = True, fmt = 'd')
# confusion_matrix puts the true labels on rows and predictions on columns
ax.set(title = 'Confusion matrix (test, 70/30 split)', xlabel = 'Predicted class', ylabel = 'True class')
plt.show()

# classification report
print(classification_report(y_test, y_predict_test))