In [2]:
# Naive Bayes and Sentiment Analysis:

In [3]:
# Importing all the necessary libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
# Data Preparation:

In [5]:
df=pd.read_csv('blogs.csv')
df.head()

Unnamed: 0,Data,Labels
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism


In [6]:
# 1. Data Exploration and Preprocessing

In [7]:
df.shape

(2000, 2)

In [8]:
# Checking for missing values:
# There are no missing values found.
df.isna().sum()

Data      0
Labels    0
dtype: int64

In [9]:
import string
import nltk
from nltk.corpus import stopwords


# Download stopwords from nltk
nltk.download('stopwords')

# Initialize stopwords
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\subha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
# Function to preprocess text
def preprocess_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Tokenize the text
    tokens = text.split()
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

In [11]:
# Apply preprocessing to the Data column
df['Processed_Data'] = df['Data'].apply(preprocess_text)

# Display the first few rows of the processed data
print(df.head())

                                                Data       Labels  \
0  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism   
1  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism   
2  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...  alt.atheism   
3  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism   
4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...  alt.atheism   

                                      Processed_Data  
0  path cantaloupesrvcscmuedumagnesiumclubcccmued...  
1  newsgroups altatheism path cantaloupesrvcscmue...  
2  path cantaloupesrvcscmuedudasnewsharvardedunoc...  
3  path cantaloupesrvcscmuedumagnesiumclubcccmued...  
4  xref cantaloupesrvcscmuedu altatheism53485 tal...  


In [12]:
df.head()

Unnamed: 0,Data,Labels,Processed_Data
0,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism,path cantaloupesrvcscmuedumagnesiumclubcccmued...
1,Newsgroups: alt.atheism\nPath: cantaloupe.srv....,alt.atheism,newsgroups altatheism path cantaloupesrvcscmue...
2,Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...,alt.atheism,path cantaloupesrvcscmuedudasnewsharvardedunoc...
3,Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...,alt.atheism,path cantaloupesrvcscmuedumagnesiumclubcccmued...
4,Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...,alt.atheism,xref cantaloupesrvcscmuedu altatheism53485 tal...


In [13]:
# To see data present in a sample:

pd.set_option('display.max_colwidth', None)
df.Processed_Data.head(1)

0    path cantaloupesrvcscmuedumagnesiumclubcccmuedunewsseicmueducisohiostateeduzaphodmpsohiostateeduhowlandrestonansnetagatedocicacukuknetmcsungermanyeunetthothmchpsnidehorusapmchpsnided012s658frank frankd012s658uucp frank odwyer newsgroups altatheism subject islamic genocide date 23 apr 1993 235147 gmt organization siemensnixdorf ag lines 110 distribution world messageid 1r9vej5k5horusapmchpsnide references 1r4o8a6qefidoasdsgicom 1r5ublbd6horusapmchpsnide 1r76ek7uofidoasdsgicom nntppostinghost d012s658apmchpsnide article 1r76ek7uofidoasdsgicom liveseysolntzewpdsgicom jon livesey writes article 1r5ublbd6horusapmchpsnide frankd012s658uucp frank odwyer writes article 1r4o8a6qefidoasdsgicom liveseysolntzewpdsgicom jon livesey writes noting particular society case mainland uk religously motivated murders murders kind says little whether interreligion murders elsewhere religiously motivated allows one conclude nothing inherent religion matter catholicism protestantism motivates one kill mo

In [14]:
# 2. Naive Bayes Model for Text Classification:

In [15]:
# Building the model:

X=df['Processed_Data']
y=df['Labels']

In [16]:
# Split the data into training and test sets:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

In [17]:
# Extract features using TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [18]:
# Applying the NB classifier:
from sklearn.naive_bayes import MultinomialNB

# Train a Naive Bayes classifier
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)

# Make predictions on the test set
y_pred = nb_classifier.predict(X_test_tfidf)

In [19]:
# 3. Evaluation:

In [20]:
# Metrics:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
classification_rep = classification_report(y_test, y_pred)

print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')
print('Classification Report:')
print(classification_rep)

Accuracy: 0.8300
Precision: 0.8353
Recall: 0.8300
F1 Score: 0.8246
Classification Report:
                          precision    recall  f1-score   support

             alt.atheism       0.50      0.83      0.62        18
           comp.graphics       0.79      0.83      0.81        18
 comp.os.ms-windows.misc       0.86      0.82      0.84        22
comp.sys.ibm.pc.hardware       0.86      0.76      0.81        25
   comp.sys.mac.hardware       0.87      0.95      0.91        21
          comp.windows.x       0.91      0.84      0.88        25
            misc.forsale       0.79      0.83      0.81        18
               rec.autos       0.89      0.94      0.92        18
         rec.motorcycles       0.94      0.94      0.94        16
      rec.sport.baseball       0.77      0.94      0.85        18
        rec.sport.hockey       0.94      1.00      0.97        15
               sci.crypt       0.90      0.95      0.92        19
         sci.electronics       0.62      0.62      

In [21]:
# Summary of Metrics: Overall the performance of our model is really good.

# Accuracy (83.00%) :  This means that 83% of the blog posts were correctly categorized into their respective categories.
# Precision (83.53%):  This indicates that when the model predicts a category, it is correct 83.53% of the time.
# Recall (83.00%)   :  This measures how well the model can find all the relevant blog posts for each category. In this case, it correctly identifies 83% of 
#                      the relevant posts.
# F1 Score (82.46%) :  This is the harmonic mean of precision and recall, providing a balance between the two. An F1 score of 82.46% indicates that the model
#                      has a good balance of precision and recall.

In [22]:
# 4. Sentiment Analysis:

In [23]:
# Install the vaderSentiment library if not already installed
# !pip install vaderSentiment

In [24]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

# Function to analyze sentiment
def analyze_sentiment(text):
    scores = analyzer.polarity_scores(text)
    compound = scores['compound']
    if compound >= 0.05:
        return 'positive'
    elif compound <= -0.05:
        return 'negative'
    else:
        return 'neutral'

# Apply sentiment analysis to the Data column
df['Sentiment'] = df['Data'].apply(analyze_sentiment)

In [25]:
pd.set_option('display.max_colwidth', 50)

In [26]:
# Display the first few rows of the sentiment analysis results
print(df.head())

                                                Data       Labels  \
0  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism   
1  Newsgroups: alt.atheism\nPath: cantaloupe.srv....  alt.atheism   
2  Path: cantaloupe.srv.cs.cmu.edu!das-news.harva...  alt.atheism   
3  Path: cantaloupe.srv.cs.cmu.edu!magnesium.club...  alt.atheism   
4  Xref: cantaloupe.srv.cs.cmu.edu alt.atheism:53...  alt.atheism   

                                      Processed_Data Sentiment  
0  path cantaloupesrvcscmuedumagnesiumclubcccmued...  negative  
1  newsgroups altatheism path cantaloupesrvcscmue...  positive  
2  path cantaloupesrvcscmuedudasnewsharvardedunoc...  negative  
3  path cantaloupesrvcscmuedumagnesiumclubcccmued...  negative  
4  xref cantaloupesrvcscmuedu altatheism53485 tal...  positive  


In [27]:
# Examine the distribution of sentiments across different categories
sentiment_distribution = df.groupby('Labels')['Sentiment'].value_counts(normalize=True).unstack()
print(sentiment_distribution)

Sentiment                 negative  neutral  positive
Labels                                               
alt.atheism                   0.41     0.01      0.58
comp.graphics                 0.11     0.02      0.87
comp.os.ms-windows.misc       0.21     0.02      0.77
comp.sys.ibm.pc.hardware      0.18     0.01      0.81
comp.sys.mac.hardware         0.24     0.04      0.72
comp.windows.x                0.18     0.02      0.80
misc.forsale                  0.08     0.08      0.84
rec.autos                     0.27      NaN      0.73
rec.motorcycles               0.30     0.01      0.69
rec.sport.baseball            0.26     0.01      0.73
rec.sport.hockey              0.22     0.01      0.77
sci.crypt                     0.30      NaN      0.70
sci.electronics               0.19     0.03      0.78
sci.med                       0.30     0.01      0.69
sci.space                     0.32     0.03      0.65
soc.religion.christian        0.28      NaN      0.72
talk.politics.guns          

In [28]:
# Summary of Sentiment Distribution:

# Positive Sentiment Dominance:
#     Categories like comp.graphics, comp.os.ms-windows.misc, comp.sys.ibm.pc.hardware, comp.windows.x, and misc.forsale have a high proportion of positive 
#     sentiments, indicating that the content in these categories is generally well-received.
#     Categories related to hobbies and interests such as rec.autos, rec.motorcycles, rec.sport.baseball, and rec.sport.hockey also show a strong positive 
#     sentiment.

# Negative Sentiment Concerns:
#     Categories like alt.atheism, talk.politics.guns, talk.politics.mideast, and talk.politics.misc have a higher proportion of negative sentiments, 
#     reflecting potential controversy or dissatisfaction among users in these categories.
#     talk.politics.guns and talk.politics.mideast have the highest negative sentiments, which is not surprising given the often contentious nature of these 
#     topics.

# Neutral Sentiment Anomalies:
#     Some categories like rec.autos, sci.crypt, soc.religion.christian, talk.politics.mideast, and talk.politics.misc show NaN for neutral sentiment. This 
#     could be due to a lack of sufficient data points classified as neutral or an issue with the sentiment analysis process.

In [29]:
# Conclusion: We have performed all the tasks and made a relevant sentiment distribution on the blogs_categories.csv dataset using the naive bayes
# classifier