## Installing dependencies

In [1]:
!pip install numpy pandas



In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the Reddit comments dataset into a DataFrame.
# NOTE(review): the path is absolute ('/content/...', presumably a Colab upload
# location - confirm); pd.read_csv raises FileNotFoundError when the file is absent.
df=pd.read_csv('/content/reddit.csv')
# Preview the first rows
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/content/reddit.csv'

In [None]:
df.shape

Viewing some samples from the clean_comment column

In [None]:
df.sample()['clean_comment'].values

In [None]:
df.info()

In [None]:
df.isnull().sum()

**Removing the null values**
> Rows with missing values should only be dropped when their number is small relative to the whole dataset.



In [None]:
df[df['clean_comment'].isna()]

In [None]:
df[df['clean_comment'].isna()]['category'].value_counts()

In [None]:
df.dropna(inplace=True)

**Checking for the duplicate values**

In [None]:
df.duplicated().sum()

In [None]:
df[df.duplicated()]

Dropping the duplicate rows, since there are only about 350 duplicates out of roughly 37,000 rows

In [None]:
df.drop_duplicates(inplace=True)

In [None]:
df.duplicated().sum()

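**Checking for empty comments (e.g. stray new lines)** \
A comment that contains only whitespace, such as a lone new line, can end up as a row of its own, so we look for such rows and remove them.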

In [None]:
df[(df['clean_comment'].str.strip() == '')]

In [None]:
# Keep only rows whose comment is non-empty after stripping whitespace.
# .copy() makes df an independent frame, so the column assignments in later
# cells do not operate on a slice (avoids SettingWithCopyWarning).
df = df[~(df['clean_comment'].str.strip() == '')].copy()

**Lowercasing all the comments**

In [None]:
# convert the 'clean_comment' column to lowercase
df['clean_comment'] = df['clean_comment'].str.lower()

df.head()

**Removing the whitespaces before and after the comments** \
In an NLP context, leading or trailing whitespace adds nothing to the meaning of a comment but can cost extra tokens, so it is removed for efficiency.

In [None]:
df[df['clean_comment'].apply(lambda x: x.endswith(' ') or x.startswith(' '))]

In [None]:
# Remove leading and trailing whitespace from the 'clean_comment' column
df['clean_comment'] = df['clean_comment'].str.strip()

# Verify the transformation: select rows that still start or end with a space
# character (the check looks for ' ' only); an empty result means none remain
df[df['clean_comment'].apply(lambda x: x.endswith(' ') or x.startswith(' '))]

**Checking for URLs in the data**

In [None]:
# Identify comments containing URLs
# Pattern: 'http' or 'https', then '://', then one or more of: a letter, a digit,
# a character from [$-_@.&+], a character from [!*\\(\\),], or a percent-encoded
# byte (% followed by two hex digits).
# Note: inside the class [$-_@.&+], '$-_' is a character RANGE (from '$' to '_'),
# not three literal characters.
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
# Boolean mask of rows whose comment matches the pattern anywhere in the text
comments_with_urls= df[df['clean_comment'].str.contains(url_pattern, regex=True)]

# Display
comments_with_urls.head()

**Removing new line characters (\n) from the comments column**

In [None]:
# Identify comments containing new line characters
comments_with_newline = df[df['clean_comment'].str.contains('\n')]

# Display
comments_with_newline.head()

In [None]:
# Remove new line characters from the 'clean_comment' column
df['clean_comment'] = df['clean_comment'].str.replace('\n', ' ', regex=True)

# Verify
comments_with_newline_remaining = df[df['clean_comment'].str.contains('\n')]
comments_with_newline_remaining

## **EDA (Exploratory Data Analysis)**
In this section we will visualize the data from various points of view to extract information and insights from it. EDA is performed to uncover hidden patterns, spot anomalies or outliers, and test underlying assumptions before applying formal modeling or machine learning techniques.

**Importing the visualization libraries**

In [None]:
!pip install seaborn matplotlib.pyplot

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Distribution of the classes
# positive(1) negative(-1) and neutral(0) categories

sns.countplot(data=df, x="category")

In [None]:
# frequency distribution of sentiments
df['category'].value_counts(normalize=True).mul(100).round(2)

Adding a new column that contains the number of words in each comment

In [None]:
df['word_count'] = df['clean_comment'].apply(lambda x: len(x.split()))
df.head()

In [None]:
df['word_count'].describe()

Visualizing the word count

In [None]:
sns.displot(df['word_count'], kde=True)

**Word Count Distribution by Category**

In [None]:
# Overlay one filled KDE curve of 'word_count' per sentiment class on a single figure
plt.figure(figsize=(10, 6))

# (category value, legend label) pairs, drawn in this order
for sentiment, legend_label in ((1, 'Positive'), (0, 'Neutral'), (-1, 'Negative')):
    class_word_counts = df.loc[df['category'] == sentiment, 'word_count']
    sns.kdeplot(class_word_counts, label=legend_label, fill=True)

# Title, axis labels and legend
plt.title('Word Count Distribution by Category')
plt.xlabel('Word Count')
plt.ylabel('Density')
plt.legend()

# Render the figure
plt.show()

**Explanation of the plot**
1. Positive comments (1) tend to have a wider spread in word count, meaning longer comments are more common among positive sentiments.
2. Neutral comments (0) show a lower frequency of long comments and are more concentrated around shorter comments.
3. Negative comments (-1) have a distribution somewhat similar to positive comments, but with a smaller proportion of longer comments.

**Boxplot**

In [None]:
sns.boxplot(df['word_count'])

In [None]:
# Create a boxplot for the 'wordcount' column categorized by 'category'
plt.figure(figsize=(10, 6))
sns.boxplot(data=df, x='category', y='word_count')
plt.title('Boxplot of Word Count by Category')
plt.xlabel('Category')
plt.ylabel('Word Count')
plt.show()

In [None]:
# Create a scatterplot between 'category' and 'wordcount'
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df, x='category', y='word_count', alpha=0.5)
plt.title('Scatterplot of Word Count by Category')
plt.xlabel('Category')
plt.ylabel('Word Count')
plt.show()

In [None]:
# median word counts among sentiments

sns.barplot(df,x='category',y='word_count',estimator='median')

## **Advanced Preprocessing**

Installing nltk library (natural language toolkit)

In [None]:
!pip install nltk

**Checking for the stopwords**\
Creating a new column to check number of stopwords in a comment

In [None]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Define the list of English stopwords
stop_words = set(stopwords.words('english'))

# Create a new col 'num_stop_words' by counting the number of stopwords in each comment
df['num_stop_words'] = df['clean_comment'].apply(lambda x: len([word for word in x.split() if word in stop_words]))

In [None]:
df.sample(5)

**Plotting the stopwords frequency**

In [None]:
# Plot a histogram (sns.histplot) of the 'num_stop_words' column with a KDE curve overlaid
plt.figure(figsize=(10, 6))
sns.histplot(df['num_stop_words'], kde=True)
plt.title('Distribution of Stop Word Count in Comments')
plt.xlabel('Number of Stop Words')
plt.ylabel('Frequency')
plt.show()

Stop word count distribution by category

In [None]:
# Overlay one filled KDE curve of 'num_stop_words' per sentiment class on a single figure
plt.figure(figsize=(10, 6))

# (category value, legend label) pairs, drawn in this order
for sentiment, legend_label in ((1, 'Positive'), (0, 'Neutral'), (-1, 'Negative')):
    class_stop_counts = df.loc[df['category'] == sentiment, 'num_stop_words']
    sns.kdeplot(class_stop_counts, label=legend_label, fill=True)

# Title, axis labels and legend
plt.title('Num stop words Distribution by Category')
plt.xlabel('Stop word count')
plt.ylabel('Density')
plt.legend()

# Render the figure
plt.show()

In [None]:
# median stop-word counts ('num_stop_words') per sentiment category

sns.barplot(df,x='category',y='num_stop_words',estimator='median')

Top 25 stopwords by frequency

In [None]:
# Create a frequency distribution of stop words in the 'clean_comment' column
from collections import Counter

# Collect every stop-word occurrence across all comments (whitespace-split tokens
# that are members of the previously defined 'stop_words' set)
all_stop_words = [word for comment in df['clean_comment'] for word in comment.split() if word in stop_words]

# Count the most common stop words
most_common_stop_words = Counter(all_stop_words).most_common(25)

# Convert the most common stop words to a DataFrame for plotting
top_25_df = pd.DataFrame(most_common_stop_words, columns=['stop_word', 'count'])

# Create the barplot for the top 25 most common stop words
plt.figure(figsize=(12, 8))
sns.barplot(data=top_25_df, x='count', y='stop_word', palette='viridis')
plt.title('Top 25 Most Common Stop Words')
plt.xlabel('Count')
plt.ylabel('Stop Word')
plt.show()

Number of characters in a comment to compare the ratio of stopwords with the total

In [None]:
df['num_chars'] = df['clean_comment'].apply(len)

df.head()

In [None]:
df['num_chars'].describe()

**Number of special characters in comments**

In [None]:
from collections import Counter

# Combine all comments into one large string
all_text = ' '.join(df['clean_comment'])

# Count the frequency of each character
char_frequency = Counter(all_text)

# Convert the character frequency into a DataFrame for better display
char_frequency_df = pd.DataFrame(char_frequency.items(), columns=['character', 'frequency']).sort_values(by='frequency', ascending=False)


In [None]:
char_frequency_df['character'].values

In [None]:
char_frequency_df.tail(50)

**Number of punctuation characters in a comment**

In [None]:
# Add 'num_punctuation_chars': for each comment, the number of characters that
# belong to a fixed set of common punctuation marks
df['num_punctuation_chars'] = df['clean_comment'].map(
    lambda comment: sum(1 for ch in comment if ch in '.,!?;:"\'()[]{}-')
)

df.sample(5)

In [None]:
df['num_punctuation_chars'].describe()

This code performs a textual EDA by identifying and visualizing the top 25 most frequent pairs of consecutive words (bigrams) in a dataset. It uses CountVectorizer to filter out stop words and calculate frequencies, then renders a bar chart to highlight common themes or patterns in the text.

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Helper that ranks the n-grams of a corpus by total frequency (bigrams by default)
def get_top_ngrams(corpus, n=None, ngram_range=(2, 2)):
    """Return the most frequent n-grams in ``corpus``.

    English stop words are removed by ``CountVectorizer`` (``stop_words='english'``)
    before the n-grams are formed.

    Args:
        corpus: Iterable of text documents (e.g. a pandas Series of strings).
        n: Number of top entries to return; ``None`` returns every n-gram.
        ngram_range: ``(min_n, max_n)`` tuple forwarded to ``CountVectorizer``.
            Defaults to ``(2, 2)`` (bigrams), the value that used to be hard-coded.

    Returns:
        List of ``(ngram, count)`` tuples sorted by count in descending order.
    """
    vec = CountVectorizer(ngram_range=ngram_range, stop_words='english')
    # fit_transform learns the vocabulary and builds the count matrix in one pass
    bag_of_words = vec.fit_transform(corpus)
    # Column sums = total occurrences of each n-gram over the whole corpus
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

# Get the top 25 bigrams
top_25_bigrams = get_top_ngrams(df['clean_comment'], 25)

# Convert the bigrams into a DataFrame for plotting
top_25_bigrams_df = pd.DataFrame(top_25_bigrams, columns=['bigram', 'count'])

# Plot the countplot for the top 25 bigrams
plt.figure(figsize=(12, 8))
sns.barplot(data=top_25_bigrams_df, x='count', y='bigram', palette='magma')
plt.title('Top 25 Most Common Bigrams')
plt.xlabel('Count')
plt.ylabel('Bigram')
plt.show()


This code expands our analysis to trigrams, capturing three-word sequences to provide deeper context and identify recurring phrases that bigrams might miss. By adjusting the ngram_range to (3, 3), it isolates specific linguistic patterns, helping us see more complex themes within the clean_comment data.

In [None]:
# Helper that ranks the trigrams of a corpus by total frequency
def get_top_trigrams(corpus, n=None):
    """Return the ``n`` most frequent trigrams in ``corpus``.

    Three-word sequences are counted with ``CountVectorizer`` using its
    built-in English stop-word list. The result is a list of
    ``(trigram, count)`` tuples in descending order of count; with
    ``n=None`` every trigram is returned.
    """
    vectorizer = CountVectorizer(ngram_range=(3, 3), stop_words='english')
    vectorizer.fit(corpus)

    # Sum each vocabulary column to get corpus-wide totals per trigram
    totals = vectorizer.transform(corpus).sum(axis=0)

    ranked = sorted(
        ((term, totals[0, col]) for term, col in vectorizer.vocabulary_.items()),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]

# Get the top 25 trigrams
top_25_trigrams = get_top_trigrams(df['clean_comment'], 25)

# Convert the trigrams into a DataFrame for plotting
top_25_trigrams_df = pd.DataFrame(top_25_trigrams, columns=['trigram', 'count'])

# Plot the countplot for the top 25 trigrams
plt.figure(figsize=(12, 8))
sns.barplot(data=top_25_trigrams_df, x='count', y='trigram', palette='coolwarm')
plt.title('Top 25 Most Common Trigrams')
plt.xlabel('Count')
plt.ylabel('Trigram')
plt.show()

**Removing all the non-English characters from the comment column**

In [None]:
# Remove non-English characters from the 'clean_comment' column
# Keeping only standard English letters, digits, and common punctuation
import re

df['clean_comment'] = df['clean_comment'].apply(lambda x: re.sub(r'[^A-Za-z0-9\s!?.,]', '', str(x)))

In [None]:
all_text = ' '.join(df['clean_comment'])

# Count the frequency of each character
char_frequency = Counter(all_text)

# Convert the character frequency into a DataFrame for better display
char_frequency_df = pd.DataFrame(char_frequency.items(), columns=['character', 'frequency']).sort_values(by='frequency', ascending=False)

char_frequency_df

In [None]:
df.head()

**Keeping some of the essential stopwords**\
 To analyze sentiment, some stopwords such as 'not', 'no' and 'but' are important. They help us differentiate between "I like this movie" and "I do not like this movie".

In [None]:
from nltk.corpus import stopwords

# Defining stop words but keeping essential ones for sentiment analysis
stop_words = set(stopwords.words('english')) - {'not', 'but', 'however', 'no', 'yet'}

# Remove stop words from 'clean_comment' column, retaining essential ones
df['clean_comment'] = df['clean_comment'].apply(
    lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words])
)

In [None]:
df.head()

**Using Lemmatizer** \
To bring the words to the root form

In [None]:
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')

# Define the lemmatizer
lemmatizer = WordNetLemmatizer()

# Apply lemmatization to the 'clean_comment' column, overwriting it in place:
# each whitespace-separated token is passed to lemmatizer.lemmatize() (no POS tag
# is supplied) and the results are re-joined with single spaces
df['clean_comment'] = df['clean_comment'].apply(
    lambda x: ' '.join([lemmatizer.lemmatize(word) for word in x.split()])
)

df.head()

**Implementing WordCloud** \
To get a big-picture view of the words used in the comments


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def plot_word_cloud(text):
    """Display a word cloud built from an iterable of strings.

    The strings in ``text`` are joined with single spaces and fed to an
    800x400 ``WordCloud`` with a white background; the resulting image is
    shown on a 10x5 figure with the axes hidden.
    """
    joined_text = ' '.join(text)
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud_image = cloud.generate(joined_text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation='bilinear')
    plt.axis("off")
    plt.show()

plot_word_cloud(df['clean_comment'])

**Most frequent words**

In [None]:
def plot_top_n_words(df, n=20):
    """Plot the top N most frequent words in the dataset.

    Args:
        df: DataFrame with a 'clean_comment' column of strings; words are the
            whitespace-separated tokens of each comment.
        n: Number of most frequent words to show in the horizontal bar chart.

    Returns:
        None. The chart is displayed with ``plt.show()``. When there is nothing
        to plot (no words at all, or ``n`` is 0) a message is printed and no
        figure is created.
    """
    # Count tokens comment by comment instead of joining the whole corpus into
    # one giant string first; the resulting tokens and their order are the same
    counter = Counter(
        word for comment in df['clean_comment'] for word in comment.split()
    )

    # Get the top N most common words
    most_common_words = counter.most_common(n)

    # zip(*[]) cannot be unpacked into two names, so bail out early when empty
    if not most_common_words:
        print('No words to plot.')
        return

    # Split the words and their counts for plotting
    top_words, counts = zip(*most_common_words)

    # Plot the top N words
    plt.figure(figsize=(10, 6))
    sns.barplot(x=list(counts), y=list(top_words))
    plt.title(f'Top {n} Most Frequent Words')
    plt.xlabel('Frequency')
    plt.ylabel('Words')
    plt.show()

# Example usage
plot_top_n_words(df, n=50)

**Most frequent words by category**\
How often the most common words are used in each sentiment category

In [None]:
def plot_top_n_words_by_category(df, n=20, start=0):
    """Plot the most frequent words as bars stacked by sentiment category.

    Args:
        df: DataFrame with a 'clean_comment' column (strings) and a 'category'
            column whose values are -1, 0 or 1. Any other category value raises
            KeyError.
        n: Number of words to plot.
        start: Rank offset into the frequency-sorted word list; the words ranked
            ``start`` to ``start + n - 1`` (0-based) are plotted.

    Returns:
        None. A horizontal stacked bar chart (negative, neutral, positive) is
        displayed with ``plt.show()``.
    """
    # word -> {category: occurrences of the word in comments of that category}
    word_category_counts = {}

    # Iterate the two columns directly; df.iterrows() would build a Series for
    # every row, which is needlessly slow for a plain per-row read
    for comment, category in zip(df['clean_comment'], df['category']):
        for word in comment.split():
            if word not in word_category_counts:
                word_category_counts[word] = { -1: 0, 0: 0, 1: 0 }  # Initialize counts for each sentiment category

            # Increment the count for the corresponding sentiment category
            word_category_counts[word][category] += 1

    # Get total counts across all categories for each word
    total_word_counts = {word: sum(counts.values()) for word, counts in word_category_counts.items()}

    # Select the requested slice of the frequency ranking
    most_common_words = sorted(total_word_counts.items(), key=lambda x: x[1], reverse=True)[start:start+n]
    top_words = [word for word, _ in most_common_words]

    # Prepare data for plotting
    word_labels = top_words
    negative_counts = [word_category_counts[word][-1] for word in top_words]
    neutral_counts = [word_category_counts[word][0] for word in top_words]
    positive_counts = [word_category_counts[word][1] for word in top_words]

    # Plot the stacked bar chart
    plt.figure(figsize=(12, 8))
    bar_width = 0.75

    # Each segment starts where the previous ones end: neutral is offset by the
    # negative counts, positive by negative + neutral
    plt.barh(word_labels, negative_counts, color='red', label='Negative (-1)', height=bar_width)
    plt.barh(word_labels, neutral_counts, left=negative_counts, color='gray', label='Neutral (0)', height=bar_width)
    plt.barh(word_labels, positive_counts, left=[i+j for i,j in zip(negative_counts, neutral_counts)], color='green', label='Positive (1)', height=bar_width)

    plt.xlabel('Frequency')
    plt.ylabel('Words')
    plt.title(f'Top {n} Most Frequent Words with Stacked Sentiment Categories')
    plt.legend(title='Sentiment', loc='lower right')
    plt.gca().invert_yaxis()  # Invert y-axis to show the highest frequency at the top
    plt.show()



plot_top_n_words_by_category(df, n=20)