Incomplete

In [None]:
import pandas as pd
import nltk as nltk
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from collections import Counter
from nltk.corpus import stopwords
from nltk.corpus import wordnet
nltk.download('wordnet')
import re

In [None]:
# Read the training split from disk into a DataFrame.
TRAIN_CSV_PATH = "train.csv"
train_data = pd.read_csv(TRAIN_CSV_PATH)

# Preview the leading rows (shown as the cell output) to see the columns.
train_data.head()

In [None]:
# Basic Info
print("Dataset Info:")
# DataFrame.info() writes the summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
train_data.info()

# Show the rows announced by the header; as the last expression of the cell
# the preview is rendered as the cell output.
print("\nSample Data:")
train_data.head()

In [None]:
# Label Distribution
# Use the same six label columns as the later cells of this notebook
# ("severe_toxic" was missing here, so its count was never reported).
label_cols = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Column-wise sum of the label columns, one total per label.
label_counts = train_data[label_cols].sum()
print("\nLabel Distribution:")
print(label_counts)

In [None]:
def preprocess_text_eda(text):
    """
    Lightly clean one comment for EDA while keeping as much context as possible.

    Steps, in order: lowercase, replace URLs with "<URL>", replace @mentions
    with "<USER>", drop the "#" from hashtags, collapse whitespace runs.
    The placeholders are inserted after lowercasing, so they stay upper-case.

    Args:
        text: Raw comment. Non-string values (e.g. NaN from a missing cell)
            are treated as an empty comment instead of raising.

    Returns:
        str: The cleaned comment, or "" for non-string input.
    """
    # Missing cells arrive as float NaN / None and have no .lower().
    if not isinstance(text, str):
        return ""

    # Convert to lowercase to make analysis case-insensitive
    text = text.lower()

    # Replace URLs with a placeholder. A URL must start at a word boundary with
    # an explicit scheme or "www."; the looser "www\S+" / "http\S+" forms also
    # rewrote ordinary words such as "awwww" or "https".
    text = re.sub(r"\b(?:https?://|www\.)\S+", "<URL>", text)

    # Replace user mentions (e.g., @username) with a placeholder. The
    # lookbehind keeps e-mail addresses ("me@example.com") intact.
    text = re.sub(r"(?<!\w)@\w+", "<USER>", text)

    # Replace hashtags (#topic) with the word itself
    text = re.sub(r"#(\w+)", r"\1", text)

    # Collapse whitespace runs (including newlines) and trim both ends
    text = re.sub(r"\s+", " ", text).strip()

    return text



In [None]:
# Derive the EDA-cleaned text column from the raw comments.
cleaned_series = train_data['comment_text'].apply(preprocess_text_eda)
train_data['cleaned_comment'] = cleaned_series

# Show raw and cleaned text side by side for the first rows.
comparison_columns = ['comment_text', 'cleaned_comment']
print(train_data[comparison_columns].head())


In [None]:
# Character counts before and after cleaning, stored as new columns.
for source_col, length_col in (
    ('comment_text', 'original_length'),
    ('cleaned_comment', 'cleaned_length'),
):
    train_data[length_col] = train_data[source_col].map(len)

# Summary statistics for both length columns.
length_summary = train_data[['original_length', 'cleaned_length']].describe()
print(length_summary)

In [None]:
# Dataset overview
# DataFrame.info() writes the summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
train_data.info()

# Distribution of comment lengths
# Both measures are derived from 'cleaned_comment' in this cell, so the cell
# only depends on that column being present.
train_data['cleaned_length'] = train_data['cleaned_comment'].apply(len)
# Word count = number of whitespace-separated tokens in the cleaned text.
train_data['word_count'] = train_data['cleaned_comment'].apply(lambda x: len(x.split()))
print("\nSummary of cleaned comment lengths:")
print(train_data['cleaned_length'].describe())


In [None]:
# One histogram (with KDE overlay) per length measure of the cleaned comments.
# Each spec is: (column, bar colour, figure title, x-axis label).
histogram_specs = [
    ('cleaned_length', 'blue', "Distribution of Cleaned Comment Lengths", "Comment Length (characters)"),
    ('word_count', 'green', "Distribution of Word Counts in Cleaned Comments", "Word Count"),
]

for column, bar_color, title, x_label in histogram_specs:
    fig, ax = plt.subplots(figsize=(12, 6))
    sns.histplot(train_data[column], bins=50, kde=True, color=bar_color, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel("Frequency")
    plt.show()

In [None]:

_ENGLISH_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Load NLTK's English stop-word list once and reuse it on later calls."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        try:
            words = stopwords.words('english')
        except LookupError:
            # The stopwords corpus is not installed yet; fetch it and retry.
            nltk.download('stopwords', quiet=True)
            words = stopwords.words('english')
        _ENGLISH_STOP_WORDS = frozenset(words)
    return _ENGLISH_STOP_WORDS


def preprocess_negative_comments(text, stop_words=None):
    """
    Additional preprocessing for the word clouds: normalise a comment and
    drop stop words.

    Steps, in order: lowercase, drop "<url>"/"<user>" placeholders, collapse
    whitespace, collapse characters repeated three or more times to one,
    normalise spelling variants of two offensive terms, filter stop words,
    strip remaining punctuation.

    Args:
        text: Comment text (raw or already passed through the EDA cleaning).
            Non-string values (e.g. NaN) are treated as an empty comment.
        stop_words: Optional iterable of lower-case words to remove. When
            omitted, NLTK's English stop-word list is used.

    Returns:
        str: Remaining words joined by single spaces ("" if nothing is left).
    """
    if not isinstance(text, str):
        return ""
    stop_set = _english_stop_words() if stop_words is None else set(stop_words)

    text = text.lower()
    # The EDA cleaning inserts <URL>/<USER> placeholders; without this they
    # would lose their brackets below and be counted as the words "url"/"user".
    text = re.sub(r"<(?:url|user)>", " ", text)
    text = re.sub(r"\s+", " ", text)  # Remove extra whitespaces
    text = re.sub(r"(.)\1{2,}", r"\1", text)  # Normalize repeated characters ("soooo" -> "so")
    text = re.sub(r"\bn+ig+e*r+\b", "nigger", text)  # Normalize variations of offensive terms
    text = re.sub(r"\bf+u+c*k+\b", "fuck", text)  # Standardize curse words

    kept = []
    for token in text.split():
        # Compare against the stop list BEFORE deleting inner punctuation:
        # "don't" is a stop word, but its stripped form "dont" is not.
        core = re.sub(r"^\W+|\W+$", "", token)
        if core in stop_set:
            continue
        word = re.sub(r"[^\w\s]", "", token)  # Remove remaining punctuations
        # Skip tokens that were pure punctuation or became a stop word.
        if word and word not in stop_set:
            kept.append(word)

    return ' '.join(kept)



In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import Counter

# Flag a comment as negative when the row-wise maximum over the label columns is 1.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_data["negative_comment"] = train_data[label_columns].max(axis=1)

# Independent copies of each group, so overwriting a column below does not
# raise SettingWithCopyWarning against the parent frame.
is_negative = train_data["negative_comment"] == 1
is_non_negative = train_data["negative_comment"] == 0
negative_comments = train_data[is_negative].copy()
non_negative_comments = train_data[is_non_negative].copy()

# Word-cloud specific cleaning, applied on top of the EDA-cleaned text.
for group_frame in (negative_comments, non_negative_comments):
    group_frame["cleaned_comment"] = group_frame["cleaned_comment"].apply(preprocess_negative_comments)

# One long string per group, then whitespace-token frequencies.
negative_text = " ".join(negative_comments["cleaned_comment"].dropna())
non_negative_text = " ".join(non_negative_comments["cleaned_comment"].dropna())
negative_word_counts = Counter(negative_text.split())
non_negative_word_counts = Counter(non_negative_text.split())

# Debug: inspect the most frequent words of each group.
print("Top 20 words in negative comments:", negative_word_counts.most_common(20))
print("Top 20 words in non-negative comments:", non_negative_word_counts.most_common(20))

# Build both clouds from the frequency tables; only the colormap differs.
cloud_options = {"width": 800, "height": 400, "background_color": "black"}
wordcloud_negative = WordCloud(colormap="Reds", **cloud_options).generate_from_frequencies(
    negative_word_counts
)
wordcloud_non_negative = WordCloud(colormap="Greens", **cloud_options).generate_from_frequencies(
    non_negative_word_counts
)

# Side-by-side panels: negative on the left, non-negative on the right.
fig, (ax_negative, ax_non_negative) = plt.subplots(1, 2, figsize=(16, 8))
cloud_panels = (
    (ax_negative, wordcloud_negative, "Most Frequent Words in Negative Comments"),
    (ax_non_negative, wordcloud_non_negative, "Most Frequent Words in Non-Negative Comments"),
)
for ax, cloud, panel_title in cloud_panels:
    ax.imshow(cloud, interpolation="bilinear")
    ax.axis("off")
    ax.set_title(panel_title, fontsize=16)

fig.tight_layout()
plt.show()


In [None]:
# Total count per label, largest first.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
label_totals = train_data[label_cols].sum()
label_distribution = label_totals.sort_values(ascending=False)
print("\nLabel Distribution:")
print(label_distribution)

# Bar chart of the label totals.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=label_distribution.index, y=label_distribution.values, palette="viridis", ax=ax)
ax.set_title("Label Distribution")
ax.set_ylabel("Frequency")
ax.set_xlabel("Labels")
plt.show()

# Box plot of cleaned comment length for each value of the negativity flag.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=train_data, x='negative_comment', y='cleaned_length', palette="coolwarm", ax=ax)
ax.set_title("Comment Length by Negativity")
ax.set_xlabel("Negativity (0 = Non-Negative, 1 = Negative)")
ax.set_ylabel("Comment Length (characters)")
plt.show()


In [None]:
# Pairwise correlation matrix of the label columns.
label_corr = train_data[label_cols].corr()

# Annotated heatmap of the matrix, values shown with two decimals.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(label_corr, annot=True, cmap="coolwarm", fmt=".2f", ax=ax)
ax.set_title("Correlation Between Labels")
plt.show()


In [None]:
# Columns shown when previewing the length extremes.
preview_columns = ['cleaned_comment', 'toxic', 'obscene', 'insult']

# Comments whose cleaned text is under 10 characters.
is_short = train_data['cleaned_length'] < 10
short_comments = train_data[is_short]
print("\nExamples of very short comments:")
print(short_comments[preview_columns].head())

# Comments whose cleaned text is over 500 characters.
is_long = train_data['cleaned_length'] > 500
long_comments = train_data[is_long]
print("\nExamples of very long comments:")
print(long_comments[preview_columns].head())


In [None]:
# Label columns taking part in the co-occurrence analysis.
label_columns = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
label_matrix = train_data[label_columns]

# labels^T . labels: for 0/1 labels, entry (i, j) counts the rows where both
# label i and label j are set (the diagonal holds each label's own total).
co_occurrence = label_matrix.T.dot(label_matrix)

# Same matrix as a share of all rows (optional; the heatmap below uses the raw counts).
co_occurrence_normalized = co_occurrence / len(train_data)

# Annotated heatmap of the raw counts, formatted as integers.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(co_occurrence, annot=True, fmt="d", cmap="Blues", ax=ax)
ax.set_title("Label Co-Occurrence Matrix")
plt.show()

In [None]:
# Count unique combinations of labels
# Build one plain tuple of label values per row. itertuples() walks the frame
# once, avoiding the per-row Series that apply(..., axis=1) constructs, and
# yields the same tuples in the same row order.
train_data["label_combination"] = list(
    train_data[label_columns].itertuples(index=False, name=None)
)
combination_counts = train_data["label_combination"].value_counts()

# Display the top 10 most frequent combinations
print("Top 10 most common label combinations:")
print(combination_counts.head(10))
