# 1. DATA INSPECTION AND QUALITY CHECK

In [19]:
# 1. Data Inspection & Quality Check

import pandas as pd

# Load dataset from the data folder
df = pd.read_csv("data/train_data.csv")

# Basic information: size, column names and inferred dtypes
print("Dataset shape:", df.shape)
print("\nColumns:", df.columns.tolist())
print("\nData types:\n", df.dtypes)

# Missing values
print("\nMissing values per column:")
print(df.isnull().sum())

# Duplicate rows: two rows count as duplicates only when EVERY column matches
duplicate_count = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicate_count}")

# The full-row comparison above also compares the `id` column, so it cannot
# detect the same comment stored under two different ids. Check the text
# column on its own as well, since repeated texts matter for later
# train/validation splitting.
duplicate_text_count = df.duplicated(subset=['comment_text']).sum()
print(f"Number of duplicate comment texts: {duplicate_text_count}")

# Empty text values (non-null but blank after stripping whitespace).
# str(x) keeps this check safe even if a value is not a string.
empty_text_count = df['comment_text'].apply(lambda x: str(x).strip() == '').sum()
print(f"Empty text entries: {empty_text_count}")


Dataset shape: (159571, 8)

Columns: ['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

Data types:
 id               object
comment_text     object
toxic             int64
severe_toxic      int64
obscene           int64
threat            int64
insult            int64
identity_hate     int64
dtype: object

Missing values per column:
id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

Number of duplicate rows: 0
Empty text entries: 0


**Observations**

- The dataset is **clean and well-structured**.  
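- There are **no missing values, no fully duplicated rows, and no empty comments**, indicating reliable data quality. (The duplicate check compares entire rows, including `id`, so repeated comment texts under different ids are not ruled out.)  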
- Each record includes an `id`, the user’s comment, and six binary toxicity labels.  
- The dataset is ready for exploratory and statistical analysis.

-------------------------------------------------------------------



# 2. LABEL DISTRIBUTION AND CLASS BALANCE ANALYSIS

In [20]:
# 2. Label Distribution & Class Balance Analysis

import matplotlib.pyplot as plt
import seaborn as sns

# Names of the six toxicity label columns
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Number of positive examples per label, most frequent label first
label_counts = df[label_cols].sum().sort_values(ascending=False)
print("Label counts:\n", label_counts)

# A comment is counted as non-toxic when its label columns sum to 0
# (no label set); every other comment is counted as toxic.
total_comments = df.shape[0]
labels_per_comment = df[label_cols].sum(axis=1)
non_toxic = (labels_per_comment == 0).sum()
toxic = total_comments - non_toxic

# Share of each group as a percentage of all comments
non_toxic_pct = non_toxic / total_comments * 100
toxic_pct = toxic / total_comments * 100
print(f"\nNon-toxic comments: {non_toxic} ({non_toxic_pct:.2f}%)")
print(f"Toxic comments: {toxic} ({toxic_pct:.2f}%)")



Label counts:
 toxic            15294
obscene           8449
insult            7877
severe_toxic      1595
identity_hate     1405
threat             478
dtype: int64

Non-toxic comments: 143346 (89.83%)
Toxic comments: 16225 (10.17%)


**Observations**

- The dataset is **highly imbalanced**, with almost **90% non-toxic comments**.  
- Among toxic categories, the **"toxic"** label is most common, followed by *obscene* and *insult*.  
- **Severe_toxic**, **identity_hate**, and **threat** are rare categories, each accounting for roughly 1% of comments or less (≈ 1.0%, 0.9%, and 0.3% respectively).  
- The class imbalance indicates that later stages may need **sampling strategies** or **weighted loss functions** to prevent the model from being biased toward non-toxic examples.

------------------------------------------------------------------------


# 3. COMMENT LENGTH AND TEXT ANALYSIS

In [21]:
# 3. Comment Length & Text Analysis

import matplotlib.pyplot as plt
import seaborn as sns

# Per-comment size features: raw character count and whitespace-delimited word count
comment_series = df["comment_text"]
df["char_length"] = comment_series.apply(len)
df["word_count"] = comment_series.apply(lambda text: len(str(text).split()))

# Summary statistics for both length measures
for summary_title, length_col in (
    ("Character length summary:\n", "char_length"),
    ("\nWord count summary:\n", "word_count"),
):
    print(summary_title, df[length_col].describe())

# Compare average length by toxicity.
# NOTE: the split uses the `toxic` column only; the other five label columns
# are not consulted here, so "non-toxic" means toxic == 0.
is_toxic = df["toxic"] == 1
is_not_toxic = df["toxic"] == 0
toxic_comments = df[is_toxic]
non_toxic_comments = df[is_not_toxic]

avg_len_toxic = toxic_comments["word_count"].mean()
avg_len_nontoxic = non_toxic_comments["word_count"].mean()

print(f"\nAverage word count - Toxic: {avg_len_toxic:.2f}")
print(f"Average word count - Non-toxic: {avg_len_nontoxic:.2f}")

Character length summary:
 count    159571.000000
mean        394.073221
std         590.720282
min           6.000000
25%          96.000000
50%         205.000000
75%         435.000000
max        5000.000000
Name: char_length, dtype: float64

Word count summary:
 count    159571.000000
mean         67.273527
std          99.230702
min           1.000000
25%          17.000000
50%          36.000000
75%          75.000000
max        1411.000000
Name: word_count, dtype: float64

Average word count - Toxic: 51.29
Average word count - Non-toxic: 68.97


**Observations**

- The average comment length is around **67 words** (≈ 394 characters).  
- The distribution is **right-skewed**, meaning most comments are short, with a few long ones up to 1400+ words.  
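- Interestingly, **comments without the `toxic` label are longer on average** (≈ 69 words) than comments with it (≈ 51 words), suggesting that toxic messages are often shorter and more direct. Note that this comparison splits on the `toxic` column only, not on all six labels.  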
- No extreme anomalies were detected — the length statistics align with typical online comment data.

--------------------------------------------------------------------


# 4. WORD-LEVEL EXPLORATION

In [22]:
# 4. Word-Level Exploration (No Visuals)

from collections import Counter
import re

def clean_text(text):
    """Lowercase ``text`` and keep only the letters a-z and whitespace.

    Every other character (punctuation, digits, non-ASCII letters) is deleted
    outright rather than replaced by a space, so tokens joined only by such
    characters are merged (e.g. "don't" becomes "dont").

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lowercased string.
    """
    return re.sub(r'[^a-z\s]', '', text.lower())

# Normalise every comment with clean_text and store the result as a new column
df['clean_text'] = df['comment_text'].apply(clean_text)

# Concatenate the cleaned comments of each group into one long string.
# Groups are defined by the `toxic` column only (1 vs 0).
toxic_texts = " ".join(df.loc[df["toxic"] == 1, "clean_text"])
nontoxic_texts = " ".join(df.loc[df["toxic"] == 0, "clean_text"])

# The 20 most frequent whitespace-separated tokens in each group
toxic_words = Counter(toxic_texts.split()).most_common(20)
nontoxic_words = Counter(nontoxic_texts.split()).most_common(20)

# Print both rankings as "word<padding> count" rows
for heading, top_words in (
    ("Top 20 words in TOXIC comments:", toxic_words),
    ("\nTop 20 words in NON-TOXIC comments:", nontoxic_words),
):
    print(heading)
    for word, count in top_words:
        print(f"{word:<15} {count}")


Top 20 words in TOXIC comments:
you             35329
the             20451
a               20373
i               19902
to              15655
and             15647
is              12630
of              11386
your            9114
fuck            8617
are             8153
that            7736
it              6831
in              6663
my              6170
this            5432
me              5390
on              4858
not             4809
for             4705

Top 20 words in NON-TOXIC comments:
the             475029
to              281196
of              212638
and             206721
a               194537
i               180753
you             169231
is              163330
that            146565
in              137525
it              122812
for             97740
this            91649
not             88529
on              84589
be              79756
as              74235
have            67564
are             63723
if              55660


**Observations**
- Both toxic and non-toxic comments contain common English stopwords (e.g., *the*, *to*, *and*), which is typical for natural text.  
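- The toxic comments, however, include **aggressive or explicit vocabulary** such as *“fuck”* and a heavier use of second-person and possessive pronouns (*you* is the single most frequent word, and *your*, *my*, and *me* appear only in the toxic top 20).  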
- This indicates that toxic messages often use **direct, confrontational language** targeting individuals.  
- Removing stopwords and analyzing remaining tokens later will help isolate **discriminative words** for model training and interpretability.

-------------------------------------------------------------------------


# 5. LABEL CORRELATION ANALYSIS

In [23]:
# 5. Label Correlation Analysis

import matplotlib.pyplot as plt
import seaborn as sns

# The six toxicity label columns whose pairwise relationships are examined
label_cols = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]

# Pairwise Pearson correlation (the pandas default) between the label columns
label_frame = df.loc[:, label_cols]
corr_matrix = label_frame.corr(method="pearson")

# Show the matrix as text, rounded to three decimals for readability
print("Label Correlation Matrix:\n")
print(corr_matrix.round(3))


Label Correlation Matrix:

               toxic  severe_toxic  obscene  threat  insult  identity_hate
toxic          1.000         0.309    0.677   0.157   0.648          0.266
severe_toxic   0.309         1.000    0.403   0.124   0.376          0.202
obscene        0.677         0.403    1.000   0.141   0.741          0.287
threat         0.157         0.124    0.141   1.000   0.150          0.115
insult         0.648         0.376    0.741   0.150   1.000          0.338
identity_hate  0.266         0.202    0.287   0.115   0.338          1.000


**Observations**

- The **strongest correlations** are between:
  - `obscene` and `insult` (**0.74**)  
  - `toxic` and `obscene` (**0.68**)  
  - `toxic` and `insult` (**0.65**)  
- These relationships indicate that many comments marked as *obscene* or *insult* are also labeled *toxic*.  
- `severe_toxic` shows a **moderate correlation** with `toxic` (**0.31**) — expected, as it’s a more extreme subset.  
- `threat` has **weak correlations** with all other labels, confirming it represents a distinct minority category.  
- `identity_hate` also has relatively low correlations, suggesting it captures a unique type of targeted toxicity.

**Inference**

The labels are **not independent**, which supports treating this as a single **multi-label classification problem** rather than six fully independent binary classification tasks.  
This correlation insight can also guide **loss weighting**, **label grouping**, or **hierarchical modeling** in later stages.

------------------------------------------------------------------------



# 6. DATA QUALITY AND CLEANING CHECKS

In [24]:
# 6. Data Quality & Cleaning Checks

import re

# Regex patterns for the kinds of text noise to look for
url_pattern = r'http[s]?://\S+|www\.\S+'
html_pattern = r'<.*?>'
# NOTE(review): this range only covers code points above U+FFFF, so it is a
# proxy for emoji rather than an exact emoji test -- confirm this is intended.
emoji_pattern = r'[\U00010000-\U0010ffff]'
non_ascii_pattern = r'[^\x00-\x7F]+'

# Map each indicator column to the pattern that defines it
indicator_patterns = {
    'has_url': url_pattern,
    'has_html': html_pattern,
    'has_emoji': emoji_pattern,
    'has_non_ascii': non_ascii_pattern,
}

# Flag every comment that contains at least one match for each pattern
for flag_col, pattern in indicator_patterns.items():
    df[flag_col] = df['comment_text'].str.contains(pattern, regex=True)

# Number of flagged comments per indicator
flag_cols = list(indicator_patterns)
flag_counts = df[flag_cols].sum()
print("Text Cleaning Indicators:\n")
print(flag_counts)

# Same counts expressed as a percentage of all rows
total_rows = df.shape[0]
print("\nProportion of affected rows:")
print((flag_counts / total_rows * 100).round(3))


Text Cleaning Indicators:

has_url           5104
has_html           129
has_emoji           98
has_non_ascii    17215
dtype: int64

Proportion of affected rows:
has_url           3.199
has_html          0.081
has_emoji         0.061
has_non_ascii    10.788
dtype: float64


**Observations**

- A small number of comments (**~3.2%**) contain embedded **URLs**, likely referencing web content or sources.  
- **HTML tags** and **emojis** are rare, appearing in less than 0.1% of the data.  
- Around **10.8%** of comments contain **non-ASCII characters**, indicating possible foreign language text, special symbols, or encoding artifacts.  
- Overall, text noise is **minimal**, but basic preprocessing (URL and non-ASCII removal or normalization) will improve model consistency.

**Inference**

The dataset is generally clean but would benefit from:
- Removing URLs and HTML tags.  
- Normalizing or stripping non-ASCII text where appropriate.  
- Keeping emoji information optional (if sentiment cues are relevant).  

This ensures consistent text formatting for tokenization and feature extraction during model training.
