# Data Exploration and Cleaning of Vax-Culture Dataset

**Dataset Source / Credit:**  
This dataset was obtained from [Vax-Culture: A Dataset for Studying Vaccine Discourse on Twitter](https://github.com/mrzarei5/Vax-Culture)  


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
%pip install emoji wordsegment
import emoji
from wordsegment import load, segment
# Read the raw Vax-Culture annotation table from the working directory.
df = pd.read_csv(filepath_or_buffer="Vax_Culture.csv")

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting wordsegment
  Downloading wordsegment-1.3.1-py2.py3-none-any.whl.metadata (7.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m15.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading wordsegment-1.3.1-py2.py3-none-any.whl (4.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: wordsegment, emoji
Successfully installed emoji-2.14.1 wordsegment-1.3.1


## Data Exploration
First, we will take a look at the dataset to understand its structure and its issues.


In [3]:
# Show the first five rows using the notebook's rich table rendering.
display(df.head(n=5))

Unnamed: 0,tweet_id,meaning,communicated_message,inaccurate_or_misleadning,criticism_politicians,criticism_pharmaceutical_companies,criticism_public_health_officials,criticism_anti-vaxxers,criticism_vaccine_mandates,criticism_vaccine_safety,...,support_natural_health,support_vaccines,support_small_business,support_alternative_remedies,support_relaxed_approach,support_more_information,support_public_health_interventions,support_global_response,support_religious_beliefs,support_other
0,1466072498462732293,The tweet exaggerates the possibility that Rep...,Anti-vaccine,1,1,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,
1,1463909247826239494,The tweet is linking to a video claiming that ...,Anti-vaccine,1,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,
2,1457401803016921089,The tweet is promoting the fact that anti-vaxx...,Pro-vaccine,0,0,0,0,1,0,0,...,0,1,0,0,0,0,1,0,0,Reason
3,1459021730446458881,The tweet is linking to a inaccurate study tha...,Anti-vaccine,1,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,Not getting vaccinated
4,1484145644939972616,The tweet is replying to a tweet of a study ab...,Anti-vaccine,1,0,0,0,0,0,1,...,1,0,0,0,0,0,0,0,0,


In [4]:
# Column names, non-null counts and dtypes at a glance.
info_header = "\nDataset info:"
print(info_header)
df.info()


Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6373 entries, 0 to 6372
Data columns (total 29 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   tweet_id                             6373 non-null   int64 
 1   meaning                              6373 non-null   object
 2   communicated_message                 6373 non-null   object
 3   inaccurate_or_misleadning            6373 non-null   int64 
 4   criticism_politicians                6373 non-null   int64 
 5   criticism_pharmaceutical_companies   6373 non-null   int64 
 6   criticism_public_health_officials    6373 non-null   int64 
 7   criticism_anti-vaxxers               6373 non-null   int64 
 8   criticism_vaccine_mandates           6373 non-null   int64 
 9   criticism_vaccine_safety             6373 non-null   int64 
 10  criticism_conservative_media         6373 non-null   int64 
 11  criticism_mainstream_media  

In [5]:
# Count rows that are exact copies of an earlier row (the first occurrence is not counted).
duplicate_mask = df.duplicated()
n_duplicates = duplicate_mask.sum()
print("\nNumber of duplicate rows:", n_duplicates)


Number of duplicate rows: 39


In [6]:
# Tally the missing cells column by column.
missing_per_column = df.isna().sum()
print("\nMissing values per column:")
print(missing_per_column)


Missing values per column:
tweet_id                                  0
meaning                                   0
communicated_message                      0
inaccurate_or_misleadning                 0
criticism_politicians                     0
criticism_pharmaceutical_companies        0
criticism_public_health_officials         0
criticism_anti-vaxxers                    0
criticism_vaccine_mandates                0
criticism_vaccine_safety                  0
criticism_conservative_media              0
criticism_mainstream_media                0
criticism_public_health_policy            0
criticism_democrats_or_liberals           0
criticism_government                      0
criticism_vaccine_effectiveness           0
criticism_other                        4501
support_science                           0
support_choice_freedom                    0
support_natural_health                    0
support_vaccines                          0
support_small_business                    0
supp

In [7]:
# Distinct (non-null) value count for every column, one line per column.
print("\nUnique values per column:")
unique_counts = df.nunique()
for column_name, n_unique in unique_counts.items():
    print(f"{column_name}: {n_unique} unique values")


Unique values per column:
tweet_id: 6326 unique values
meaning: 6331 unique values
communicated_message: 3 unique values
inaccurate_or_misleadning: 2 unique values
criticism_politicians: 2 unique values
criticism_pharmaceutical_companies: 2 unique values
criticism_public_health_officials: 2 unique values
criticism_anti-vaxxers: 2 unique values
criticism_vaccine_mandates: 2 unique values
criticism_vaccine_safety: 2 unique values
criticism_conservative_media: 2 unique values
criticism_mainstream_media: 2 unique values
criticism_public_health_policy: 2 unique values
criticism_democrats_or_liberals: 2 unique values
criticism_government: 2 unique values
criticism_vaccine_effectiveness: 2 unique values
criticism_other: 1158 unique values
support_science: 2 unique values
support_choice_freedom: 2 unique values
support_natural_health: 2 unique values
support_vaccines: 2 unique values
support_small_business: 2 unique values
support_alternative_remedies: 2 unique values
support_relaxed_approach

In [10]:
# Length of each `meaning` entry, measured in whitespace-separated words.
def _word_count(value):
    """Return how many whitespace-separated tokens ``str(value)`` contains."""
    return len(str(value).split())

df["text_length"] = df["meaning"].map(_word_count)
print("\nTweet length statistics (in words):")
length_summary = df["text_length"].describe()
print(length_summary)


Tweet length statistics (in words):
count    6373.000000
mean       30.726816
std        11.937175
min         6.000000
25%        22.000000
50%        29.000000
75%        37.000000
max       115.000000
Name: text_length, dtype: float64


In [12]:
# Peek at the entries longer than 100 words (at most ten of them).
is_long = df["text_length"] > 100
df.loc[is_long, ["meaning"]].head(10)

Unnamed: 0,meaning
427,This post clearly supports anti-vaccine sentim...
768,This post promotes natural immunity to COVID-1...
1053,This tweet posts an article featuring Martin K...
4831,"Tweet appears to be a right-wing comedian ""tro..."


## Data Cleaning

In the previous step, we noticed that the dataset has some duplicate rows and a few entries whose `meaning` text is unusually long (more than 100 words, against a median of 29). In the data cleaning step, we will therefore remove the duplicates and filter out these extreme outliers, as well as apply other data cleaning techniques.

In [14]:
# Discard rows whose `meaning` is missing, then rows where it is only whitespace.
df = df[df["meaning"].notna()]
is_blank = df["meaning"].str.strip() == ""
df = df[~is_blank]

In [15]:
# Drop the two free-text "other" columns and the tweet identifier in a single call.
columns_to_drop = ["criticism_other", "support_other", "tweet_id"]
df = df.drop(columns=columns_to_drop)

In [16]:
# Keep entries of at most 100 words, then discard rows that exactly repeat an earlier row.
within_limit = df["text_length"] <= 100
df = df.loc[within_limit].drop_duplicates()

In [17]:
# Sanity check: no exact duplicate rows should remain after cleaning.
remaining_duplicates = df.duplicated().sum()
print("\nNumber of duplicate rows:", remaining_duplicates)


Number of duplicate rows: 0


In [23]:
# Sanity check: the over-100-word entries should be gone, so this table is expected to be empty.
still_long = df["text_length"].gt(100)
df.loc[still_long, ["meaning"]].head(10)

Unnamed: 0,meaning


In the next step, we clean the text by removing URLs, replacing mentions with @USER, and splitting hashtags into words so their meaning is preserved. Emojis are converted to text, and important punctuation (`!`, `?`, `.`) is kept to maintain tone and sentiment. This ensures the model captures stance and sentiment cues without losing context.

In [25]:
def clean_text(text):
    """Normalize one text entry for downstream modeling.

    Steps, in order:
      1. lowercase the text;
      2. remove URLs (anything starting with ``http`` or ``www``);
      3. replace every ``@mention`` with the placeholder ``@USER``;
      4. expand each ``#hashtag`` into words with ``segment``; if segmentation
         raises or returns nothing, the tag is kept without its ``#``;
      5. convert emojis to their names via ``emoji.demojize`` and turn
         underscores into spaces so multi-word names stay separate words;
      6. drop every character except lowercase letters, whitespace, ``!``,
         ``?``, ``.`` and the ``@USER`` placeholder;
      7. shorten runs of three or more identical characters to two;
      8. collapse whitespace and strip the ends.

    Args:
        text: the string to clean (must support ``str.lower``).

    Returns:
        The cleaned string.

    NOTE(review): wordsegment documents that ``load()`` must be called once
    before ``segment()``; if it has not been, step 4 always takes the fallback.
    """
    text = text.lower()
    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"@\w+", "@USER", text)

    def _expand_hashtag(match):
        # Best effort: a segmentation failure must not abort the whole cleaning
        # run, but only ordinary exceptions are swallowed (not KeyboardInterrupt).
        tag_body = match.group(0)[1:]
        try:
            words = " ".join(segment(tag_body))
        except Exception:
            words = ""
        return words or tag_body

    # Substituting match by match (instead of str.replace per tag) keeps a short
    # tag such as "#vax" from corrupting a longer one such as "#vaxxed".
    text = re.sub(r"#\w+", _expand_hashtag, text)

    text = emoji.demojize(text, delimiters=(" ", " "))
    # demojize joins the words of an emoji name with "_"; the character filter
    # below would otherwise glue them into a single opaque token.
    text = text.replace("_", " ")
    # Keep the @USER placeholder intact while removing every other disallowed
    # character (the placeholder contains "@" and uppercase letters).
    text = re.sub(r"(@USER)|[^a-z\s!?.]", lambda m: m.group(1) or "", text)
    text = re.sub(r'(.)\1{2,}', r'\1\1', text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# apply cleaning
# wordsegment must load its corpus before segment() can split hashtags; without
# this call segment() fails and clean_text silently falls back to the unsplit tag.
# Calling load() again when it has already run is harmless.
load()
df["clean_text"] = df["meaning"].apply(clean_text)

In [26]:
%pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━[0m [32m757.8/981.5 kB[0m [31m22.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=6b1047e086df16afc494f523acf1a8bf00b669913a8eabb3ab4f8cee1ad8436f
  Stored in directory: /root/.cache/pip/wheels/c1/67/88/e844b5b022812e15a52e4eaa38a1e709e99f06f6639d7e3ba7
Successfully built langdetect
Installing collected packages: langdetect
Successfully installed langdetect-1.0.9


In [28]:
# remove non english tweets
from langdetect import DetectorFactory, LangDetectException, detect

# langdetect's algorithm is randomized by default, so short or ambiguous texts
# can be classified differently from run to run. Fixing the seed makes the set
# of rows that survive this filter reproducible.
DetectorFactory.seed = 0

def is_english(text):
    """Return True when langdetect classifies ``text`` as English.

    Blank or whitespace-only text, and text langdetect cannot classify
    (it raises ``LangDetectException``), count as not English.
    """
    try:
        if text.strip():
            return detect(text) == 'en'
        return False
    except LangDetectException:
        return False

# Detection runs on the original `meaning` text, not on the cleaned version.
df = df[df['meaning'].apply(is_english)]

In [29]:
# The filters above leave gaps in the row labels; renumber them 0..n-1
# and discard the old labels instead of keeping them as a column.
df = df.reset_index(level=None, drop=True)

In [30]:
# Keep only the cleaned text (renamed to `tweet_meaning`); the raw text and the
# helper word-count column are no longer needed.
df = (
    df.drop(columns=["meaning", "text_length"])
      .rename(columns={"clean_text": "tweet_meaning"})
)

In [31]:
# Write the cleaned table to disk, leaving out the row index.
df.to_csv(path_or_buf="cleaned_vax_tweets.csv", index=False)