# DATA CLEANING

In [1]:
import pandas as pd
import numpy as np
# Location of the raw posts export (a relative path)
file_path = '../../data/raw/all_posts.csv'
# Read the CSV into a DataFrame; every later cell works on `parler`
parler = pd.read_csv(file_path)
# Preview the first rows to sanity-check the load
parler.head()

Unnamed: 0,author_name,author_username,author_profile_photo,post_text,post_image,post_timestamp,post_impressions
0,Andy Ngo,@AndyNgo,https://images.parler.com/777e039d6a5341388549...,Philadelphia Police charged 7 in relation to a...,https://api.parler.com/l/GJCyQ,5 days ago,132117.0
1,Bill Kays,@rebootbill,https://images.parler.com/706f1c9d8b454b5f8108...,,,4 days ago,13.0
2,Flint Bedrock,@flintbedrock,https://images.parler.com/1f45e69f8ed145f9b7b0...,Patriots in DC are ready for tomorrow.,,5 days ago,248971.0
3,Name Hidden,@Private User,../../company-media.parler.com/par-default-pro...,Let’s follow each other!,https://api.parler.com/l/jB7Lz,5 days ago,9829.0
4,Name Hidden,@Private User,../../company-media.parler.com/par-default-pro...,,../../api.parler.com/l/6Ac5M.html,5 days ago,15689.0


## 1. Handle Missing Values

In [2]:
# Summary statistics for the numerical features (impressions):
# describe() reports count, mean, std, min, quartiles and max for the column.
summary_statistics = parler['post_impressions'].describe()
summary_statistics

count    3.259430e+05
mean     3.636239e+05
std      8.301439e+05
min      6.000000e+00
25%      3.591000e+03
50%      4.355100e+04
75%      2.476915e+05
max      3.879325e+07
Name: post_impressions, dtype: float64

In [3]:
# Count the missing entries in every column ...
missing_values_summary = parler.isna().sum()
# ... and display only the columns that actually contain gaps
missing_values_summary.loc[missing_values_summary.gt(0)]

author_name              10538
author_username          10537
author_profile_photo     10537
post_text                69928
post_image              221839
post_timestamp           10537
post_impressions         10537
dtype: int64

In [4]:
# One pass over the frame:
#   1. drop rows that lack author details or a timestamp,
#   2. replace missing post text with an empty string,
#   3. replace missing impression counts with zero.
parler = (
    parler
    .dropna(subset=['author_name', 'author_username', 'author_profile_photo', 'post_timestamp'])
    .assign(
        post_text=lambda frame: frame['post_text'].fillna(''),
        post_impressions=lambda frame: frame['post_impressions'].fillna(0),
    )
)

# List whichever columns still contain missing values after the steps above
missing_values_after_cleaning = parler.isna().sum()
missing_values_after_cleaning.loc[missing_values_after_cleaning.gt(0)]

post_image    211302
dtype: int64

## 2. Check Data Types

In [5]:
# Checking the data types of the columns
# (inspection only: nothing is converted in this cell)
parler.dtypes

author_name              object
author_username          object
author_profile_photo     object
post_text                object
post_image               object
post_timestamp           object
post_impressions        float64
dtype: object

## 3. Clean Text Data

In [6]:
# Importing the regular expression library
import re

# Function to clean text while preserving hashtags
def clean_text_preserve_hashtags(text):
    """Normalise a post's text while keeping hashtags intact.

    The steps run in this order:

    1. URLs (anything starting with ``http`` or ``www``) are removed.
    2. Every character that is not a word character, whitespace or ``#``
       is replaced by a space. Apostrophes are affected too, so
       ``"Let's"`` ends up as ``"let s"``.
    3. Digits are replaced by spaces, except digits that belong to a
       hashtag (``#`` followed by word characters), so ``#maga2020`` and
       ``#2020`` survive unchanged.
    4. Runs of whitespace are collapsed and the result is stripped.
    5. The text is lower-cased.

    Args:
        text: Raw post text. Missing values (as judged by ``pd.isnull``)
            are returned unchanged.

    Returns:
        The cleaned, lower-cased string, or the original missing value.
    """
    if pd.isnull(text):
        return text
    # Removing URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Removing special characters except #
    text = re.sub(r'[^\w\s#]', ' ', text)
    # Removing numbers, but leaving whole hashtags alone: the first
    # alternative consumes a complete hashtag and puts it back untouched,
    # so only digits outside hashtags reach the second alternative.
    text = re.sub(r'(#\w+)|\d', lambda match: match.group(1) or ' ', text)
    # Removing extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    # Converting to lowercase
    return text.lower()

# Run the text cleaner over every post and store the result back in place
cleaned_post_text = parler['post_text'].apply(clean_text_preserve_hashtags)
parler['post_text'] = cleaned_post_text

# Preview the first rows to confirm the text was cleaned
parler.head()

Unnamed: 0,author_name,author_username,author_profile_photo,post_text,post_image,post_timestamp,post_impressions
0,Andy Ngo,@AndyNgo,https://images.parler.com/777e039d6a5341388549...,philadelphia police charged in relation to an ...,https://api.parler.com/l/GJCyQ,5 days ago,132117.0
1,Bill Kays,@rebootbill,https://images.parler.com/706f1c9d8b454b5f8108...,,,4 days ago,13.0
2,Flint Bedrock,@flintbedrock,https://images.parler.com/1f45e69f8ed145f9b7b0...,patriots in dc are ready for tomorrow,,5 days ago,248971.0
3,Name Hidden,@Private User,../../company-media.parler.com/par-default-pro...,let s follow each other,https://api.parler.com/l/jB7Lz,5 days ago,9829.0
4,Name Hidden,@Private User,../../company-media.parler.com/par-default-pro...,,../../api.parler.com/l/6Ac5M.html,5 days ago,15689.0


## 4. Cleaning the Timestamp

In [7]:
import numpy as np

# Approximate length in days of each unit a relative timestamp may use
_DAYS_PER_UNIT = {'day': 1, 'week': 7, 'month': 30, 'year': 365}

# "<count> <unit>" where count is an integer or the article "a"/"an"
_RELATIVE_TIME = re.compile(r'\b(\d+|an?)\s*(day|week|month|year)', re.IGNORECASE)


# Function to convert the post_timestamp into number of days
def convert_to_days(time_str):
    """Convert a relative timestamp such as ``"5 days ago"`` into days.

    Weeks count as 7 days, months as 30 and years as 365. The article
    ``a``/``an`` is read as 1 (``"a week ago"`` -> 7). Unit matching is
    case-insensitive.

    Args:
        time_str: The relative timestamp text. Missing values give NaN.
            Values that are already numeric are returned unchanged, so
            applying this function to an already-converted column is a
            no-op rather than an error.

    Returns:
        The age in days as an ``int``, the input itself when it is already
        numeric, or ``np.nan`` when the value is missing or does not
        contain a recognisable "<count> <unit>" pair (for example
        ``"yesterday"`` or ``"3 hours ago"``).
    """
    if pd.isnull(time_str):
        return np.nan
    # Already converted (e.g. the cell was executed twice): keep the value
    if isinstance(time_str, (int, float, np.integer, np.floating)):
        return time_str
    match = _RELATIVE_TIME.search(str(time_str))
    if match is None:
        return np.nan
    count, unit = match.groups()
    # A non-digit count can only be the article "a"/"an", i.e. one unit
    quantity = int(count) if count.isdigit() else 1
    return quantity * _DAYS_PER_UNIT[unit.lower()]

# Translate every relative timestamp into a day count, then overwrite the column
post_age_in_days = parler['post_timestamp'].apply(convert_to_days)
parler['post_timestamp'] = post_age_in_days

# Preview the first rows to confirm the conversion
parler.head()

Unnamed: 0,author_name,author_username,author_profile_photo,post_text,post_image,post_timestamp,post_impressions
0,Andy Ngo,@AndyNgo,https://images.parler.com/777e039d6a5341388549...,philadelphia police charged in relation to an ...,https://api.parler.com/l/GJCyQ,5,132117.0
1,Bill Kays,@rebootbill,https://images.parler.com/706f1c9d8b454b5f8108...,,,4,13.0
2,Flint Bedrock,@flintbedrock,https://images.parler.com/1f45e69f8ed145f9b7b0...,patriots in dc are ready for tomorrow,,5,248971.0
3,Name Hidden,@Private User,../../company-media.parler.com/par-default-pro...,let s follow each other,https://api.parler.com/l/jB7Lz,5,9829.0
4,Name Hidden,@Private User,../../company-media.parler.com/par-default-pro...,,../../api.parler.com/l/6Ac5M.html,5,15689.0


## 5. Handle Remaining Missing Values

In [8]:
# Remove rows where "author_name" or "author_username" are missing
parler = parler.dropna(subset=['author_name', 'author_username'])

# Fill missing values in the "post_text" and "post_image" columns with an empty string.
# The filled column is assigned back to the frame: calling
# `parler[col].fillna(..., inplace=True)` acts on a temporary Series, which
# pandas 2.x warns about and which leaves `parler` unchanged under Copy-on-Write.
parler['post_text'] = parler['post_text'].fillna('')
parler['post_image'] = parler['post_image'].fillna('')

# Fill missing values in the "post_impressions" column with 0
parler['post_impressions'] = parler['post_impressions'].fillna(0)

# Check for any remaining missing values
missing_values_summary = parler.isnull().sum()
missing_values_summary

author_name             0
author_username         0
author_profile_photo    0
post_text               0
post_image              0
post_timestamp          0
post_impressions        0
dtype: int64

## 6. Saving cleaned data

In [9]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file
parler.to_csv('../../data/cleaned/cleaned_dataset.csv', index=False)