# Data Preprocessing for Reddit Dataset

This notebook demonstrates data preprocessing steps including handling missing values, data transformation, normalization, encoding, and feature engineering for predictive modeling.


In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from datetime import datetime


In [2]:
# Load the Reddit posts dataset from a CSV path relative to the working directory
file_path = 'Reddit.csv'
data = pd.read_csv(file_path)

# Print the schema summary first (info() writes to stdout and returns None),
# then leave head() as the cell's last expression so the first rows render as
# a table instead of being wrapped in a `(None, ...)` tuple repr.
data.info()
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5374 entries, 0 to 5373
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   subreddit      5374 non-null   object
 1   title          5369 non-null   object
 2   score          5374 non-null   int64 
 3   id             5374 non-null   object
 4   url            5374 non-null   object
 5   comments       5374 non-null   int64 
 6   creation time  5374 non-null   object
 7   content        4010 non-null   object
dtypes: int64(2), object(6)
memory usage: 336.0+ KB


(None,
              subreddit                                           title  score  \
 0   narcissisticinlaws                      in laws encourage bullying      1   
 1  narcissisticspouses                             hidden tracker apps      2   
 2  narcissisticparents  narc mother terminal illness less than a month      3   
 3  narcissisticmothers                is it fair for my mom to do this      7   
 4  narcissisticmothers          parents withdrawing financial support       4   
 
         id                                                url  comments  \
 0  16bjmj1  httpswwwredditcomrnarcissisticinlawscomments16...         0   
 1  1ddnqjb  httpswwwredditcomrnarcissisticspousescomments1...         1   
 2  1dg697r  httpswwwredditcomrnarcissisticparentscomments1...         1   
 3  1cxqpin  httpswwwredditcomrnarcissisticmotherscomments1...         4   
 4  1bqa2r6  httpswwwredditcomrnarcissisticmotherscomments1...         3   
 
      creation time                      

In [3]:
# Handling missing values: substitute placeholder text for absent titles/bodies.
# The filled column is assigned back instead of calling fillna(inplace=True) on
# the selected column: that chained in-place form is deprecated in pandas 2.x
# and does not update `data` when Copy-on-Write is enabled.
data['title'] = data['title'].fillna('No Title')
data['content'] = data['content'].fillna('No Content')


In [4]:
# Parse the 'creation time' strings (format YYYYMMDD HHMMSS) into datetimes
data['creation time'] = pd.to_datetime(data['creation time'], format='%Y%m%d %H%M%S')

# Expand each timestamp into one column per calendar/clock component,
# using the matching attribute of the .dt accessor
for component in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    data[component] = getattr(data['creation time'].dt, component)


In [5]:
# Standardize the numeric columns: fit the scaler on them, then overwrite
# the columns with their scaled values
numeric_columns = ['score', 'comments']
scaler = StandardScaler()
scaler.fit(data[numeric_columns])
data[numeric_columns] = scaler.transform(data[numeric_columns])


In [6]:
# Replace each subreddit name with an integer code learned from the column
# NOTE(review): scikit-learn documents LabelEncoder for target labels; the
# resulting codes carry no meaningful order -- confirm this suits the model.
encoder = LabelEncoder()
encoder.fit(data['subreddit'])
data['subreddit'] = encoder.transform(data['subreddit'])


In [7]:
# Drop the identifier and link columns plus the raw timestamp, whose components
# were already extracted into year/month/day/hour/minute/second columns.
# Rebinding `data` to the result avoids mutating the frame in place.
data = data.drop(columns=['id', 'url', 'creation time'])

# Print the schema summary (info() returns None), then leave head() as the
# cell's last expression so the preview renders as a table rather than as
# part of a `(None, ...)` tuple repr.
data.info()
data.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5374 entries, 0 to 5373
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   subreddit  5374 non-null   int32  
 1   title      5374 non-null   object 
 2   score      5374 non-null   float64
 3   comments   5374 non-null   float64
 4   content    5374 non-null   object 
 5   year       5374 non-null   int32  
 6   month      5374 non-null   int32  
 7   day        5374 non-null   int32  
 8   hour       5374 non-null   int32  
 9   minute     5374 non-null   int32  
 10  second     5374 non-null   int32  
dtypes: float64(2), int32(7), object(2)
memory usage: 315.0+ KB


(None,
    subreddit                                           title     score  \
 0         13                      in laws encourage bullying -0.490332   
 1         22                             hidden tracker apps -0.413823   
 2         18  narc mother terminal illness less than a month -0.337314   
 3         16                is it fair for my mom to do this -0.031278   
 4         16          parents withdrawing financial support  -0.260805   
 
    comments                                            content  year  month  \
 0 -0.483803  my inlaws have changed there parenting per chi...  2023      9   
 1 -0.413619  i think he is tracking me and im not sure if i...  2024      6   
 2 -0.413619  okay in short my narc mother had a brain tumor...  2024      6   
 3 -0.203068  my mom doesnt like the way i dress and is now ...  2024      5   
 4 -0.273252  turned 18 in january have been  bombarded with...  2024      3   
 
    day  hour  minute  second  
 0    6    12      57      