# Data Preprocessing for Reddit Dataset

This notebook demonstrates preprocessing steps that prepare the Reddit dataset for predictive modeling: handling missing values, converting timestamps to datetimes, engineering date/time features, standardizing numeric columns, and label-encoding categorical values.


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from datetime import datetime


In [None]:
# Load the dataset.
# NOTE(review): 'path_to_your_file/Reddit.csv' looks like a placeholder --
# point it at the real location of the CSV before running this cell.
file_path = 'path_to_your_file/Reddit.csv'
data = pd.read_csv(file_path)

# Display basic information about the dataset.
# info() prints its report and returns None, so it is called as a statement of
# its own instead of being packed into a tuple together with head(); the tuple
# form made a notebook echo "(None, ...)" with a plain-text frame.
data.info()

# Preview the first rows; as the last expression of the cell, the DataFrame is
# rendered directly by the notebook.
data.head()


In [None]:
# Handling missing values: replace missing 'title' / 'content' entries with a
# fixed placeholder string.
# The filled column is assigned back to the frame rather than calling
# fillna(..., inplace=True) on data[col]. That chained form operates on an
# intermediate Series: pandas 2.x emits a warning for it, and with
# Copy-on-Write enabled it no longer modifies `data` at all.
text_placeholders = {'title': 'No Title', 'content': 'No Content'}
for column_name, placeholder in text_placeholders.items():
    data[column_name] = data[column_name].fillna(placeholder)


In [None]:
# Parse the raw 'creation time' values into datetimes. The format string
# expects values shaped like 'YYYYMMDD HHMMSS'.
creation_time = pd.to_datetime(data['creation time'], format='%Y%m%d %H%M%S')
data['creation time'] = creation_time

# Split the timestamp into one column per calendar/clock component, added in
# this order: year, month, day, hour, minute, second.
for component in ('year', 'month', 'day', 'hour', 'minute', 'second'):
    data[component] = getattr(creation_time.dt, component)


In [None]:
# Standardise the numeric columns in place: StandardScaler rescales each
# column to zero mean and unit variance using statistics fitted on this frame.
numeric_columns = ['score', 'comments']
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])


In [None]:
# Encode the categorical 'subreddit' column: every distinct value is replaced
# by an integer code. The fitted encoder is kept so codes can be mapped back.
encoder = LabelEncoder()
subreddit_codes = encoder.fit_transform(data['subreddit'])
data['subreddit'] = subreddit_codes


In [None]:
# Dropping columns that may not be necessary for predictive modeling: 'id',
# 'url', and the original 'creation time' column. The frame is modified in
# place.
data.drop(columns=['id', 'url', 'creation time'], inplace=True)

# Display the preprocessed data.
# info() prints its report and returns None, so it is a statement of its own
# rather than one element of a tuple with head(); the tuple form made a
# notebook echo "(None, ...)" with a plain-text frame.
data.info()

# Last expression of the cell, so the notebook renders the DataFrame itself.
data.head()
