## Importing Libraries and Loading Data

In [2]:
# One-time setup (run once, then restart the kernel): %pip install textblob==0.18.0.post0

Collecting textblob
  Downloading textblob-0.18.0.post0-py3-none-any.whl (626 kB)
     ---------------------------------------- 0.0/626.3 kB ? eta -:--:--
     --------------------- --------------- 368.6/626.3 kB 11.6 MB/s eta 0:00:01
     -------------------------------------- 626.3/626.3 kB 9.8 MB/s eta 0:00:00
Installing collected packages: textblob
Successfully installed textblob-0.18.0.post0
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.0.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from textblob import TextBlob  # For sentiment analysis
import matplotlib.pyplot as plt

In [2]:
# Load the dataset.
# pandas infers the compression from the file extension; on_bad_lines='skip'
# drops rows that cannot be parsed instead of raising.
file_path = 'redditSubmissions.csv.gz'
df = pd.read_csv(
    file_path,
    on_bad_lines='skip',
)

## Advanced Data Preprocessing

In [3]:
# Parse the epoch-seconds 'unixtime' column into pandas timestamps.
df['datetime'] = pd.to_datetime(df['unixtime'], unit='s')

# Feature Engineering
# Derive calendar features from the parsed timestamp. Each pair maps the new
# column name to the `.dt` accessor attribute that supplies its values.
for feature_name, dt_attribute in [
    ('hour', 'hour'),
    ('day_of_week', 'dayofweek'),
    ('month', 'month'),
    ('year', 'year'),
]:
    df[feature_name] = getattr(df['datetime'].dt, dt_attribute)

# Text Analysis - Simple sentiment analysis on titles
def sentiment_score(text):
    """Return the TextBlob sentiment polarity of ``text``.

    The value is coerced with ``str()`` before scoring, so non-string inputs
    are scored on their string form rather than rejected.

    Parameters
    ----------
    text : object
        Value to score (the notebook applies this to the 'title' column).

    Returns
    -------
    float or int
        The polarity reported by TextBlob, or ``0`` (treated as neutral) when
        scoring raises an ordinary exception.
    """
    try:
        return TextBlob(str(text)).sentiment.polarity
    except Exception:
        # Best-effort fallback for problematic inputs. Catching Exception
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate, so
        # interrupting a long .apply() stops it instead of silently scoring
        # the remaining rows as neutral.
        return 0

# Score every title exactly once; repeating the identical apply only
# recomputes the same column.
df['title_sentiment'] = df['title'].apply(sentiment_score)

# Handling Missing Data - fill NaNs in each numeric column with that column's mean.
# Only numeric columns get a fill value, so missing values in other columns
# are left as they are.
# NOTE(review): the means are computed over the whole frame, before the
# train/test split performed later in the notebook - confirm that is intended.
column_means = df.mean(numeric_only=True)
df.fillna(value=column_means, inplace=True)

# Feature Scaling - numeric columns go through RobustScaler, the categorical
# column is one-hot encoded (categories unseen during fit are ignored).
# NOTE(review): 'total_votes' and 'number_of_comments' look like outcomes
# observed after posting - confirm they are legitimately available as
# predictors of 'score'.
num_features = [
    'hour',
    'day_of_week',
    'total_votes',
    'number_of_comments',
    'month',
    'year',
    'title_sentiment',
]
cat_features = ['subreddit']

numeric_step = ('num', RobustScaler(), num_features)
categorical_step = ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

  arr, tz_parsed = tslib.array_with_unit_to_datetime(arg, unit, errors=errors)


## Splitting Dataset into Training and Testing Sets

In [4]:
# Feature matrix: the numeric columns followed by the categorical one.
feature_columns = num_features + cat_features
X = df[feature_columns]
# Regression target.
y = df['score']

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## Dimensionality Reduction with Truncated SVD

In [5]:
from sklearn.decomposition import TruncatedSVD

# Linear regression on a TruncatedSVD projection of the preprocessed features.
# TruncatedSVD's default solver is randomized, so its seed is pinned here
# (same value as the train/test split) to make the reported metrics repeatable
# across kernel restarts.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('dim_reduction', TruncatedSVD(n_components=100, random_state=42)),  # Adjust n_components as needed
    ('regressor', LinearRegression()),
])

# Fit on the training split, then score on the held-out test split.
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Linear Regression with TruncatedSVD:")
print("Mean Squared Error (MSE):", mse)
print("R-squared (R²):", r2)

Linear Regression with TruncatedSVD:
Mean Squared Error (MSE): 75489.00418345224
R-squared (R²): 0.6615461858534217


## Random Forest

In [6]:
# Random forest on the preprocessed features (no dimensionality reduction step).
# RandomForestRegressor comes from the notebook's import cell, so it is not
# re-imported here.
complex_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=42)),
])

# Training the complex model
complex_model.fit(X_train, y_train)

# Making predictions and evaluating the complex model on the test split
y_pred_complex = complex_model.predict(X_test)
mse_complex = mean_squared_error(y_test, y_pred_complex)
r2_complex = r2_score(y_test, y_pred_complex)

# Bare last expression so the cell displays the (MSE, R²) pair.
mse_complex, r2_complex

(18756.7913124891, 0.915904208440268)

## Gradient Boosting Regressors (XGBoost)

In [7]:
from xgboost import XGBRegressor

# Gradient-boosted trees on the same preprocessed features, seeded for
# repeatability.
xgb_steps = [
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(random_state=42)),
]
xgb_model = Pipeline(steps=xgb_steps)

# Fit on the training split.
xgb_model.fit(X_train, y_train)

# Predict on the held-out split and compute the two reported metrics.
y_pred_xgb = xgb_model.predict(X_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

print("XGBoost Regressor:")
print("Mean Squared Error (MSE):", mse_xgb)
print("R-squared (R²):", r2_xgb)


XGBoost Regressor:
Mean Squared Error (MSE): 17767.333704999754
R-squared (R²): 0.920340426732104


## Support Vector Regression (SVR) 

In [8]:
from sklearn.svm import SVR

# Support vector regression with a linear kernel on the preprocessed features.
svr_steps = [
    ('preprocessor', preprocessor),
    ('regressor', SVR(kernel='linear')),  # You can try different kernels (linear, poly, rbf, etc.)
]
svr_model = Pipeline(steps=svr_steps)

# Fit on the training split.
svr_model.fit(X_train, y_train)

# Predict on the held-out split and compute the two reported metrics.
y_pred_svr = svr_model.predict(X_test)
mse_svr = mean_squared_error(y_test, y_pred_svr)
r2_svr = r2_score(y_test, y_pred_svr)

print("Support Vector Regression (SVR):")
print("Mean Squared Error (MSE):", mse_svr)
print("R-squared (R²):", r2_svr)

Support Vector Regression (SVR):
Mean Squared Error (MSE): 93244.48141239984
R-squared (R²): 0.5819397709174656
